def test_plot_remove_keys_manyfiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now"
        tryList = [
            ("file_1[0-9].dat.gz", 'c10', 600),
            ("file_[1-2][0-9].dat.gz", 'c20', 600),
            ("file_[1-4][0-9].dat.gz", 'c40', 600),
            ("file_[1-8][0-9].dat.gz", 'c80', 600),
            # don't do this case. times out at 300 sec on polling with 172-180
            # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200),
        ]
        
        xList = []
        eList = []
        fList = []
        importFolderPath = "manyfiles-nflx-gz"
        for (csvFilePattern, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, 
                retryDelaySecs=3, timeoutSecs=timeoutSecs, doSummary=False)
            parseElapsed = time.time() - start
            print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds"
            h2o.check_sandbox_for_errors()

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            parsedBytes = inspect['byteSize']

            node = h2o.nodes[0]
            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(parsedBytes)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'parsedBytes'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_rapids_vec_fail1(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
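        # sweep vector lengths from 1e6 up to 91e6, in 10e6 steps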
        for trial in range(int(1e6),int(100e6),int(10e6)):
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
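            # Rapids notation as used here: '!' creates a key, '%' references one, '#' prefixes
            # a literal number; '(: #0 #N)' should be the index span 0..N, and '(c {...})'
            # collects it into a vector. The parens look unbalanced, which may be the point
            # of this 'fail' test.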
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            execExpr = '(= !v (+ %v %v))'
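            # time an elementwise add of the new vector with itself ('%v' references the key created above)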
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)
            
            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_plot_remove_keys(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100000, 100, 'cG', 400),
            (200000, 100, 'cH', 400),
            (400000, 100, 'cI', 400),
            (800000, 100, 'cJ', 400),
            (1000000, 100, 'cK', 400),
        ]
        
        xList = []
        eList = []
        fList = []
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            NUM_CASES = h2o_util.fp_format()
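            # fp_format() with no args presumably returns the number of float-format cases;
            # sel picks one of them for write_syn_dataset below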
            sel = random.randint(0, NUM_CASES-1)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parseElapsed = pA.python_elapsed
            parse_key = pA.parse_key
            byteSize = pA.byteSize
            numRows = iA.numRows
            numCols = iA.numCols
            print parse_key, parseElapsed, byteSize, numRows, numCols

            labelList = iA.labelList
            node = h2o.nodes[0]

            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(byteSize)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'byteSize'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_speedrf_covtype_fvec(self):
        importFolderPath = "standard"

        # Parse Train ******************************************************
        # csvTrainFilename = 'covtype.data'
        csvTrainFilename = 'covtype20x.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        xList = []
        eList = []
        fList = []
        trial = 0
        for trial in range(10):
            timeoutSecs = 30
            # have unique model names
            start = time.time()
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print 'summary end', trial, 'on', csvTrainPathname, 'took', elapsed, 'seconds'

            fList.append(elapsed)
            eList.append(elapsed)

            if DO_PLOT:
                xLabel = 'trial'
                xList.append(trial)

        if DO_PLOT:
            eLabel = 'elapsed'
            fLabel = 'elapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_RF_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            (10000, 3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                                                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                    }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)
                rfResult["drf_model"] = rfResult.pop("speedrf_model")
                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1]['_arr'] # use the last one
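                # 'cms' should hold one confusion matrix per scoring step; the last entry reflects all ntrees trees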
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_rapids_vec_fail(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6), int(8e6), int(1e6)):

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial

            execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1)
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # make an integer version? 'N' appears to be the not-equal op, so vint = (vreal != 0)
            execExpr = '(= !vint (N %vreal #0))'
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)

            # compare the add times for int vs real... maybe the other one isn't really real; at least the compression differs
            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'

            # recursively expand
            execExpr = '(= !v2 (+ %vint <patt>))'
            for j in range(3):
                execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr)
            # last one
            execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr)
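            # after the substitutions this expands to a 6-term nested sum:
            # (= !v2 (+ %vint (+ %vint (+ %vint (+ %vint (+ %vint %vint))))))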

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)
            elapsed2 = time.time() - start

            execExpr = '(= !v1 (+ %vreal %vreal))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)
            elapsed1 = time.time() - start

            inspectResult = h2o_cmd.runInspect(key='vreal')
            h2o_cmd.infoFromInspect(inspectResult)

            inspectResult = h2o_cmd.runInspect(key='vint')
            h2o_cmd.infoFromInspect(inspectResult)

            summaryResult = h2o_cmd.runSummary(key='vreal')

            if execResult['num_rows']:
                keys.append(execExpr)

            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)

        if 1 == 1:
            xLabel = 'vector length'
            eLabel = 'elapsed (v1 = vint + vint)'
            fLabel = 'elapsed (v2 = vreal + vreal)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
    def test_exec_enums_rand_cut2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            # (n, 10, 9, 'cE', 300), 
            (n, 1, 1, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                MAX_COLS_IN_EXPR = iColCount
                cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR))
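                # pick a random non-empty subset of the input columns to constrain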
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0; it maps to NA here, and h2o doesn't like that
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # randomly pick == or !=
                        if random.randint(0,1)==0:
                            cutExprList.append('p$C'+str(i+1)+'!='+c)
                        else:
                            cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' & '.join(cutExprList)
                # print "cutExpr:", cutExpr    

                # just extract one output col (the first one)
                rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1)
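                # e.g. 'p[p$C1==3 & p$C2!=5, 2]' ... an R-style row cut that keeps the first output column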
                # print "rowExpr:", rowExpr
                print rowExpr
                rowExprList.append(rowExpr)


            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            src_key = csvFilename
            for prefix in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
                parseResult = h2i.import_only(path=csvPathname, schema='put', src_key=prefix+src_key, timeoutSecs=200)

            parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            pNumRows = inspect['numRows']
            pNumCols = inspect['numCols']
            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
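                # seed 'a' as c(1,2,3); the joined terms copy col i-1 into col i for i in 2..colCount-1
                # (empty here when colCount==2, so 'a' stays a single vector)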
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(CUT_LOOP_CNT):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."
            
                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
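                # 0.5 exercises the median path; .999 exercises the extreme-tail path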
                # first output col; always fed by an exec cut, so it lands at index 0
                column = 0
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)



        #****************************************************************
        # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET
        print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col"
        quantile = 0.5 if DO_MEDIAN else .999
        # first output col; unused here since the call below addresses the column by name
        # column = iColCount
        start = time.time()
        q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), 
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0)
        elapsed = time.time() - start
        h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result'])
        print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.'
        quantileTime = elapsed

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
    def test_RF_many_cols_enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]

        tryList = [
            (10000, 100, 'cA', 300),
            (10000, 300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            # (10000,  3000, 'cK', 3600),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'RFModelKey'

            # Parse (train)****************************************
            start = time.time()
            parseTrainResult = h2i.import_parse(bucket=None,
                                                path=csvPathname,
                                                schema='put',
                                                header=0,
                                                hex_key=hex_key,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename,
                elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # RF(train iterate)****************************************
            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'nbins': 1024,
                    'classification': 1,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(numCols - 1),
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for RF: ", params
                kwargs = params.copy()

                trainStart = time.time()
                rfResult = h2o_cmd.runRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs,
                                         destination_key=modelKey,
                                         **kwargs)
                trainElapsed = time.time() - trainStart
                print "RF training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "RF " + " ntrees=" + str(ntrees) + " max_depth=" + str(
                    max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo,
                    csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                errsLast = rfResult['drf_model']['errs'][-1]
                print "RF 'errsLast'", errsLast

                cm = rfResult['drf_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                            path=csvTrainPathname,
                                            hex_key=hex_key,
                                            timeoutSecs=180,
                                            doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvTestPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200

            # do five starts, to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i == (TRIES - 1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runRF(parseResult=parseTrainResult,
                                         timeoutSecs=timeoutSecs,
                                         noPoll=True,
                                         rfView=False,
                                         **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)

            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runRFView(None,
                                       model_key=model_key,
                                       timeoutSecs=60,
                                       retryDelaySecs=5,
                                       doSimpleCheck=False)
            print "rfView:", h2o.dump_json(rfView)

            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(
                len(classErrorPctList), 7,
                "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict"
            )
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key,
                                                        data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_exec_enums_rand_cut2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            # (n, 10, 9, 'cE', 300),
            (n, 1, 1, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                MAX_COLS_IN_EXPR = iColCount
                cols = random.sample(range(MAX_COLS_IN_EXPR),
                                     random.randint(1, MAX_COLS_IN_EXPR))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings
                    if 1 == 1:
                        # FIX! hack. don't use encoding 0; it maps to NA here, and h2o doesn't like that
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice

                cutExprList = []
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # randomly pick == or !=
                        if random.randint(0, 1) == 0:
                            cutExprList.append('p$C' + str(i + 1) + '!=' + c)
                        else:
                            cutExprList.append('p$C' + str(i + 1) + '==' + c)

                cutExpr = ' & '.join(cutExprList)
                # print "cutExpr:", cutExpr

                # just extract one output col (the first one)
                rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1)
                # print "rowExpr:", rowExpr
                print rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************

            src_key = csvFilename
            for prefix in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']:
                parseResult = h2i.import_only(path=csvPathname,
                                              schema='put',
                                              src_key=prefix + src_key,
                                              timeoutSecs=200)

            parseResult = h2i.parse_only(pattern='*' + src_key,
                                         hex_key=hex_key,
                                         timeoutSecs=800)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            pNumRows = inspect['numRows']
            pNumCols = inspect['numCols']
            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception(
                    "Probably a col got NA'ed, leaving constant values: %s"
                    % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1 == 1:
                a = 'a=c(1,2,3);' + ';'.join(
                    ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey),
                                                print_params=False)
                    ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(CUT_LOOP_CNT):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" %
                                        (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col; always fed by an exec cut, so it lands at index 0
                column = 0
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        #****************************************************************
        # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET
        print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col"
        quantile = 0.5 if DO_MEDIAN else .999
        # first output col; unused here since the call below addresses the column by name
        # column = iColCount
        start = time.time()
        q = h2o.nodes[0].quantiles(source_key=hex_key,
                                   column='C' + str(iColCount + 1),
                                   quantile=quantile,
                                   max_qbins=MAX_QBINS,
                                   multiple_pass=0)
        elapsed = time.time() - start
        h2p.red_print(
            hex_key, pNumRows,
            "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")",
            "one iteration", elapsed, "secs. threshold:", quantile,
            q['result'])
        print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.'
        quantileTime = elapsed

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
    def test_exec_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 3, 2, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    # cel = colEnumList[c]
                    cel = colEnumList
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0; it maps to NA here, and h2o doesn't like that
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' && '.join(cutExprList)
                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

                print "j:", j

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # print h2o.dump_json(inspect)

            rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(rSummary)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==0:
                    start = time.time()
                    e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))

                    elapsed = time.time() - start
                    print "exec 1 took", elapsed, "seconds."
                    execTime = elapsed

                if 1==1:
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                if 1==0:
                    gKey = random.choice(eKeys)
                    # do a 2nd random to see if things blow up
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                    elapsed = time.time() - start
                    print "exec 3 took", elapsed, "seconds."

                if 1==1:
                    inspect = h2o_cmd.runInspect(key=fKey)
                    h2o_cmd.infoFromInspect(inspect, fKey)
                    numRows = inspect['numRows']
                    numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
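
# The pattern above (and in the tests below) is: time an operation, then
# append a trial marker and the elapsed times to xList/eList/fList for
# h2o_gbm.plotLists. A minimal stdlib sketch of that bookkeeping; the helper
# name `timed` is illustrative, not part of the harness:
import time

def timed(fn, *args, **kwargs):
    # run fn and return (result, elapsed_seconds)
    start = time.time()
    result = fn(*args, **kwargs)
    return result, time.time() - start

# usage sketch: result, execTime = timed(h2o.nodes[0].exec_query, str=someExpr)
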
    def test_GBM_covtype_train_test(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 'C55', 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        # h2b.browseTheCloud()

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            ntrees = 2
            # fails with 40
            for max_depth in [40, 5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                h2o.beta_features = True
                # translate it (only really need to do it once; move out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
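
# pp_cm_summary isn't shown here; a hedged sketch of what it is assumed to
# report: the percentage of off-diagonal (misclassified) counts in a square
# confusion matrix. The real h2o_gbm helper may differ, e.g. in how it
# handles the trailing NA row mentioned above.
def pct_wrong(cm):
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))
    return 100.0 * (total - right) / total if total else 0.0

# pct_wrong([[50, 2], [3, 45]]) -> 5.0
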
    def test_GBM_params_rand2(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
            # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
            ('standard', 'covtype.shuffled.10pct.sorted.data',
             'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data',
             'covtype.test.hex')
        ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=importFolderPath + "/" +
                                                trainFilename,
                                                schema='local',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(
                key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                # translate it (only really need to do it once; move out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54,
                    'ignored_cols_by_name': 'C1,C2,C3,C4,C5',
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1][
                    '_arr']  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'],
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict',  # choices are 7 (now) and 'predict'
                    )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
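
# h2o_gbm.pickRandGbmParams isn't shown here. A hedged sketch of the shape
# of such a helper: overlay one randomly chosen value per key from a dict of
# candidate lists onto the caller's defaults. Names are purely illustrative.
import random

def pick_rand_params(paramsDict, params):
    for name, choices in paramsDict.items():
        params[name] = random.choice(choices)
    return params

# pick_rand_params({'max_depth': [5, 10, 20]}, {'ntrees': 2})
# -> {'ntrees': 2, 'max_depth': <one of 5/10/20>}
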
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount),
                                     random.randint(1, iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice

                cutExprList = []

                pKey = Key('p')
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:, i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else:
                        cutExpr = ce

                print "cutExpr:", cutExpr

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember: 1-based indexing!

            # build up the columns
            Assign('b', [1, 2, 3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))

            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                if 1 == 1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."

                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
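
# The cutExpr accumulation in the loop above is a left fold: AND together
# every per-column predicate. The same shape in stdlib Python, with plain
# strings standing in for the Fcn/Key AST objects:
from functools import reduce

def fold_and(exprs):
    # '(& (& a b) c)' for ['a', 'b', 'c']; None for an empty list
    return reduce(lambda acc, e: '(& %s %s)' % (acc, e), exprs) if exprs else None
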
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (1000000, 5, 'cD', 0, 10, 30),
            (1000000, 5, 'cD', 0, 20, 30),
            (1000000, 5, 'cD', 0, 30, 30),
            (1000000, 5, 'cD', 0, 40, 30),
            (1000000, 5, 'cD', 0, 50, 30),
            (1000000, 5, 'cD', 0, 70, 30),
            (1000000, 5, 'cD', 0, 100, 30),
            (1000000, 5, 'cD', 0, 130, 30),
            (1000000, 5, 'cD', 0, 160, 30),
            # (1000000, 5, 'cD', 0, 320, 30),
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30),
            # (1000000, 5, 'cD', 0, 1280, 30),
        ]

        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt,
             timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt -
                                                                 minInt) + 1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt,
                              SEEDPERFILE)

            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hexKey)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0],
                              execExpr,
                              resultKey=resultKey,
                              timeoutSecs=60)

            # do it twice..to get the optimal cached delay for time?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=60)
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed

            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            groups = execResult['num_rows']
            maxExpectedGroups = ((maxInt - minInt) + 1)**2
            h2o_util.assertApproxEqual(
                groups,
                maxExpectedGroups,
                rel=0.2,
                msg="groups %s isn't close to expected amount %s" %
                (groups, maxExpectedGroups))

            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            # should be same answer in both cases
            execExpr = "d=sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                 execExpr,
                                                 resultKey=None,
                                                 timeoutSecs=60)
            print "execResult", h2o.dump_json(execResult)
            self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz',
                 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex'
                 )
            ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800,
                 None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
            ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response,
             testFilename, testKey) in files:
            h2o.beta_features = False  #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket,
                                                path=csvPathname,
                                                schema='s3n',
                                                hex_key=trainKey,
                                                timeoutSecs=timeoutSecs,
                                                noPoll=h2o.beta_features,
                                                doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(
                key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket,
                                               path=importFolderPath + "/" +
                                               testFilename,
                                               schema='local',
                                               hex_key=testKey,
                                               timeoutSecs=timeoutSecs,
                                               noPoll=h2o.beta_features,
                                               doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            print "Slow! exec is converting all imported keys, not just what was parsed?"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note: no inspect of test data here, so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            for max_depth in [5, 10, 20, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols':
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                                                noPoll=True,
                                                timeoutSecs=timeoutSecs,
                                                destination_key=modelKey,
                                                **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'],
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    # hack
                    if h2o.beta_features:
                        h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict',  # choices are 0 and 'predict'
                    )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)

            h2o.beta_features = False

            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                                  fListTitle, fList, fLabel)
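
# Both exec expressions above binarize column 378 so GBM can treat it as a
# binomial response. The same transform on a plain Python list, for clarity:
def binarize(col, threshold=15):
    return [1 if v > threshold else 0 for v in col]

# binarize([3, 20, 15]) -> [0, 1, 0]
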
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if h2o.localhost:
            tryList = [
                (10000, 100, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (10000, 100, 'cD', 300), 
                (10000, 200, 'cE', 300), 
                (10000, 300, 'cF', 300), 
                (10000, 400, 'cG', 300), 
                (10000, 500, 'cH', 300), 
                (10000, 1000, 'cI', 300), 
                ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            hdrFilename = 'hdr_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)


            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, 
                doSummary=False)
            # hack

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']


            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5 
            prefixList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(bucket=None, path=hdrPathname, schema='put',
                    header=1, # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key, timeoutSecs=timeoutSecs, doSummary=False)
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(source=hex_key, copy_from=hdr_hex_key)

                # GBM
                print "response col name is changing each iteration: parsing a new header"
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': prefix + "_response",
                    'ignored_cols_by_name': None,
                }

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        # just plot the last one
        if DO_PLOT:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
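
# The benchmark lines logged via h2o.cloudPerfH2O.message all share one
# format string. A standalone sketch, with made-up values standing in for
# the cloud state:
def benchmark_line(jvms, heap_gb, algo, filename, secs):
    return '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
        jvms, heap_gb, algo, filename, secs)

# benchmark_line(2, 12, "Parse", 'syn.csv', 3.14)
# -> '2 jvms, 12GB heap, Parse syn.csv   3.14 secs'
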
    def test_PCA_many_cols_enum_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (10000, 10, 'cB', 300), 
            # (10000, 50, 'cC', 300), 
            (10, 5, 'cD', 300), 
            (10, 10, 'cD', 300), 
            (10, 20, 'cD', 300), 
            (10, 40, 'cD', 300), 
            (10, 80, 'cD', 300), 
            (10, 160, 'cD', 300), 
            (10, 200, 'cD', 300), 
            ]

        xList = []
        eList = []
        fList = []

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            start = time.time()
            modelKey = 'PCAModelKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # PCA(tolerance iterate)****************************************
            for tolerance in [0.1]:
            # for tolerance in [i/10.0 for i in range(1)]:
                params = {
                    'ignored_cols': 'C1',
                    'destination_key': modelKey,
                    'tolerance': tolerance,
                    'standardize': 1,
                }
                print "Using these parameters for PCA: ", params
                kwargs = params.copy()
                PCAResult = {'python_elapsed': 0, 'python_%timeout': 0}
                start = time.time()
                pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                PCAResult['python_elapsed'] = elapsed
                PCAResult['python_%timeout'] = 1.0 * elapsed / timeoutSecs
                print "PCA completed in", PCAResult['python_elapsed'], "seconds.", \
                      "%f pct. of timeout" % (PCAResult['python_%timeout'])
    
                print "Checking PCA results: "
                pcaView = h2o_cmd.runPCAView(modelKey=modelKey)
                h2o_pca.simpleCheckPCA(self, pcaView)
                h2o_pca.resultsCheckPCA(self, pcaView)

                # Logging to a benchmark file
                algo = "PCA " + " tolerance=" + str(tolerance)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, PCAResult['python_elapsed'])
                print l
                h2o.cloudPerfH2O.message(l)
                pcaInspect = pcaView
                # pull the standard deviations from the PCA view
                sdevs = pcaInspect["pca_model"]["sdev"] 
                print "PCA: standard deviations are :", sdevs
                print
                print
                propVars = pcaInspect["pca_model"]["propVar"]
                print "PCA: Proportions of variance by eigenvector are :", propVars
                print
                print
                # xList.append(ntrees)
                xList.append(numCols)
                eList.append(tolerance)
                fList.append(elapsed)


        # just plot the last one
        if 1==1:
            xLabel = 'numCols'
            eLabel = 'tolerance'
            fLabel = 'elapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
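
# The propVar values reported by PCA follow from sdev in the standard way:
# each component's variance (sdev squared) over the total variance. Sketch
# (expects a list of floats, like the sdevs printed above):
def prop_var(sdevs):
    variances = [s * s for s in sdevs]
    total = sum(variances)
    return [v / total for v in variances]

# prop_var([2.0, 1.0, 1.0]) -> [0.666..., 0.166..., 0.166...]
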
    def test_rapids_cbind_vec(self):

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        for trial in [int(100e6)]:
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            # cols = 100
            xList = []
            eList = []
            fList = []
            for trial2 in range(0, 16):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                col = 2 ** trial2
                # assert col < 16384, "h2o can't take col == 16384 or more"
             
                vString = ' '.join(['%v' for x in range(col)])
                execExpr = '(= !v2 (cbind %s))' % vString

                # FIX! check the colnames. 2 cols get C1 and C10? odd 
                # try:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)
                
                # except:
                #     elapsed2 = 0
                #     h2p.red_print("ERROR: col = %s failed" % col)

                if 1==0:
                    start = time.time()
                    execExpr = '(sum %v2 %TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(col)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'col'
            eLabel = 'elapsed (sum)'
            fLabel = 'elapsed (cbind cols)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
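
# The cbind expression in the loop above just repeats the %v vector
# reference col times, with col swept as powers of two. The string
# construction in isolation:
def make_cbind_expr(col):
    vString = ' '.join(['%v' for _ in range(col)])
    return '(= !v2 (cbind %s))' % vString

# make_cbind_expr(2) -> '(= !v2 (cbind %v %v))'
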
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if getpass.getuser() == 'kevin':
            tryList = [
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300),
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key,
             timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=200,
                                           doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not iColCount:
                iColCount = 0

            if not oColCount:
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range(0, numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i)

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            # print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict

            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key)
            l = levels['levels']
            for column in range(iColCount, iColCount + oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************

                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=1)
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile,
                              "interpolated:", q['interpolated'], "iterations",
                              q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                if 1 == 1:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column,  # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                    )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
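
# h2o_summ.quantile_comparisons cross-checks h2o's quantile against a
# sort-based reference. A hedged sketch of such a reference computation,
# with linear interpolation between neighboring order statistics (the exact
# interpolation h2o_summ uses may differ):
def percentile_sorted(sorted_vals, q):
    # q in [0, 1]; sorted_vals non-empty and ascending
    pos = q * (len(sorted_vals) - 1)
    lo = int(pos)
    hi = min(lo + 1, len(sorted_vals) - 1)
    frac = pos - lo
    return sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac

# percentile_sorted([1, 2, 3, 4], 0.5) -> 2.5
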
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [
                (100000, 400, 'cA', 300), 
                ]
        else:
            tryList = [
                # (10000, 10, 'cB', 300), 
                # (10000, 50, 'cC', 300), 
                (100000, 100, 'cD', 300), 
                (100000, 200, 'cE', 300), 
                (100000, 500, 'cG', 300), 
                (100000, 1000, 'cI', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            h2o.beta_features = False
            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            # l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                # len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            # was failing with 100 trees
            # ntrees = 100
            # for max_depth in [5,10,20,40]:
            ntrees = 10
            for max_depth in [5]:
                params = {
                    'learn_rate': .2,
                    'nbins': 10, # 1024 fail
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}MB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_MB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_rapids_vec_fail(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6),int(8e6),int(1e6)):
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial

            execExpr = '(= !vreal (c {(: #0 #%s)})' % (length - 1)
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # change it to all 1s? v = v==0
            execExpr = '(= !vint (N %vreal #0))'
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            # compare the sum times for int vs real; maybe the other one isn't
            # really real. at the least, the compression will differ
            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'

            # recursively expand
            execExpr = '(= !v2 (+ %vint <patt>))'
            for j in range(3):
                execExpr = re.sub('<patt>', '(+ %vint <patt>)', execExpr)
            # last one
            execExpr = re.sub('<patt>', '(+ %vint %vint)', execExpr)
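            # after the three recursive substitutions plus the final one, execExpr is:
            # (= !v2 (+ %vint (+ %vint (+ %vint (+ %vint (+ %vint %vint))))))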

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed2 = time.time() - start

            execExpr = '(= !v1 (+ %vreal %vreal))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start

            inspectResult = h2o_cmd.runInspect(key='vreal')
            h2o_cmd.infoFromInspect(inspectResult)

            inspectResult = h2o_cmd.runInspect(key='vint')
            h2o_cmd.infoFromInspect(inspectResult)

            summaryResult = h2o_cmd.runSummary(key='vreal')

            if execResult['num_rows']:
                keys.append(execExpr)
            
            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)


        if 1==1:
            xLabel = 'vector length'
            eLabel = 'elapsed (v1 = vint + vint)'
            fLabel = 'elapsed (v2 = vreal + vreal)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
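
    # A sketch of the timing pattern used above (hypothetical helper, not part
    # of the test framework): run one Rapids expression and return the result
    # plus elapsed seconds, so the int-vs-real comparisons read more directly.
    def timed_exec_sketch(self, execExpr, timeoutSecs=10):
        start = time.time()
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=timeoutSecs)
        return execResult, result, time.time() - start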
    def test_GBM_poker_1m(self):
        h2o.beta_features = True
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': numCols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
                    vpredict='predict', # choices are 7 (now) and 'predict'
                    )

                # errors from the end of the list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(trial)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'trial'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)

if __name__ == '__main__':
    h2o.unit_main()
    def test_GBM_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        tryList = [
            (10000,  100, 'cA', 300),
            (10000,  300, 'cB', 500),
            # (10000,  500, 'cC', 700),
            # (10000,  700, 'cD', 3600),
            # (10000,  900, 'cE', 3600),
            # (10000,  1000, 'cF', 3600),
            # (10000,  1300, 'cG', 3600),
            # (10000,  1700, 'cH', 3600),
            # (10000,  2000, 'cI', 3600),
            # (10000,  2500, 'cJ', 3600),
            # (10000,  3000, 'cK', 3600),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                }
                # both response variants should work?
                # if random.randint(0,1):
                #     params['response'] = numCols-1

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_plot_remove_keys(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100000, 50, 'cG', 400, 400),
            (200000, 50, 'cH', 400, 400),
            (400000, 50, 'cI', 400, 400),
            (800000, 50, 'cJ', 400, 400),
            (1000000, 50, 'cK', 400, 400),
        ]
        
        xList = []
        eList = []
        fList = []
        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            NUM_CASES = h2o_util.fp_format()
            sel = random.randint(0, NUM_CASES-1)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

            start = time.time()
            print csvFilename, "parse starting"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            parseElapsed = time.time() - start
            print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds"
            h2o.check_sandbox_for_errors()

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            parsedBytes = inspect['byteSize']

            node = h2o.nodes[0]
            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(parsedBytes)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'parsedBytes'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
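
    # h2o_gbm.plotLists above takes one x list plus two y lists (with labels
    # and optional titles) and, per the comments above, writes eplot.jpg and
    # fplot.jpg. A minimal sketch of the same idea with matplotlib (assumed
    # layout; the real helper's options differ):
    def plot_two_series_sketch(self, xList, xLabel, eList, eLabel, fList, fLabel):
        import matplotlib
        matplotlib.use('Agg')  # headless, like plotLists' server=True mode
        import matplotlib.pyplot as plt
        plt.plot(xList, eList, label=eLabel)
        plt.plot(xList, fList, label=fLabel)
        plt.xlabel(xLabel)
        plt.legend()
        plt.savefig('eplot.jpg')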
    def test_exec2_log_like_R(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'airlines/year2013.csv'
        # csvPathname = '1B/reals_100000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1000_15f.data'
        # csvPathname = '1B/reals_1000000x1_15f.data'
        # csvPathname = '1B/reals_1B_15f.data'
        # csvPathname = '1B/reals_100M_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        xList = []
        eList = []
        fList = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
        for trial in range(300):
            for execExpr in exprList:
                # put the trial number into the temp for uniqueness
                execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr)
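                # e.g. trial 7 turns 'Last.value.3 = r2+1' into 'Last.value7.3 = r2+1'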
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'
                c = h2o.nodes[0].get_cloud()
                c = c['nodes']

                # print (h2o.dump_json(c))
                k = [i['num_keys'] for i in c]
                v = [i['value_size_bytes'] for i in c]

                
                print "keys: %s" % " ".join(map(str,k))
                print "value_size_bytes: %s" % " ".join(map(str,v))

                # print "result:", result
                if DO_ORIG:
                    if 'r1' in execExpr:
                        xList.append(trial)
                        eList.append(execTime)
                    if 'log' in execExpr:
                        fList.append(execTime)
                else:
                    xList.append(trial)
                    eList.append(execTime)
                    fList.append(execTime)

        h2o.check_sandbox_for_errors()
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            if DO_ORIG:
                eLabel = 'time: Last.value<trial>.4 = r1[,c(1)]'
                fLabel = 'time: Last.value<trial>.7 = log(Last.value<trial>.6)'
            else:
                eLabel = 'time: Last.value.3 = r2+1'
                fLabel = 'time: Last.value.3 = r2+1'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
    def test_GBM_poker_1m(self):
        for trial in range(2):
            # PARSE train****************************************
            h2o.beta_features = False #turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'
            timeoutSecs = 900
            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"

            csvPathname = 'poker/poker-hand-testing.data'
            hex_key = 'poker-hand-testing.data.hex'
            parseTrainResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='put',
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)

            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 2
            for max_depth in [5,10,20]:
                params = {
                    'learn_rate': .1,
                    'nbins': 10,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': num_cols-1,
                    'ignored_cols_by_name': None,
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvPathname, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT_IF_KEVIN:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I go to hdfs, it's here:
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 into something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
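            # (this exec rewrites 1-based column 379 in place as a 0/1 vector:
            #  values > 15 become 1, everything else 0, i.e. a binomial response)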

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note: no inspect of the test data here, so the translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(numCols)
            del x[response]
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x+1), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % "C" + str(response+1)

            ntrees = 10
            # ignore 300 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response+1),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }
                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult =h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response+1),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errors from the end of the list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
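
    # For reference: percent-wrong from a square confusion matrix, a minimal
    # sketch of what h2o_gbm.pp_cm_summary reports (hypothetical helper; the
    # real one also copes with the trailing NA row noted above).
    def pct_wrong_sketch(self, cm):
        total = sum(sum(row) for row in cm)
        right = sum(cm[i][i] for i in range(len(cm)))
        return 100.0 * (total - right) / total if total else 0.0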
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if getpass.getuser()=='kevin':
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), 
                ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not oColCount:
                iColCount = 0

            if not oColCount:
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range(0, numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i);

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict
            
            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key);
            l = levels['levels']
            for column in range(iColCount, iColCount+oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************
                
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile, 
                    "interpolated:", q['interpolated'], "iterations", q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                if 1==0:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column, # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                        )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
    def test_exec_enums_rand_cut(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 3, 2, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            rowExprList = []
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    # cel = colEnumList[c]
                    cel = colEnumList
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' && '.join(cutExprList)
                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

                print "j:", j

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # print h2o.dump_json(inspect)

            rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(rSummary)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
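                # e.g. for colCount=5 this builds:
                # a=c(1,2,3);a[,2]=a[,2-1];a[,3]=a[,3-1];a[,4]=a[,4-1]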
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==0:
                    start = time.time()
                    e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))

                    elapsed = time.time() - start
                    print "exec 1 took", elapsed, "seconds."
                    execTime = elapsed

                if 1==1:
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                if 1==0:
                    gKey = random.choice(eKeys)
                    # do a 2nd random to see if things blow up
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                    elapsed = time.time() - start
                    print "exec 3 took", elapsed, "seconds."

                if 1==1:
                    inspect = h2o_cmd.runInspect(key=fKey)
                    h2o_cmd.infoFromInspect(inspect, fKey)
                    numRows = inspect['numRows']
                    numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_GBM_params_rand2(self):
        h2o.beta_features = False
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        files = [
                # ('standard', 'covtype.shuffled.90pct.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ('standard', 'covtype.shuffled.10pct.sorted.data', 'covtype.train.hex', 1800, 54, 'covtype.shuffled.10pct.data', 'covtype.test.hex')
                ]

        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False #turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + trainFilename, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # GBM (train iterate)****************************************
            inspect = h2o_cmd.runInspect(key=parseTestResult['destination_key'])
            paramsDict = define_gbm_params()
            for trial in range(3):
                h2o.beta_features = True
                # translate it (only really need to do it once; move out of the loop?)
                h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
                ### h2o_cmd.runSummary(key=parsTraineResult['destination_key'])

                # use this to set any defaults you want if the pick doesn't set
                params = {
                    'response': 54, 
                    'ignored_cols_by_name': '0,1,2,3,4',
                    'ntrees': 2,
                    'validation': parseTestResult['destination_key'],
                }
                h2o_gbm.pickRandGbmParams(paramsDict, params)
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errors from the end of the list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm);
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                if DO_PREDICT_CM:
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual='predict',
                        predict=predictKey,
                        vpredict='predict', # choices are 7 (now) and 'predict'
                        )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cms'][-1] # use the last one

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                if 'max_depth' in params and params['max_depth']:
                    xList.append(params['max_depth'])
                    eList.append(pctWrongTrain)
                    fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrongTrain'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
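
# A hedged sketch of the plotting idiom used throughout these tests: one
# x-axis with two metrics (e.g. elapsed parse time and elapsed remove time).
# h2o_gbm.plotLists is harness code not shown here, so this stand-in assumes
# matplotlib and only approximates what the real call renders.
import matplotlib.pyplot as plt

def plot_two_series(xList, xLabel, eList, eLabel, fList, fLabel):
    fig, ax1 = plt.subplots()
    ax1.plot(xList, eList, 'b-o')
    ax1.set_xlabel(xLabel)
    ax1.set_ylabel(eLabel, color='b')
    ax2 = ax1.twinx()  # second y-axis so both metrics share one x-axis
    ax2.plot(xList, fList, 'r-s')
    ax2.set_ylabel(fLabel, color='r')
    plt.show()
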
    def test_rapids_rbind(self):

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        ROWS = int(100e6)
        for trial in [ROWS]:
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            xList = []
            eList = []
            fList = []
            # gets out of memory error if we rbind too much
            for trial2 in range(1, 8, 2):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                rows = ROWS * trial2
             
                vString = ' '.join(['%v' for x in range(trial2)])
                execExpr = '(= !v2 (rbind %s))' % vString

                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)
                
                if 1==1:
                    start = time.time()
                    execExpr = '(sum %v2 %TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(rows)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'rows'
            eLabel = 'elapsed (sum)'
            fLabel = 'elapsed (rbind)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
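
# Standalone sketch of the Rapids string construction in test_rapids_rbind
# above: the %v key reference is repeated trial2 times and spliced into an
# rbind expression. No cluster is needed to see the shapes generated.
for trial2 in range(1, 8, 2):
    vString = ' '.join(['%v' for _ in range(trial2)])
    print '(= !v2 (rbind %s))' % vString
# (= !v2 (rbind %v))
# (= !v2 (rbind %v %v %v))
# (= !v2 (rbind %v %v %v %v %v))
# (= !v2 (rbind %v %v %v %v %v %v %v))
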
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice
    
                cutExprList = []

                pKey = Key('p')
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:,i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else: 
                        cutExpr = ce

                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)


            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1,2,3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))
            
            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
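
# Sketch of the cutExpr fold in test_exec2_enums_rand_cut above: pairwise '&'
# combines a list of comparisons into one conjunction. Plain strings stand in
# for the harness's Fcn/Key expression objects here.
from functools import reduce  # a builtin on Python 2; imported for Python 3

def and_fold(exprs):
    return reduce(lambda a, b: '(& %s %s)' % (a, b), exprs)

print and_fold(['c0==3', 'c2==1', 'c5==7'])
# (& (& c0==3 c2==1) c5==7)
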
    def test_plot_remove_keys_manyfiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now"
        tryList = [
            ("file_1[0-9].dat.gz", 'c10', 400),
            ("file_[1-2][0-9].dat.gz", 'c20', 400),
            ("file_[1-4][0-9].dat.gz", 'c40', 400),
            ("file_[1-8][0-9].dat.gz", 'c80', 400),
            # don't do this case. timesout at 300 sec on polling with 172-180
            # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200),
        ]

        xList = []
        eList = []
        fList = []
        importFolderPath = "manyfiles-nflx-gz"
        for (csvFilePattern, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            parseResult = h2i.import_parse(bucket="home-0xdiag-datasets",
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           retryDelaySecs=3,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            parseElapsed = time.time() - start
            print "Parse only:", parseResult[
                'destination_key'], "took", parseElapsed, "seconds"
            h2o.check_sandbox_for_errors()

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            parsedBytes = inspect['byteSize']

            node = h2o.nodes[0]
            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(parsedBytes)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'parsedBytes'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
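
# The start = time.time() / elapsed = time.time() - start pairs above repeat
# many times; a small context manager is one way to factor that out. An idiom
# suggestion only, not part of the h2o harness.
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    start = time.time()
    yield
    print label, "took", time.time() - start, "seconds"

# usage sketch:
# with timed("parse only"):
#     parseResult = h2i.import_parse(...)
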
    def test_GBM_many_cols_enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']

        if getpass.getuser() == 'kevin': # longer run
            tryList = [
                (10000,  100, 'cA', 300),
                (10000,  300, 'cB', 500),
                # (10000,  500, 'cC', 700),
                # (10000,  700, 'cD', 3600),
                # (10000,  900, 'cE', 3600),
                # (10000,  1000, 'cF', 3600),
                # (10000,  1300, 'cG', 3600),
                # (10000,  1700, 'cH', 3600),
                # (10000,  2000, 'cI', 3600),
                # (10000,  2500, 'cJ', 3600),
                # (10000,  3000, 'cK', 3600),
                ]
        else:
            tryList = [
                (10000,  100, 'cA', 100),
                (10000,  300, 'cC', 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)

            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = 'GBMModelKey'

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to use noPoll=True, and doSummary=False!"
            parseTrainResult = h2i.import_parse(bucket=None, path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = hex_key

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            # Logging to a benchmark file
            algo = "Parse"
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            h2o.beta_features = True
            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(numCols-1),
                    'ignored_cols_by_name': None,
                }
                # both response variants should work?
                # if random.randint(0,1):
                #    params['response'] = numCols-1,
                
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=h2o.beta_features, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed)
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

        h2o.beta_features = False
        # just plot the last one
        if 1==1:
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
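
# The benchmark log line built above, runnable on its own with made-up
# numbers. The {:6.2f} spec renders the elapsed seconds right-aligned in a
# 6-wide, 2-decimal field.
l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
    3, 12, "Parse", "syn_binary_10000x100.csv", 7.4312)
print l
# 3 jvms, 12GB heap, Parse syn_binary_10000x100.csv   7.43 secs
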
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            h2o.beta_features = False  # turn off beta_features
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to use noPoll=True, and doSummary=False!"
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult['destination_key'] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (trainKey, trainKey, trainKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Parse (test)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to use noPoll=True, and doSummary=False!"

            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, noPoll=h2o.beta_features, doSummary=False)
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTestResult['destination_key'] for h2o"
                parseTestResult['destination_key'] = testKey

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s=colSwap(%s,378,(%s[378]>15 ? 1 : 0))' % (testKey, testKey, testKey)
            resultExec = h2o_cmd.runExec(expression=execExpr, timeoutSecs=60)

            # Note: no inspect of the test data here, so the translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(num_cols)
            del x[response]
            ignored_cols_by_name = ",".join(map(str,random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have an output value too)" % response

            ntrees = 10
            # ignore 300 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    'ignored_cols_by_name': ignored_cols_by_name,
                }

                if FORCE_FAIL_CASE:
                    params = {'learn_rate': 0.2, 'classification': None, 'min_rows': 10, 'ntrees': 10, 'response': 378, 'nbins': 1024, 'ignored_cols_by_name': '256, 382, 399, 50, 176, 407, 375, 113, 170, 313, 364, 33, 361, 426, 121, 371, 232, 327, 480, 75, 37, 312, 225, 195, 244, 406, 268, 230, 321, 257, 274, 197, 35, 501, 360, 72, 213, 79, 1, 466, 362, 160, 444, 437, 5, 59, 108, 454, 73, 374, 509, 337, 183, 252, 21, 314, 100, 200, 159, 379, 405, 367, 432, 181, 8, 420, 118, 284, 281, 465, 456, 359, 291, 330, 258, 523, 243, 487, 408, 392, 15, 231, 482, 481, 70, 171, 182, 31, 409, 492, 471, 53, 45, 448, 83, 527, 452, 350, 423, 93, 447, 130, 126, 54, 354, 169, 253, 49, 42, 431, 305, 498, 216, 189, 508, 122, 308, 228, 190, 293, 451, 63, 133, 304, 397, 425, 333, 19, 158, 391, 153, 282, 112, 64, 502, 7, 16, 469, 163, 136, 40, 99, 302, 264, 325, 434, 187, 311, 286, 278, 179, 109, 348, 287, 467, 400, 164, 384, 422, 43, 117, 91, 276, 211, 175, 329, 541, 438, 145, 534, 218, 177, 317, 222, 210, 162, 402, 98, 299, 245, 385, 233, 188, 516, 143, 13, 532, 429, 172, 455, 470, 518, 236, 296, 388, 468, 110, 395, 185, 25, 489, 196, 120, 435, 165, 168, 271, 74, 510, 36, 76, 208, 223, 270, 515, 421, 87, 66, 473, 220, 46, 486, 102, 38, 156, 48, 132, 331, 51, 403, 234, 23, 449, 341, 303, 410, 479, 203, 413, 512, 513, 9, 446, 511, 55, 6, 339, 418, 476, 178, 266, 22, 141, 259, 349, 86, 144, 34, 290, 326, 318, 519, 424, 127, 174, 472, 116, 17, 152, 280, 215, 514, 103, 377, 537, 373, 238, 47, 353, 428, 94, 214, 61, 123, 386, 351, 246, 411, 101, 249, 240, 520, 307, 288, 199, 147, 436, 77, 464, 414', 'source': u'test.hex', 'validation': u'test.hex', 'max_depth': 5} 

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    noPoll=True, timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                print "This is crazy!"
                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual=response,
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            h2o.beta_features = False
            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
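
# Standalone sketch of the ignored-column sampling above, with smaller
# numbers: take every column index, drop the response, then sample which
# columns to ignore.
import random
num_cols = 20
response = 7
x = list(range(num_cols))
del x[response]  # never ignore the response column
print ",".join(map(str, random.sample(x, 5)))
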
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DO_KNOWN_FAIL:
            tryList = [
                (1000000, 5, 'cD', 0, 320, 30),
            ]
        else:
            tryList = [
                (1000000, 5, 'cD', 0, 10, 30),
                (1000000, 5, 'cD', 0, 20, 30),
                (1000000, 5, 'cD', 0, 40, 30),
                (1000000, 5, 'cD', 0, 50, 30),
                (1000000, 5, 'cD', 0, 80, 30),
                # (1000000, 5, 'cD', 0, 160, 30),
                # fails..don't do
                # (1000000, 5, 'cD', 0, 320, 30),
                # (1000000, 5, 'cD', 0, 320, 30),
                # starts to fail here. too many groups?
                # (1000000, 5, 'cD', 0, 640, 30),
                # (1000000, 5, 'cD', 0, 1280, 30),
            ]

        if DO_APPEND_KNOWN_FAIL2:
            tryList.append((1000000, 5, 'cD', 0, 160, 30), )
            #tryList.append(
            #    (1000000, 5, 'cD', 0, 320, 30),
            #)
        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt,
             timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            if DO_KNOWN_FAIL:
                # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
                # csvFilename = 'a1' # fails
                csvFilename = "syn_ddply_1Mx5_0_320.gz"
                bucket = "home-0xdiag-datasets"
                csvPathname = "standard/" + csvFilename
                minInt = 0
                maxInt = 320
            else:
                bucket = None
                csvFilename = 'syn_' + "binary" + "_" + str(
                    rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                print "Creating random", csvPathname, "with range", (
                    maxInt - minInt) + 1
                write_syn_dataset(csvPathname, rowCount, colCount, minInt,
                                  maxInt, SEEDPERFILE)

            for lll in range(1):
                # PARSE train****************************************
                hexKey = 'r.hex'
                parseResult = h2i.import_parse(bucket=bucket,
                                               path=csvPathname,
                                               schema='put',
                                               hex_key=hexKey)
                inspect = h2o_cmd.runInspect(key=hexKey)
                missingValuesList = h2o_cmd.infoFromInspect(
                    inspect, csvFilename)
                self.assertEqual(
                    missingValuesList, [],
                    "a1 should have no NAs in parsed dataset: %s" %
                    missingValuesList)

                for resultKey, execExpr in initList:
                    h2e.exec_expr(h2o.nodes[0],
                                  execExpr,
                                  resultKey=resultKey,
                                  timeoutSecs=60)

                #*****************************************************************************************
                # two columns. so worst case every combination of each possible value
                # only true if enough rows (more than the range?)
                maxExpectedGroups = ((maxInt - minInt) + 1)**2
                # do it twice..to get the optimal cached delay for time?
                execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                     execExpr,
                                                     resultKey=None,
                                                     timeoutSecs=500)
                groups = execResult['num_rows']
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(
                    groups,
                    maxExpectedGroups,
                    rel=0.2,
                    msg=
                    "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s"
                    % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a1dump = h2o_cmd.runInspect(key="a1")
                print "a1", h2o.dump_json(a1dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
                self.assertEqual(
                    missingValuesList, [],
                    "a1 should have no NAs: %s trial: %s" %
                    (missingValuesList, trial))

                #*****************************************************************************************

                execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                     execExpr,
                                                     resultKey=None,
                                                     timeoutSecs=500)
                groups = execResult['num_rows']
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(
                    groups,
                    maxExpectedGroups,
                    rel=0.2,
                    msg=
                    "groups %s isn't close to expected amount %s, minInt: %s maxInt: %s"
                    % (groups, maxExpectedGroups, minInt, maxInt))
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a2dump = h2o_cmd.runInspect(key="a2")
                print "a2", h2o.dump_json(a2dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
                self.assertEqual(
                    missingValuesList, [],
                    "a2 should have no NAs: %s trial: %s" %
                    (missingValuesList, trial))

                #*****************************************************************************************
                # should be same answer in both cases
                execExpr = "sum(a1!=a2)==0"
                (execResult, result) = h2e.exec_expr(h2o.nodes[0],
                                                     execExpr,
                                                     resultKey=None,
                                                     timeoutSecs=500)
                execExpr = "s=c(0); s=(a1!=a2)"
                (execResult1, result1) = h2e.exec_expr(h2o.nodes[0],
                                                       execExpr,
                                                       resultKey=None,
                                                       timeoutSecs=500)
                print "execResult", h2o.dump_json(execResult)

                #*****************************************************************************************

                # should never have any NAs in this result
                sdump = h2o_cmd.runInspect(key="s")
                print "s", h2o.dump_json(sdump)
                self.assertEqual(
                    result, 1,
                    "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error?), so multiple ddply() runs can have different answers. %s %s %s"
                    % (FUNC_PHRASE, result, h2o.dump_json(execResult)))

                # xList.append(ntrees)
                trial += 1
                # this is the biggest it might be ..depends on the random combinations
                # groups = ((maxInt - minInt) + 1) ** 2
                xList.append(groups)
                eList.append(ddplyElapsed)
                fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
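
# The group-count sanity check above, as a standalone sketch. Two grouping
# columns drawn from [minInt, maxInt] give at most ((maxInt-minInt)+1)**2
# distinct groups; h2o_util.assertApproxEqual is harness code, so a plain
# relative-tolerance check stands in for it here.
def assert_approx_equal(actual, expected, rel):
    assert abs(actual - expected) <= rel * expected, \
        "%s isn't within %s%% of %s" % (actual, rel * 100, expected)

minInt, maxInt = 0, 80
maxExpectedGroups = ((maxInt - minInt) + 1) ** 2  # 6561
assert_approx_equal(6400, maxExpectedGroups, rel=0.2)
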
    def test_GBM_manyfiles_train_test(self):
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if h2o.localhost:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_1[0-9][0-9].dat.gz', 'file_100.hex', 1800, None, 'file_1.dat.gz', 'file_1_test.hex')
                ]
        else:
            files = [
                # None forces num_cols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'file_10.hex', 1800, None, 'file_1[0-9].dat.gz', 'file_10_test.hex')
                ]

        # if I got to hdfs, it's here
        # hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        # h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])
            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']

            # Make col 378 something we can do binomial regression on!
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=500)

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)

            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 something we can do binomial regression on!
            print "Slow! exec is converting all imported keys, not just what was parsed?"
            execExpr = '%s[,378] = %s[,378]>15 ? 1 : 0' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)

            # Note: no inspect of the test data here, so the translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = num_cols - 1
            response = 378
            print "Using the same response %s for train and test (which should have an output value too)" % response

            ntrees = 10
            for max_depth in [5,10,20,40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': response,
                    # 'ignored_cols': 
                }
                print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cm']
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                if doPredict:
                    predictKey = 'Predict.hex'
                    ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                    start = time.time()
                    gbmTestResult = h2o_cmd.runPredict(
                        data_key=parseTestResult['destination_key'], 
                        model_key=modelKey,
                        destination_key=predictKey,
                        timeoutSecs=timeoutSecs)
                    elapsed = time.time() - start
                    print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                    print "This is crazy!"
                    gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                        actual=parseTestResult['destination_key'],
                        vactual=response,
                        predict=predictKey,
                        vpredict='predict', # choices are 0 and 'predict'
                        )

                    # errs from end of list? is that the last tree?
                    # all we get is cm
                    cm = gbmPredictCMResult['cm']

                    # These will move into the h2o_gbm.py
                    pctWrong = h2o_gbm.pp_cm_summary(cm)
                    print "Last line of this cm is really NAs, not CM"
                    print "\nTest\n==========\n"
                    print h2o_gbm.pp_cm(cm)

                    # xList.append(ntrees)
                    xList.append(max_depth)
                    eList.append(pctWrong)
                    fList.append(trainElapsed)


            if doPredict:
                xLabel = 'max_depth'
                eLabel = 'pctWrong'
                fLabel = 'trainElapsed'
                eListTitle = ""
                fListTitle = ""
                h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
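
# What the exec expression '...[,378] = ...[,378]>15 ? 1 : 0' above does,
# shown on a plain Python list; the real work happens inside H2O via runExec.
col378 = [3, 22, 15, 40, 7]
print [1 if v > 15 else 0 for v in col378]
# [0, 1, 0, 1, 0]
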
    def test_rapids_vec_fail1(self):
        start = time.time()
        xList = []
        eList = []
        fList = []

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        for trial in range(int(1e6), int(100e6), int(10e6)):

            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)

            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ %v %v) (+ %v %v))'
            execExpr = '(= !v (+ %v %v))'
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=30)
            elapsed2 = time.time() - start

            if execResult['num_rows']:
                keys.append(execExpr)

            xList.append(length)
            eList.append(elapsed1)
            fList.append(elapsed2)

        if 1 == 1:
            xLabel = 'vector length'
            eLabel = 'elapsed (create v)'
            fLabel = 'elapsed (v = v + v)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)

        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
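
# The vector-creation expressions generated by the loop above, printed for
# the first two lengths. The apparently unbalanced paren is reproduced
# exactly as the test builds it; whether that is deliberate for this "fail"
# test isn't stated in the source.
for length in (int(1e6), int(11e6)):
    print '(= !v (c {(: #0 #%s)})' % (length - 1)
# (= !v (c {(: #0 #999999)})
# (= !v (c {(: #0 #10999999)})
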
    def test_plot_remove_keys(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100000, 100, 'cG', 400),
            (200000, 100, 'cH', 400),
            (400000, 100, 'cI', 400),
            (800000, 100, 'cJ', 400),
            (1000000, 100, 'cK', 400),
        ]

        xList = []
        eList = []
        fList = []
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            NUM_CASES = h2o_util.fp_format()
            sel = random.randint(0, NUM_CASES - 1)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount,
                                                   colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              sel)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parseElapsed = pA.python_elapsed
            parse_key = pA.parse_key
            byteSize = pA.byteSize
            numRows = iA.numRows
            numCols = iA.numCols
            print parse_key, parseElapsed, byteSize, numRows, numCols

            labelList = iA.labelList
            node = h2o.nodes[0]

            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(byteSize)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'byteSize'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
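
# Sketch of the per-file seeding pattern used above: each synthetic file
# draws its own seed, so one dataset can be regenerated in isolation. A
# dedicated random.Random is one way a writer like write_syn_dataset might
# consume it (an assumption; the harness code isn't shown).
# sys.maxint is Python 2; sys.maxsize is the Python 3 spelling.
import random, sys
SEEDPERFILE = random.randint(0, sys.maxint)
r = random.Random(SEEDPERFILE)  # per-file RNG, reproducible from the seed
print SEEDPERFILE, r.randint(0, 9)
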
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DO_KNOWN_FAIL:
            tryList = [(1000000, 5, "cD", 0, 320, 30)]
        else:
            tryList = [
                # (1000000, 5, 'cD', 0, 10, 30),
                # (1000000, 5, 'cD', 0, 20, 30),
                # (1000000, 5, 'cD', 0, 40, 30),
                # (1000000, 5, 'cD', 0, 50, 30),
                (1000000, 5, "cD", 0, 80, 30),
                (1000000, 5, "cD", 0, 160, 30),
                # fails..don't do
                # (1000000, 5, 'cD', 0, 320, 30),
                # (1000000, 5, 'cD', 0, 320, 30),
                # starts to fail here. too many groups?
                # (1000000, 5, 'cD', 0, 640, 30),
                # (1000000, 5, 'cD', 0, 1280, 30),
            ]

        if DO_APPEND_KNOWN_FAIL2:
            tryList.append((1000000, 5, "cD", 0, 160, 30))
            tryList.append((1000000, 5, "cD", 0, 320, 30))
        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            if DO_KNOWN_FAIL:
                # csvFilename = 'syn_binary_1000000x5.csv.gz' # fails
                # csvFilename = 'a1' # fails
                csvFilename = "syn_ddply_1Mx5_0_320.gz"
                bucket = "home-0xdiag-datasets"
                csvPathname = "standard/" + csvFilename
                minInt = 0
                maxInt = 320
            else:
                bucket = None
                csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
                csvPathname = SYNDATASETS_DIR + "/" + csvFilename
                print "Creating random", csvPathname, "with range", (maxInt - minInt) + 1
                write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

            for lll in range(5):
                # PARSE train****************************************
                hexKey = "r.hex"
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="local", hex_key=hexKey)
                inspect = h2o_cmd.runInspect(key=hexKey)
                missingValuesList = h2o_cmd.infoFromInspect(inspect, csvFilename)
                self.assertEqual(
                    missingValuesList, [], "a1 should have no NAs in parsed dataset: %s" % missingValuesList
                )

                for resultKey, execExpr in initList:
                    h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)

                # *****************************************************************************************
                # two columns. so worst case every combination of each possible value
                # only true if enough rows (more than the range?)
                maxExpectedGroups = ((maxInt - minInt) + 1) ** 2
                # do it twice..to get the optimal cached delay for time?
                execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult["num_rows"]
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(
                    groups,
                    maxExpectedGroups,
                    rel=0.2,
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s"
                    % (groups, maxExpectedGroups, minInt, maxInt),
                )
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a1dump = h2o_cmd.runInspect(key="a1")
                print "a1", h2o.dump_json(a1dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a1dump, "a1")
                self.assertEqual(
                    missingValuesList, [], "a1 should have no NAs: %s trial: %s" % (missingValuesList, trial)
                )

                # *****************************************************************************************

                execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
                start = time.time()
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                groups = execResult["num_rows"]
                # this is a coarse comparison, statistically not valid for small rows, and certain ranges?
                h2o_util.assertApproxEqual(
                    groups,
                    maxExpectedGroups,
                    rel=0.2,
                    msg="groups %s isn't close to expected amount %s, minInt: %s maxInt: %s"
                    % (groups, maxExpectedGroups, minInt, maxInt),
                )
                ddplyElapsed = time.time() - start
                print "ddplyElapsed:", ddplyElapsed
                print "execResult", h2o.dump_json(execResult)

                a2dump = h2o_cmd.runInspect(key="a2")
                print "a2", h2o.dump_json(a2dump)
                # should never have any NAs in this result
                missingValuesList = h2o_cmd.infoFromInspect(a2dump, "a2")
                self.assertEqual(
                    missingValuesList, [], "a2 should have no NAs: %s trial: %s" % (missingValuesList, trial)
                )

                # *****************************************************************************************
                # should be same answer in both cases
                execExpr = "sum(a1!=a2)==0"
                (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                execExpr = "s=c(0); s=(a1!=a2)"
                (execResult1, result1) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=90)
                print "execResult", h2o.dump_json(execResult)

                # *****************************************************************************************

                # should never have any NAs in this result
                sdump = h2o_cmd.runInspect(key="s")
                print "s", h2o.dump_json(sdump)
                self.assertEqual(
                    result,
                    1,
                    "a1 and a2 weren't equal? Maybe ddply can vary execution order (fp error? so multiple ddply() can have different answer. %s %s %s"
                    % (FUNC_PHRASE, result, h2o.dump_json(execResult)),
                )

                # xList.append(ntrees)
                trial += 1
                # this is the biggest it might be ..depends on the random combinations
                # groups = ((maxInt - minInt) + 1) ** 2
                xList.append(groups)
                eList.append(ddplyElapsed)
                fList.append(ddplyElapsed)

        if DO_PLOT:
            xLabel = "groups"
            eLabel = "ddplyElapsed"
            fLabel = "ddplyElapsed"
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
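# A hedged sketch of what a plotLists-style helper might do: plot the two elapsed
# series against the shared x series and save the figures (comments later in this
# file mention eplot.jpg and fplot.jpg in the local dir). This is a hypothetical
# stand-in assuming matplotlib, not the real h2o_gbm.plotLists.
import matplotlib.pyplot as plt

def plot_lists_sketch(xList, xLabel, eList, eLabel, fList, fLabel):
    for yList, yLabel, fname in ((eList, eLabel, 'eplot.jpg'), (fList, fLabel, 'fplot.jpg')):
        fig, ax = plt.subplots()
        ax.plot(xList, yList, marker='o')  # one point per trial
        ax.set_xlabel(xLabel)
        ax.set_ylabel(yLabel)
        fig.savefig(fname)
        plt.close(fig)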
# Example #43
    def test_ddply_plot(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (1000000, 5, 'cD', 0, 10, 30), 
            (1000000, 5, 'cD', 0, 20, 30), 
            (1000000, 5, 'cD', 0, 30, 30), 
            (1000000, 5, 'cD', 0, 40, 30), 
            (1000000, 5, 'cD', 0, 50, 30), 
            (1000000, 5, 'cD', 0, 70, 30), 
            (1000000, 5, 'cD', 0, 100, 30), 
            (1000000, 5, 'cD', 0, 130, 30), 
            (1000000, 5, 'cD', 0, 160, 30), 
            # (1000000, 5, 'cD', 0, 320, 30), 
            # starts to fail here. too many groups?
            # (1000000, 5, 'cD', 0, 640, 30), 
            # (1000000, 5, 'cD', 0, 1280, 30), 
            ]

        ### h2b.browseTheCloud()
        xList = []
        eList = []
        fList = []
        trial = 0
        for (rowCount, colCount, hex_key, minInt, maxInt, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'

            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname, "with range", (maxInt-minInt)+1
            write_syn_dataset(csvPathname, rowCount, colCount, minInt, maxInt, SEEDPERFILE)

            # PARSE train****************************************
            hexKey = 'r.hex'
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hexKey)

            for resultKey, execExpr in initList:
                h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=60)


            # do it twice..the second timed run may benefit from caching?
            execExpr = "a1 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed

            execExpr = "a2 = ddply(r.hex, c(1,2), " + PHRASE + ")"
            start = time.time()
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
            groups = execResult['num_rows']
            maxExpectedGroups = ((maxInt - minInt) + 1) ** 2
            h2o_util.assertApproxEqual(groups, maxExpectedGroups,  rel=0.2, 
                msg="groups %s isn't close to expected amount %s" % (groups, maxExpectedGroups))

            ddplyElapsed = time.time() - start
            print "ddplyElapsed:", ddplyElapsed
            print "execResult", h2o.dump_json(execResult)

            # should be same answer in both cases
            execExpr = "d=sum(a1!=a2)==0"
            (execResult, result) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
            print "execResult", h2o.dump_json(execResult)
            self.assertEqual(result, 1, "a1 and a2 weren't equal? %s" % result)

            # xList.append(ntrees)
            trial += 1
            # this is the biggest it might be ..depends on the random combinations
            # groups = ((maxInt - minInt) + 1) ** 2
            xList.append(groups)
            eList.append(ddplyElapsed)
            fList.append(ddplyElapsed)
            

        if DO_PLOT:
            xLabel = 'groups'
            eLabel = 'ddplyElapsed'
            fLabel = 'ddplyElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
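# The test above calls write_syn_dataset, which isn't shown in this excerpt. A
# plausible sketch under the signature used here -- hypothetical, assuming each cell
# is a random int in [minInt, maxInt], seeded per file:
import random

def write_syn_dataset_sketch(csvPathname, rowCount, colCount, minInt, maxInt, seed):
    r = random.Random(seed)  # SEEDPERFILE above gives each file its own stream
    with open(csvPathname, 'w') as f:
        for _ in range(rowCount):
            f.write(','.join(str(r.randint(minInt, maxInt)) for _ in range(colCount)) + '\n')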
# Example #44
    def test_exec2_col_add(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if localhost:
            # csvPathname = '1B/reals_100000x1000_15f.data'
            # csvPathname = '1B/reals_1000000x1000_15f.data'
            csvPathname = '1B/reals_1000000x1_15f.data'
            # csvPathname = '1B/reals_1B_15f.data'
            # csvPathname = '1B/reals_100M_15f.data'
        else:
            # csvPathname = '1B/reals_100000x1000_15f.data'
            # csvPathname = '1B/reals_1000000x1000_15f.data'
            csvPathname = '1B/reals_1000000x1_15f.data'
            # csvPathname = '1B/reals_1B_15f.data'
            # csvPathname = '1B/reals_100M_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2, doSummary=False)
        inspect = h2o_cmd.runInspect(key=hex_key)
        print "numRows:", inspect['numRows']
        print "numCols:", inspect['numCols']
        inspect = h2o_cmd.runInspect(key=hex_key, offset=-1)
        print "inspect offset = -1:", h2o.dump_json(inspect)

        xList = []
        eList = []
        fList = []
        for execExpr in initList:
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
        for trial in range(1000):
            for execExpr in exprList:
                # put the trial number into the temp for uniqueness
                execExpr = re.sub('Last.value', 'Last.value%s' % trial, execExpr)
                execExpr = re.sub(',1', ',%s' % trial, execExpr)
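                # e.g. on trial 7 this rewrites 'Last.value' -> 'Last.value7' and
                # ',1' -> ',7', so each trial gets unique temp keys (note it also
                # shifts any ',1' column index in the expression)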
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=300)
                execTime = time.time() - start
                print 'exec took', execTime, 'seconds'
                c = h2o.nodes[0].get_cloud()
                c = c['nodes']

                # print (h2o.dump_json(c))
                k = [i['num_keys'] for i in c]
                v = [i['value_size_bytes'] for i in c]

                
                print "keys: %s" % " ".join(map(str,k))
                print "value_size_bytes: %s" % " ".join(map(str,v))

                # print "result:", result
                if ('r1' in execExpr) and (not 'apply' in execExpr):
                    xList.append(trial)
                    eList.append(execTime)
                if ('apply' in execExpr):
                    fList.append(execTime)

        h2o.check_sandbox_for_errors()
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'time: r1[,1] = Last.value = r2'
            fLabel = 'time: apply(r1, 2, sum)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
# Example #45
    def test_rapids_cbind_vec(self):

        bucket = 'smalldata'
        csvPathname = 'iris/iris_wheader.csv'
        hexKey = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

        keys = []
        # stop if > 1G (fails memory cleaner assertion)
        maxx = 29
        # for trial in range(maxx):
        # for trial in range(int(1e6),int(200e6),int(1e6)):
        for trial in [int(100e6)]:
            
            # length = (2 ** trial)
            # execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
            length = trial
            execExpr = '(= !v (c {(: #0 #%s)})' % (length - 1)
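            # Rapids notation as used here (my reading of these tests): ! marks the
            # key being assigned, $ references an existing key, # prefixes a number
            # literal, (: #0 #N) is the span 0..N, and (c {...}) combines it into a vec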
    
            start = time.time()
            execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            elapsed1 = time.time() - start
            if execResult['num_rows']:
                keys.append(execExpr)

            # execExpr = '(= !v (+ (+ $v $v) (+ $v $v))'
            # cols = 100
            xList = []
            eList = []
            fList = []
            for trial2 in range(0, 16):
            # for trial2 in range(0, 10):
            # fails. Post size?
            # for trial2 in range(0, 16):
                col = 2 ** trial2
                # assert col < 16384, "h2o can't take col == 16384 or more"
             
                vString = ' '.join(['$v' for x in range(col)])
                execExpr = '(= !v2 (cbind %s))' % vString
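                # e.g. col=4 expands to '(= !v2 (cbind $v $v $v $v))': the same
                # vector cbound to itself col times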

                # FIX! check the colnames. 2 cols get C1 and C10? odd 
                # try:
                start = time.time()
                execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=40)
                elapsed2 = time.time() - start

                if execResult['num_rows']:
                    keys.append(execExpr)
                
                # except:
                #     elapsed2 = 0
                #     h2p.red_print("ERROR: col = %s failed" % col)

                if 1==0:
                    start = time.time()
                    execExpr = '(sum $v2 $TRUE)'
                    execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=60)
                    elapsed1 = time.time() - start

                # xList.append(length)
                xList.append(col)
                eList.append(elapsed1)
                fList.append(elapsed2)


        if 1==1:
            xLabel = 'col'
            eLabel = 'elapsed (vec create)'  # the sum timing above is disabled (if 1==0)
            fLabel = 'elapsed (cbind cols)'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



        print "\nExpressions that created keys"
        for k in keys:
            print k

        # for execExpr in exprList:
        #     h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10)

        h2o.check_sandbox_for_errors()
# Example #46
    def test_rf_covtype_fvec(self):
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
                                            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key,
                                           timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList  = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList  = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            timeoutSecs = 30 + paramDict['ntrees'] * 200
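            # e.g. ntrees=30 budgets 30 + 30*200 = 6030 seconds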


            # do several starts (TRIES), to see the bad id problem?
            TRIES = 5
            for i in range(TRIES):
                lastOne = i==(TRIES-1)

                # have unique model names
                trial += 1
                kwargs = paramDict.copy()
                model_key = 'RFModel_' + str(trial)
                kwargs['destination_key'] = model_key
                data_key = parseTrainResult['destination_key']

                start = time.time()
                rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                                         noPoll=True, **kwargs)
                trainElapsed = time.time() - start
                print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

                # don't cancel the last one
                if not lastOne:
                    time.sleep(1)
                    h2o_jobs.cancelAllJobs(timeoutSecs=2)


            ### print "rfView", h2o.dump_json(rfView)
            print "We have a result from the RF above, completed but didn't do RFView yet"
            # could the RF indicate 'done' too soon?
            # if rfResult['state']=='RUNNING':
            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))

            # if 'drf_model' not in rfResult:
            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
            h2o_jobs.pollWaitJobs(timeoutSecs=300)
            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
            print "rfView:", h2o.dump_json(rfView)

            rfView["drf_model"] = rfView.pop("speedrf_model")
            rf_model = rfView['drf_model']
            cms = rf_model['cms']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
            # we iterate over params, so can't really do this check
            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

            print "classErrorPctList:", classErrorPctList
            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
            # FIX! should update this expected classification error
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
# Example #47
    def test_plot_remove_keys(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (100000, 50, 'cG', 400, 400),
            (200000, 50, 'cH', 400, 400),
            (400000, 50, 'cI', 400, 400),
            (800000, 50, 'cJ', 400, 400),
            (1000000, 50, 'cK', 400, 400),
        ]

        xList = []
        eList = []
        fList = []
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            NUM_CASES = h2o_util.fp_format()
            sel = random.randint(0, NUM_CASES - 1)
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount,
                                                   colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              sel)

            start = time.time()
            print csvFilename, "parse starting"
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            parseElapsed = time.time() - start
            print "Parse only:", parseResult[
                'destination_key'], "took", parseElapsed, "seconds"
            h2o.check_sandbox_for_errors()

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            parsedBytes = inspect['byteSize']

            node = h2o.nodes[0]
            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(parsedBytes)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1 == 1:
            xLabel = 'parsedBytes'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
# Example #48
    def test_GBM_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            tryList = [(10000, 100, "cA", 300)]
        else:
            tryList = [
                # (10000, 10, 'cB', 300),
                # (10000, 50, 'cC', 300),
                (10000, 100, "cD", 300),
                (10000, 200, "cE", 300),
                (10000, 300, "cF", 300),
                (10000, 400, "cG", 300),
                (10000, 500, "cH", 300),
                (10000, 1000, "cI", 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            # csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            hdrFilename = "hdr_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"

            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            hdrPathname = SYNDATASETS_DIR + "/" + hdrFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE train****************************************
            h2o.beta_features = False  # turn off beta_features
            start = time.time()
            xList = []
            eList = []
            fList = []

            modelKey = "GBMModelKey"

            # Parse (train)****************************************
            if h2o.beta_features:
                print "Parsing to fvec directly! Have to noPoll=true!, and doSummary=False!"
            parseTrainResult = h2i.import_parse(
                bucket=None,
                path=csvPathname,
                schema="put",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                noPoll=h2o.beta_features,
                doSummary=False,
            )
            # hack
            if h2o.beta_features:
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                print "Filling in the parseTrainResult['destination_key'] for h2o"
                parseTrainResult["destination_key"] = trainKey

            elapsed = time.time() - start
            print "train parse end on ", csvPathname, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "train parse result:", parseTrainResult["destination_key"]

            # Logging to a benchmark file
            algo = "Parse"
            l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed
            )
            print l
            h2o.cloudPerfH2O.message(l)
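            # logs a line like: "2 jvms, 4GB heap, Parse syn_binary_10000x100.csv  12.34 secs"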

            # if you set beta_features here, the fvec translate will happen with the Inspect not the GBM
            # h2o.beta_features = True
            inspect = h2o_cmd.runInspect(key=parseTrainResult["destination_key"])
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])
            num_rows = inspect["num_rows"]
            num_cols = inspect["num_cols"]
            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            # GBM(train iterate)****************************************
            ntrees = 5
            prefixList = ["A", "B", "C", "D", "E", "F", "G", "H"]
            # for max_depth in [5,10,20,40]:
            for max_depth in [5, 10, 20]:

                # PARSE a new header****************************************
                print "Creating new header", hdrPathname
                prefix = prefixList.pop(0)
                write_syn_header(hdrPathname, rowCount, colCount, prefix)

                # upload and parse the header to a hex

                h2o.beta_features = False  # can't put with fvec yet
                hdr_hex_key = prefix + "_hdr.hex"
                parseHdrResult = h2i.import_parse(
                    bucket=None,
                    path=hdrPathname,
                    schema="put",
                    header=1,  # REQUIRED! otherwise will interpret as enums
                    hex_key=hdr_hex_key,
                    timeoutSecs=timeoutSecs,
                    noPoll=h2o.beta_features,
                    doSummary=False,
                )
                # Set Column Names (before autoframe is created)
                h2o.nodes[0].set_column_names(target=hex_key, copy_from=hdr_hex_key)

                # GBM
                print "The response col name is changing each iteration, since we're parsing a new header"
                params = {
                    "learn_rate": 0.2,
                    "nbins": 1024,
                    "ntrees": ntrees,
                    "max_depth": max_depth,
                    "min_rows": 10,
                    "response": prefix + "_response",
                    "ignored_cols_by_name": None,
                }

                print "Using these parameters for GBM: ", params
                kwargs = params.copy()
                h2o.beta_features = True

                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(
                    parseResult=parseTrainResult,
                    noPoll=h2o.beta_features,
                    timeoutSecs=timeoutSecs,
                    destination_key=modelKey,
                    **kwargs
                )
                # hack
                if h2o.beta_features:
                    h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", csvPathname

                # Logging to a benchmark file
                algo = "GBM " + " ntrees=" + str(ntrees) + " max_depth=" + str(max_depth)
                l = "{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, trainElapsed
                )
                print l
                h2o.cloudPerfH2O.message(l)

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView["gbm_model"]["errs"][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView["gbm_model"]["cms"][-1]["_arr"]  # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrongTrain)
                fList.append(trainElapsed)

                # works if you delete the autoframe
                ### h2o_import.delete_keys_at_all_nodes(pattern='autoframe')

        h2o.beta_features = False
        # just plot the last one
        if DO_PLOT:
            xLabel = "max_depth"
            eLabel = "pctWrong"
            fLabel = "trainElapsed"
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
    def test_GBM_manyfiles_train_test(self):
        h2o.beta_features = True
        bucket = 'home-0xdiag-datasets'
        modelKey = 'GBMModelKey'
        if localhost:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                # problems with categoricals not in the train data set? (warnings in h2o stdout)
                ## ('manyfiles-nflx-gz', 'file_1.dat.gz', 'file_1.hex', 1800, None, 'file_11.dat.gz', 'test.hex')
                # just use matching
                ('manyfiles-nflx-gz', 'file_1.dat.gz', 'train.hex', 1800, None, 'file_1.dat.gz', 'test.hex')
                ]
        else:
            files = [
                # None forces numCols to be used. assumes you set it from Inspect
                ('manyfiles-nflx-gz', 'file_[0-9].dat.gz', 'train.hex', 1800, None, 'file_1[0-9].dat.gz', 'test.hex')
                ]

        # if I go to hdfs, it's here
        # hdfs://192.168.1.176/datasets/manyfiles-nflx-gz/file_99.dat.gz

        h2b.browseTheCloud()
        for (importFolderPath, trainFilename, trainKey, timeoutSecs, response, testFilename, testKey) in files:
            # PARSE train****************************************
            start = time.time()
            xList = []
            eList = []
            fList = []

            # Parse (train)****************************************
            csvPathname = importFolderPath + "/" + trainFilename
            parseTrainResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=trainKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "train parse end on ", trainFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "train parse result:", parseTrainResult['destination_key']

            ### h2o_cmd.runSummary(key=parseTrainResult['destination_key'])

            inspect = h2o_cmd.runInspect(key=parseTrainResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Make col 378 into something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (trainKey, trainKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)
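            # C379 is now a 0/1 indicator (1 where the original value exceeded 15),
            # which is what makes a binomial response possible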

            # Parse (test)****************************************
            parseTestResult = h2i.import_parse(bucket=bucket, path=importFolderPath + "/" + testFilename, schema='local',
                hex_key=testKey, timeoutSecs=timeoutSecs, doSummary=False)
            elapsed = time.time() - start
            print "test parse end on ", testFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "test parse result:", parseTestResult['destination_key']

            # Make col 378 into something we can do binomial regression on!
            execExpr = '%s[,378+1]=%s[,378+1]>15' % (testKey, testKey)
            resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=60)

            # Note ..no inspect of test data here..so translate happens later?

            # GBM (train iterate)****************************************
            # if not response:
            #     response = numCols - 1
            # response = 378
            response = 379  # 1-based column number; its name is 'C379' after the exec above

            # randomly ignore a bunch of cols, just to make it go faster
            x = range(1, numCols + 1)  # 1-based column numbers, matching H2O's C1..CN names
            x.remove(response)
            ignored_cols_by_name = ",".join(map(lambda x: 'C' + str(x), random.sample(x, 300)))

            print "Using the same response %s for train and test (which should have a output value too)" % response

            ntrees = 10
            # ignore 300 random cols (not the response)
            for max_depth in [5, 40]:
                params = {
                    'learn_rate': .2,
                    'nbins': 1024,
                    'ntrees': ntrees,
                    'max_depth': max_depth,
                    'min_rows': 10,
                    'response': 'C' + str(response),
                    'ignored_cols_by_name': ignored_cols_by_name,
                }

                ### print "Using these parameters for GBM: ", params
                kwargs = params.copy()

                # GBM train****************************************
                trainStart = time.time()
                gbmTrainResult = h2o_cmd.runGBM(parseResult=parseTrainResult,
                    timeoutSecs=timeoutSecs, destination_key=modelKey, **kwargs)
                # hack
                trainElapsed = time.time() - trainStart
                print "GBM training completed in", trainElapsed, "seconds. On dataset: ", trainFilename

                gbmTrainView = h2o_cmd.runGBMView(model_key=modelKey)
                # errs from end of list? is that the last tree?
                errsLast = gbmTrainView['gbm_model']['errs'][-1]
                print "GBM 'errsLast'", errsLast

                cm = gbmTrainView['gbm_model']['cms'][-1]['_arr'] # use the last one
                pctWrongTrain = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm might be NAs, not CM"
                print "\nTrain\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # GBM test****************************************
                predictKey = 'Predict.hex'
                ### h2o_cmd.runInspect(key=parseTestResult['destination_key'])
                start = time.time()
                gbmTestResult = h2o_cmd.runPredict(
                    data_key=parseTestResult['destination_key'], 
                    model_key=modelKey,
                    destination_key=predictKey,
                    timeoutSecs=timeoutSecs)
                elapsed = time.time() - start
                print "GBM predict completed in", elapsed, "seconds. On dataset: ", testFilename

                gbmPredictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=parseTestResult['destination_key'],
                    vactual='C' + str(response),
                    predict=predictKey,
                    vpredict='predict', # choices are 0 and 'predict'
                    )

                # errs from end of list? is that the last tree?
                # all we get is cm
                cm = gbmPredictCMResult['cm']

                # These will move into the h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                print "Last line of this cm is really NAs, not CM"
                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                # xList.append(ntrees)
                xList.append(max_depth)
                eList.append(pctWrong)
                fList.append(trainElapsed)

            xLabel = 'max_depth'
            eLabel = 'pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
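# pp_cm_summary isn't shown in this excerpt; the pctWrong it returns is presumably
# off-diagonal mass over total. A hypothetical sketch (not the real h2o_gbm code),
# assuming a square list-of-lists of counts with any trailing NA row already dropped:
def pct_wrong_sketch(cm):
    total = sum(sum(row) for row in cm)
    right = sum(cm[i][i] for i in range(len(cm)))  # diagonal = correctly classified
    return 100.0 * (total - right) / total

# e.g. the cm fragment quoted below, [[3621, 1399], [1515, 3465]], gives 29.14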
# Example #50
    def test_rf_covtype_fvec(self):
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        hex_key = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=hex_key, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

        rfViewInitial = []
        xList = []
        eList = []
        fList = []
        trial = 0

        depthList  = [10, 20, 30, 40]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList  = [10, 100, 1000]

        if TRY == 'max_depth':
            tryList = depthList
        elif TRY == 'ntrees':
            tryList = ntreesList
        elif TRY == 'nbins':
            tryList = nbinsList
        else:
            raise Exception("huh? %s" % TRY)

        for d in tryList:
            if TRY == 'max_depth':
                paramDict['max_depth'] = d
            elif TRY == 'ntrees':
                paramDict['ntrees'] = d
            elif TRY == 'nbins':
                paramDict['nbins'] = d
            else:
                raise Exception("huh? %s" % TRY)

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            trial += 1
            paramDict['destination_key'] = 'RFModel_' + str(trial)
            if DO_OOBE:
                paramDict['validation'] = None
            else:
                paramDict['validation'] = parseTestResult['destination_key']

            kwargs = paramDict.copy()
            timeoutSecs = 30 + kwargs['ntrees'] * 200

            start = time.time()
            rfFirstResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, noPoll=True, rfView=False, **kwargs)
            # print h2o.dump_json(rfFirstResult)
            # FIX! are these already in there?
            rfView = {}
            rfView['data_key'] = hex_key
            rfView['model_key'] = kwargs['destination_key']
            rfView['ntrees'] = kwargs['ntrees']

            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5)
            trainElapsed = time.time() - start
            print "rf train end on ", csvTrainPathname, 'took', trainElapsed, 'seconds'
            ### print "rfView", h2o.dump_json(rfView)
            data_key = rfView['data_key']
            model_key = rfView['model_key']
            ntrees = rfView['ntrees']

            rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=False, doSimpleCheck=False)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False)
            ## print "rfView:", h2o.dump_json(rfView)

            # "N":1,
            # "errs":[0.25,0.1682814508676529],
            # "testKey":"syn_binary_10000x10.hex",
            # "cm":[[3621,1399],[1515,3465]]}}

            rf_model = rfView['drf_model']
            cm = rf_model['cm']
            ### print "cm:", h2o.dump_json(cm)
            ntrees = rf_model['N']
            errs = rf_model['errs']
            N = rf_model['N']
            varimp = rf_model['varimp']
            treeStats = rf_model['treeStats']

            print "maxDepth:", treeStats['maxDepth']
            print "maxLeaves:", treeStats['maxLeaves']
            print "minDepth:", treeStats['minDepth']
            print "minLeaves:", treeStats['minLeaves']
            print "meanLeaves:", treeStats['meanLeaves']
            print "meanDepth:", treeStats['meanDepth']
            print "errs[0]:", errs[0]
            print "errs[-1]:", errs[-1]
            print "errs:", errs

            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRF2View(rfv=rfView)

            # "treeStats": {
            #     "maxDepth": 15, 
            #     "maxLeaves": 1715, 
            #     "meanDepth": 15, 
            #     "meanLeaves": 421, 
            #     "minDepth": 15, 
            #     "minLeaves": 5
            # }, 
            # "varimp": null

            # FIX! should update this expected classification error
            ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
            ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

            # xList.append(ntrees)
            # paramDict['max_depth']
            # paramDict['nbins']
            # paramDict['mtries']
            # paramDict['ntrees']
            # eList.append(errs[-1])
            eList.append(classErrorPctList[4])
            fList.append(trainElapsed)
            if DO_PLOT:
                if TRY == 'max_depth':
                    xLabel = 'max_depth'
                elif TRY == 'ntrees':
                    xLabel = 'ntrees'
                elif TRY == 'nbins':
                    xLabel = 'nbins'
                else:
                    raise Exception("huh? %s" % TRY)
                xList.append(paramDict[xLabel])

        if DO_PLOT:
            eLabel = 'class 4 pctWrong'
            fLabel = 'trainElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)