def test_parse_manyfile_hack(self):
        """Import the nflx gz files one by one, then parse them as one frame.

        The whole sequence runs twice. Each pass imports file_10 .. file_60
        individually, checks that every import reported exactly one key and
        one file with no fails or dels, parses all collected keys in a single
        call and asserts the dimensions of the resulting frame.
        """
        folder = "/home/0xdiag/datasets/manyfiles-nflx-gz"
        maxi = 50
        timeoutSecs = 800

        for _ in range(2):
            importList = []
            # 4-9 don't exist?
            # NOTE(review): this range yields maxi + 1 files while the row
            # check below multiplies by maxi -- confirm which one is intended.
            for fileNum in range(10, 10 + maxi + 1):
                gzPath = "%s/file_%s.dat.gz" % (folder, fileNum)
                importResult = h2o.n0.import_files(path=gzPath)

                # just 1!
                import_key = importResult['keys'][0]
                assert len(importResult['keys']) == 1
                assert len(importResult['files']) == 1
                assert len(importResult['fails']) == 0
                assert len(importResult['dels']) == 0
                importList.append(import_key)

            # one parse call over every imported key
            parseResult = h2o.n0.parse(key=importList, timeoutSecs=timeoutSecs)
            _, _, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspectResult = h2o_cmd.runInspect(key=parse_key)
            _, _, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

            assert numRows == (maxi * 100000)
            assert numCols == 542
示例#2
0
    def test_parse_rand_utf8(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        print "HACK: reduce rows to 10 for debug"
        tryList = [
            # do two cols to detect bad eol behavior
            (10, 2, 'cA', 120),
            (10, 2, 'cG', 120),
            (10, 2, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', check_header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "parseResult:", dump_json(parseResult)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            assert len(missingList) == 0
            # FIX! check type?
        
            # print "inspect:", h2o.dump_json(inspect)
            self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
            self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
示例#3
0
    def test_parse_rand_utf8(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        print "HACK: reduce rows to 10 for debug"
        tryList = [
            # do two cols to detect bad eol behavior
            (10, 2, 'cA', 120),
            (10, 2, 'cG', 120),
            (10, 2, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "parseResult:", dump_json(parseResult)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            assert len(missingList) == 0
            # FIX! check type?
        
            # print "inspect:", h2o.dump_json(inspect)
            self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
            self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
示例#4
0
    def test_kmeans_benign(self):
        """Build a K=4 KMeans model on smalldata logreg/benign.csv five times.

        The csv is parsed once into benign.hex. Each trial draws a new seed,
        builds the model under the same destination key, fetches it back and
        passes it to h2o_kmeans.simpleCheckKMeans together with the frame
        dimensions and column labels.
        """
        hex_key = "benign.hex"
        model_key = 'benign_k.hex'

        parseResult = h2i.import_parse(
            bucket='smalldata',
            path="logreg/benign.csv",
            hex_key=hex_key,
            checkHeader=1,
            timeoutSecs=180,
            doSummary=False)
        _, _, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        # reference tuples; nothing in this method compares against them
        expected = [
            ([8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None),
            ([33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None),
            ([27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None),
            ([26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)

        # no cols ignored, so the checker sees every parsed column
        labelListUsed = list(labelList)
        numColsUsed = numCols

        # loop, to see if we get same centers
        for trial in range(5):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'score_each_iteration': False,
                'K': 4,
                'max_iters': 50,
                'normalize': False,
                'seed': kmeansSeed,
                'init': 'PlusPlus',
            }

            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)

            modelResult = h2o.n0.models(key=model_key)

            # this prints too
            checked = h2o_kmeans.simpleCheckKMeans(
                self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
            tuplesSorted, iters, mse, names = checked

            h2o_cmd.runStoreView()

            # zip with * is it's own inverse here. It's sorted by centers for easy comparisons
            ids, mses, rows, clusters = zip(*tuplesSorted)
    def test_parse_manyfile_hack(self):
        """Import the nflx gz files one by one, then parse them as one frame.

        The whole sequence runs twice. Each pass imports file_10 .. file_60
        individually, checks that every import reported exactly one key and
        one file with no fails or dels, parses all collected keys in a single
        call and asserts the dimensions of the resulting frame.
        """
        folder = "/home/0xdiag/datasets/manyfiles-nflx-gz"
        maxi = 50
        timeoutSecs = 800

        for _ in range(2):
            importList = []
            # 4-9 don't exist?
            # NOTE(review): this range yields maxi + 1 files while the row
            # check below multiplies by maxi -- confirm which one is intended.
            for fileNum in range(10, 10 + maxi + 1):
                gzPath = "%s/file_%s.dat.gz" % (folder, fileNum)
                importResult = h2o.n0.import_files(path=gzPath)

                # just 1!
                import_key = importResult['keys'][0]
                assert len(importResult['keys']) == 1
                assert len(importResult['files']) == 1
                assert len(importResult['fails']) == 0
                assert len(importResult['dels']) == 0
                importList.append(import_key)

            # one parse call over every imported key
            parseResult = h2o.n0.parse(key=importList, timeoutSecs=timeoutSecs)
            _, _, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspectResult = h2o_cmd.runInspect(key=parse_key)
            _, _, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

            assert numRows == (maxi * 100000)
            assert numCols == 542
示例#6
0
    def test_GBM_covtype_train_test(self):
        """Train a 2-tree GBM on the 90pct shuffled covtype data and score it.

        The training file is parsed into covtype.train.hex, a classification
        model on response column C55 is built, and the model, its computed
        and stored metrics, and predictions on the same frame are fetched and
        wrapped in OutputObj.
        """
        train_key = 'covtype.train.hex'
        # destination key of the built model
        model_key = 'benign_gbm.hex'
        parseTimeoutSecs = 1800
        callTimeoutSecs = 60

        parseResult = h2i.import_parse(
            bucket='home-0xdiag-datasets',
            path='standard/covtype.shuffled.90pct.data',
            schema='local',
            hex_key=train_key,
            timeoutSecs=parseTimeoutSecs)
        _, _, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': None,
            'score_each_iteration': True,
            'response_column': 'C55',
            'do_classification': True,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            'variable_importance': False,
            # 'seed':
        }

        # build, then wrap each response for inspection
        bmResult = h2o.n0.build_model(
            algo='gbm',
            destination_key=model_key,
            training_frame=parse_key,
            parameters=parameters,
            timeoutSecs=callTimeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        modelOutput = modelResult['models'][0]['output']
        model = OutputObj(modelOutput, 'model')

        cmmResult = h2o.n0.compute_model_metrics(
            model=model_key, frame=parse_key, timeoutSecs=callTimeoutSecs)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(
            model=model_key, frame=parse_key, timeoutSecs=callTimeoutSecs)
        mm = OutputObj(mmResult, 'mm')

        prResult = h2o.n0.predict(
            model=model_key, frame=parse_key, timeoutSecs=callTimeoutSecs)
        predictions = prResult['model_metrics'][0]['predictions']
        pr = OutputObj(predictions, 'pr')
    def test_parse_syn_gz_cat(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # summary fails with 100000 cols
            # overwrite the key each time to save space?
            (100, 100, 'cF', 600),
            (100, 5000, 'cF', 600),
            (100, 10000, 'cF', 600),
            # (100, 12000, 'cF', 600),
            # (100, 15000, 'cF', 600),
            # (100, 17000, 'cF', 600),
            (100, 20000, 'cF', 600),
            (100, 40000, 'cF', 600),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            csvFilenamegz = csvFilename + ".gz"
            csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
            h2o_util.file_gzip(csvPathname, csvPathnamegz)

            start = time.time()
            print "Parse start:", csvPathnamegz
            parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            if DOSUMMARY:
                algo = "Parse and Summary:"
            else:
                algo = "Parse:"
            print algo , parse_key, "took", time.time() - start, "seconds"

            print "Inspecting.."
            start = time.time()
            inspect = h2o_cmd.runInspect(key=parse_key, timeoutSecs=timeoutSecs)
            print "Inspect:", parse_key, "took", time.time() - start, "seconds"
            
            missingValuesList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvPathnamegz, \
                "\n    numRows:", "{:,}".format(numRows), \
                "\n    numCols:", "{:,}".format(numCols)

            self.assertEqual(len(missingValuesList), 0, 
                "Don't expect any missing values. These cols had some: %s" % missingValuesList)
            # should match # of cols in header or ??
            self.assertEqual(numCols, colCount,
                "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
            self.assertEqual(numRows, rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (numRows, rowCount))
示例#8
0
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000 * 2
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "many_many"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount),
                                     random.randint(1, iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice

                cutExprList = []

                pKey = Key('p')
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:, i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else:
                        cutExpr = ce

                print "cutExpr:", cutExpr

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1, 2, 3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))

            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                if 1 == 1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."

                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(
                        inspect)

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel,
                              fListTitle, fList, fLabel)
    def test_exec2_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            (n, 10, 9, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings

                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    # celChoice = str(random.choice(range(len(cel))))
                    celChoice = random.choice(range(len(cel)))
                    cutValue[c] = celChoice
    
                cutExprList = []

                pKey = Key('p')
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # cutExprList.append('p$C'+str(i+1)+'=='+c)
                        # all column indexing in h2o-dev is with number
                        e = Fcn('==', c, pKey[:,i])
                        cutExprList.append(e)

                cutExpr = None
                for ce in cutExprList:
                    if cutExpr:
                        cutExpr = Fcn('&', cutExpr, ce)
                    else: 
                        cutExpr = ce

                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                hKey = Key(hex_key)
                rowExpr = hKey[cutExpr, :]

                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)


            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            # print h2o.dump_json(inspect)

            # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            #    h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

            # error if any col has constant values
            # if len(constantValuesDict) != 0:
            #    raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # build up the columns
            Assign('b', [1,2,3])
            # could also append 1 col at a time, by assigning to the next col number?
            Assign('a', Cbind(['b' for i in range(colCount)]))
            
            for eKey in eKeys:
                Assign(eKey, 'a')
                ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==1:
                    start = time.time()
                    Assign(fKey, random.choice(rowExprList)).do()
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                    inspect = h2o_cmd.runInspect(key=fKey)
                    missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # FIX! put quantile back in?
                quantileTime = 0

                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
示例#11
0
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
                        start = time.time()
                        execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                        fpResult = execResult['scalar']
                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        print r, 'exec took', time.time() - start, 'seconds'
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult!=expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % (ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
示例#12
0
    def test_GLM_basic_1(self):
        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, checkHeader=1, 
            timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': '[ID]',
                'score_each_iteration': True,
                'response_column': 'CAPSULE',
                # FIX! when is this needed? redundant for binomial?
                'do_classification': True,
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial', 
                'link': None, 
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'higher_accuracy': True,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }


            model_key = 'prostate_glm.hex'
            glmResult = h2o.n0.build_model(
                algo='glm', 
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters, 
                timeoutSecs=10) 

            gr = self.GLMOutput(glmResult)
            for k,v in gr:
                if k != 'parameters':
                    print "gr", k, dump_json(v)

            modelResult = h2o.n0.models(key=model_key)

            mr = self.GLMOutput(modelResult['models'][0]['output'])
            for k,v in mr:
                if k != 'parameters':
                    print "mr", k, dump_json(v)

            cmmResult = h2o.n0.compute_model_metrics(
                model=model_key, 
                frame=parse_key, 
                timeoutSecs=60)

            print "cmmResult", dump_json(cmmResult)

            mmResult = h2o.n0.model_metrics(
                model=model_key, 
                frame=parse_key, 
                timeoutSecs=60)
    
            print "mmResult", dump_json(mmResult)

            # this prints too
            # tuplesSorted, iters, mse, names = \
            #    h2o_glm.simpleCheckGLM(self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
            
            h2o_cmd.runStoreView()
示例#13
0
    def test_w2v_basic_2(self):
        """Build a one-epoch CBOW word2vec model on a series of synthetic files.

        For each (rowCount, colCount) pair a file is produced by
        create_file_with_seps, parsed and inspected, then used as both training
        and validation frame for a word2vec build. The model, its metrics and
        its predictions are fetched and wrapped in OutputObj. The test has no
        assertions of its own.
        """
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 100
        # (rowCount, colCount, hex_key, timeoutSecs); the 1-column case
        # (n, 1, 'cD', 300) is disabled, and there is no 8-column case
        tryList = [(n, cols, key, 300) for (cols, key) in [
            (2, 'cE'),
            (3, 'cF'),
            (4, 'cG'),
            (5, 'cH'),
            (6, 'cI'),
            (7, 'cJ'),
            (9, 'cK'),
        ]]

        model_key = 'benign_w2v.hex'

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)
            # the key from tryList is deliberately not passed to the parse
            hex_key = "not_used.hex"

            # just parse to make sure it's good
            parseResult = h2i.import_parse(
                path=csvPathname, checkHeader=1, delete_on_done = 0,
                timeoutSecs=180, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspectResult = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

            src_key = h2i.find_key('syn_.*csv')

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols

            for trial in range(1):
                # trailing comments give each parameter's type and default
                parameters = {
                    'validation_frame': parse_key, # Frame False []
                    'ignored_columns': None, # string[] None []
                    'score_each_iteration': None, # boolean false []

                    'minWordFreq': 1, # int 5 []
                    'wordModel': 'CBOW', # enum [u'CBOW', u'SkipGram']
                    'normModel': 'NegSampling', # enum # [u'HSM', u'NegSampling']
                    'negSampleCnt': 1, # int 5 []
                    'vecSize': 10, # int 100
                    'windowSize': 2, # int 5
                    'sentSampleRate': 0.001, # float 0.001
                    'initLearningRate': 0.05, # float 0.05
                    'epochs': 1, # int 5
                }

                bmResult = h2o.n0.build_model(
                    algo='word2vec',
                    destination_key=model_key,
                    training_frame=parse_key,
                    parameters=parameters,
                    timeoutSecs=10)
                bm = OutputObj(bmResult, 'bm')

                # stored model
                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                # metrics: computed first, then fetched
                cmmResult = h2o.n0.compute_model_metrics(
                    model=model_key, frame=parse_key, timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(
                    model=model_key, frame=parse_key, timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                # predictions on the training frame
                prResult = h2o.n0.predict(
                    model=model_key, frame=parse_key, timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

                h2o_cmd.runStoreView()
    def test_rapids_overloaded_opr(self):
        """Exercise the rapids wrappers as explicit calls, then through overloaded operators.

        A random synthetic csv is written and parsed, and the parsed shape is
        checked against the requested rowCount/colCount. Keys 's1'..'s3' are
        then built with Assign/Fcn/Col/Seq/Colon and the shapes of 's2' and 's3'
        are asserted. Finally the same style of row/column selection is done on
        the parsed frame with Key objects, indexing and `<<=`.
        """
        # NOTE(review): another method with this same name is defined further down
        # in this file; if both end up in the same class the later one wins.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, hex_key, timeoutSecs)
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
        ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            # file name is syn_<seed>_<rows>x<cols>.csv
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=hex_key)
            # numRows/numCols from the inspect replace the ones from the parse result
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(
                numCols, colCount,
                "parse created result with the wrong number of cols %s %s" %
                (numCols, colCount))
            self.assertEqual(
                numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" %
                (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                # NOTE(review): this value is never read; result_key is rebound to Key()
                # below. data_key is also rebound to a Key further down, so with
                # REPEAT > 1 this concatenation would see a Key, not a string - confirm.
                result_key = data_key + "_" + str(i)
                # The Assign objects are discarded, so any effect must happen on
                # construction - presumably the expression is sent to h2o; confirm.
                Assign('s1', Seq(range(5)))

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5))))

                # just combine
                Assign('s3', Col(Seq(range(5))))

                # 's3' must be a single column holding the 5 sequence values
                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # 's2' is assigned again, this time with Col instead of Fcn('c', ...)
                Assign('s2', Col(Seq(range(5))))

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                assert numRows == 5
                assert numCols == 1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn(
                    'c',
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s1', f)

                f = Col(
                    Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10),
                        range(50, 52)))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
                    inspect)
                # 313 = 302 + 1 + 1 + 4 + 3 + 2, which is consistent with
                # Colon(99, 400) contributing 99..400 inclusive
                assert numRows == 313
                assert numCols == 1

                print "Now trying to do the functions with the alternate overloaded operators"
                data_key = Key(parse_key)
                result_key = Key()
                # what triggers immediate operation at h2o
                # as opposed to an object within a function

                # Each pair below sets the destination frame name and then applies
                # `<<=` with a row/column selection of the parsed frame. 'a1'-'a3'
                # and 'a4'-'a5' repeat identical expressions under different names.
                result_key.frame = 'a1'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a2'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a3'
                result_key <<= data_key[Seq(range(1, 4)), :]
                result_key.frame = 'a4'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]
                result_key.frame = 'a5'
                result_key <<= data_key[Seq(range(1, 4)), 0:1]

                # plain python list for the rows and a single int for the column
                result_key.frame = 'a6'
                result_key <<= data_key[[1, 2, 3], 1]

                # NOTE(review): numRows/numCols were last set by the inspect of 's2'
                # above, so this prints that shape next to the csv path, not the
                # shape of the parsed file.
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
    def test_rapids_overloaded_opr(self):
        """Exercise the rapids wrappers as explicit calls, then through overloaded operators.

        A random synthetic csv is written and parsed, and the parsed shape is
        checked against the requested rowCount/colCount. Keys 's1'..'s3' are
        then built with Assign/Fcn/Col/Seq/Colon and the shapes of 's2' and 's3'
        are asserted. Finally the same style of row/column selection is done on
        the parsed frame with Key objects, indexing and `<<=`.
        """
        # NOTE(review): a method with this same name is also defined earlier in
        # this file; if both end up in the same class this later one wins.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # (rowCount, colCount, hex_key, timeoutSecs)
        tryList = [
            # (1000000, 5, 'cA', 200),
            (1000, 5, 'cA', 200),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            # file name is syn_<seed>_<rows>x<cols>.csv
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)

            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspect = h2o_cmd.runInspect(key=hex_key)
            # numRows/numCols from the inspect replace the ones from the parse result
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # should match # of cols in header or ??
            self.assertEqual(numCols, colCount,
                "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
            self.assertEqual(numRows, rowCount,
                "parse created result with the wrong number of rows %s %s" % (numRows, rowCount))

            # Xbase.debugOnly = True

            REPEAT = 1
            data_key = hex_key
            for i in range(REPEAT):
                # NOTE(review): this value is never read; result_key is rebound to Key()
                # below. data_key is also rebound to a Key further down, so with
                # REPEAT > 1 this concatenation would see a Key, not a string - confirm.
                result_key = data_key + "_" + str(i)
                # The Assign objects are discarded, so any effect must happen on
                # construction - presumably the expression is sent to h2o; confirm.
                Assign('s1', Seq(range(5)) )

                # take advantage of default params for row/col (None)
                # need the 'c' function, to make sure the key is created

                # first try as object, then method
                Assign('s2', Fcn('c', Seq(range(5)) ))

                # just combine
                Assign('s3', Col(Seq(range(5)) ))

                # 's3' must be a single column holding the 5 sequence values
                inspect = h2o_cmd.runInspect(key='s3')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert numRows==5
                assert numCols==1

                # 's2' is assigned again, this time with Col instead of Fcn('c', ...)
                Assign('s2', Col(Seq(range(5))) )

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert numRows==5
                assert numCols==1

                # can't have sequence of sequences?
                # make sure key is created with c()
                f = Fcn('c', Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) ))
                Assign('s1', f)

                f = Col(Seq(Colon(99,400), "#2", 1, range(1,5), range(7,10), range(50,52) ))
                Assign('s2', f)

                inspect = h2o_cmd.runInspect(key='s2')
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                # 313 = 302 + 1 + 1 + 4 + 3 + 2, which is consistent with
                # Colon(99,400) contributing 99..400 inclusive
                assert numRows==313
                assert numCols==1

                print "Now trying to do the functions with the alternate overloaded operators"
                data_key = Key(parse_key)
                result_key = Key()
                # what triggers immediate operation at h2o
                # as opposed to an object within a function

                # Each pair below sets the destination frame name and then applies
                # `<<=` with a row/column selection of the parsed frame. 'a1'-'a3'
                # and 'a4'-'a5' repeat identical expressions under different names.
                result_key.frame = 'a1'
                result_key <<= data_key[Seq(range(1,4)), :]
                result_key.frame = 'a2'
                result_key <<= data_key[Seq(range(1,4)), :]
                result_key.frame = 'a3'
                result_key <<= data_key[Seq(range(1,4)), :]
                result_key.frame = 'a4'
                result_key <<= data_key[Seq(range(1,4)), 0:1]
                result_key.frame = 'a5'
                result_key <<= data_key[Seq(range(1,4)), 0:1]

                # plain python list for the rows and a single int for the column
                result_key.frame = 'a6'
                result_key <<= data_key[[1,2,3], 1]

                # NOTE(review): numRows/numCols were last set by the inspect of 's2'
                # above, so this prints that shape next to the csv path, not the
                # shape of the parsed file.
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(numRows), \
                    "    numCols:", "{:,}".format(numCols)
示例#16
0
    def test_hdfs_cdh5(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 1200),
            ("prostate_long_1G.csv", 200),
            ("airlines_all.csv", 1200),
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            inspectResult = h2o_cmd.runInspect(key=parse_key)
            missingValuesListA, labelListA, numRowsA, numColsA  = h2o_cmd.infoFromInspect(inspectResult)

            if DO_EXPORT:
                start = time.time()
                print "Saving", csvFilename, 'to HDFS'
                print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
                print "Unique per-user to avoid permission issues"
                username = getpass.getuser()
                csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
                # reuse the file name to avoid running out of space
                csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

                path = "hdfs://"+ h2o.nodes[0].hdfs_name_node + "/" + csvPathname
                h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
                print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
                trial += 1

                print "Re-Loading", csvFilename, 'from HDFS'
                start = time.time()
                hex_key = "a2.hex"
                time.sleep(2)
                d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
                print h2o.dump_json(d)
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
                print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
示例#17
0
    def test_kmeans_prostate(self):
        """Run K=3 kmeans on smalldata logreg/prostate.csv five times with random seeds.

        The ID column is ignored. Each trial builds the model with a fresh
        random seed, fetches it back and passes it to
        h2o_kmeans.simpleCheckKMeans, whose sorted tuples are then unpacked
        into four parallel sequences.
        """
        csvPathname = "logreg" + "/" + "prostate.csv"
        hex_key = "prostate.hex"
        model_key = 'prostate_k.hex'

        parseResult = h2i.import_parse(
            bucket='smalldata', path=csvPathname, hex_key=hex_key,
            checkHeader=1, timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        # loop, to see if we get same centers

        # reference tuples; not compared against anything in this test
        expected = [
            ([0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31], 215, 36955),
            ([0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31], 136, 46045),
            ([0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31], 29, 33412),
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # the model ignores ID, so the checker gets the labels and count without it
        labelListUsed = list(labelList)
        labelListUsed.remove('ID')
        numColsUsed = numCols - 1

        for trial in range(5):
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit
            # kmeansSeed = 6655548259421773879
            kmeansSeed = random.randint(0, sys.maxint)
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': '[ID]',
                'score_each_iteration': False,
                'K': 3,
                'max_iters': 500,
                'normalize': False,
                'seed': kmeansSeed,
                'init': 'PlusPlus',
            }

            kmeansResult = h2o.n0.build_model(
                algo='kmeans',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)

            modelResult = h2o.n0.models(key=model_key)

            h2o_cmd.runStoreView()

            checked = h2o_kmeans.simpleCheckKMeans(
                self, modelResult, parameters, numRows, numColsUsed, labelListUsed)
            tuplesSorted, iters, mse, names = checked
            # transpose the per-cluster tuples into four parallel sequences
            ids, mses, rows, clusters = zip(*tuplesSorted)
示例#18
0
    def test_DL_basic(self):
        """Build a deeplearning model on smalldata logreg/benign.csv and fetch model, metrics and predictions.

        The parsed frame is both training and validation frame, FNDX is the
        response and the STR column is ignored. Every other parameter is passed
        as None. The test has no assertions of its own; it fails only if one of
        the calls it makes raises.
        """
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key,
            checkHeader=1, timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        # not read anywhere in this test
        expected = []
        allowedDelta = 0

        # STR is ignored by the model, so drop it from the used labels and count
        labelListUsed = list(labelList)
        labelListUsed.remove('STR')
        numColsUsed = numCols - 1
        for trial in range(1):
            # Trailing comments give each parameter's type and default.
            # Each key appears exactly once: 'balance_classes' and
            # 'max_after_balance_size' used to be listed twice, and in a dict
            # literal the later entry silently overrides the earlier one.
            # NOTE(review): presumably None means "leave at the server default" -
            # confirm against build_model.
            parameters = {
                'validation_frame': parse_key, # Frame None
                'ignored_columns': '[STR]', # string[] None
                'score_each_iteration': None, # boolean false
                'response_column': 'FNDX', # string None
                'do_classification': None, # boolean false
                'balance_classes': None, # boolean false
                'max_after_balance_size': None, # float Infinity
                'n_folds': None, # int 0

                'keep_cross_validation_splits': None, # boolean false
                'checkpoint': None, # Key None
                'override_with_best_model': None, # boolean true
                'expert_mode': None, # boolean false
                'autoencoder': None, # boolean false
                'use_all_factor_levels': None, # boolean true
                # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
                'activation': None, # enum Rectifier
                'hidden': None, # int[] [200, 200]
                'epochs': None, # double 10.0
                'train_samples_per_iteration': None, # long -2
                'target_ratio_comm_to_comp': None, # double 0.02
                'seed': None, # long 1679194146842485659
                'adaptive_rate': None, # boolean true
                'rho': None, # double 0.99
                'epsilon': None, # double 1.0E-8
                'rate': None, # double 0.005
                'rate_annealing': None, # double 1.0E-6
                'rate_decay': None, # double 1.0
                'momentum_start': None, # double 0.0
                'momentum_ramp': None, # double 1000000.0
                'momentum_stable': None, # double 0.0
                'nesterov_accelerated_gradient': None, # boolean true
                'input_dropout_ratio': None, # double 0.0
                'hidden_dropout_ratios': None, # double[] None (this can grid?)
                'l1': None, # double 0.0
                'l2': None, # double 0.0
                'max_w2': None, # float Infinity
                'initial_weight_distribution': None, # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
                'initial_weight_scale': None, # double 1.0
                'loss': None, # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
                'score_interval': None, # double 5.0
                'score_training_samples': None, # long 10000
                'score_validation_samples': None, # long 0
                'score_duty_cycle': None, # double 0.1
                'classification_stop': None, # double 0.0
                'regression_stop': None, # double 1.0E-6
                'quiet_mode': None, # boolean false
                'max_confusion_matrix_size': None, # int 20
                'max_hit_ratio_k': None, # int 10
                'class_sampling_factors': None, # float[] None
                'score_validation_sampling': None, # enum Uniform [u'Uniform', u'Stratified']
                'diagnostics': None, # boolean true
                'variable_importances': None, # boolean false
                'fast_mode': None, # boolean true
                'ignore_const_cols': None, # boolean true
                'force_load_balance': None, # boolean true
                'replicate_training_data': None, # boolean false
                'single_node_mode': None, # boolean false
                'shuffle_training_data': None, # boolean false
                'missing_values_handling': None, # enum MeanImputation [u'Skip', u'MeanImputation']
                'sparse': None, # boolean false
                'col_major': None, # boolean false
                'average_activation': None, # double 0.0
                'sparsity_beta': None, # double 0.0
            }

            model_key = 'benign_dl.hex'
            bmResult = h2o.n0.build_model(
                algo='deeplearning',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            # stored model
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            # metrics: computed first, then fetched
            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            # predictions on the training frame
            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
示例#19
0
    def test_GLM_basic_2(self):
        """Build a binomial GLM on smalldata logreg/prostate.csv and run newSimpleCheckGLM on it.

        CAPSULE is the response and ID is ignored; the parsed frame is both
        training and validation frame. After the check, model metrics and
        predictions are fetched and wrapped in OutputObj.
        """
        csvPathname = "logreg" + "/" + "prostate.csv"
        hex_key = "prostate.hex"
        model_key = 'prostate_glm.hex'

        parseResult = h2i.import_parse(
            bucket='smalldata', path=csvPathname, hex_key=hex_key,
            checkHeader=1, timeoutSecs=180, doSummary=False)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        inspectResult = h2o_cmd.runInspect(key=parse_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult)

        # not read anywhere in this test
        expected = []
        allowedDelta = 0

        # labels handed to the checker exclude the ignored ID and the response
        labelListUsed = list(labelList)
        for dropped in ('ID', 'CAPSULE'):
            labelListUsed.remove(dropped)
        numColsUsed = numCols - 2

        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # open questions from the original author:
            #   can we do classification with probabilities?
            #   are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': '[ID]',
                'score_each_iteration': True,
                'response_column': 'CAPSULE',
                # FIX! when is this needed? redundant for binomial?
                'do_classification': True,
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'higher_accuracy': True,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }

            bmResult = h2o.n0.build_model(
                algo='glm',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            # stored model, checked against the full and the used label lists
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.newSimpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            # metrics: computed first, then fetched
            cmmResult = h2o.n0.compute_model_metrics(
                model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # the whole model_metrics response is wrapped here, not its first entry
            mmResult = h2o.n0.model_metrics(
                model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            # predictions on the training frame
            prResult = h2o.n0.predict(
                model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
示例#20
0
    def test_mixed_int_enum_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = ['abc', 'def', 'ghi']

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], False),
            # colname, (min, COLS5th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2],
             intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1],
             intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1],
             True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2],
             intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2],
             True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k, v in column.iteritems():
                    setattr(self, k, v)  # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected,
             intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, enumChoices,
                                              intChoices)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           check_header=0,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)

            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key,
                                                   column="C" + str(i + 1))
                h2o.verboseprint("summaryResult:",
                                 h2o.dump_json(summaryResult))

                # columns = summaryResult['frames'][0]['columns']
                co = Column(summaryResult)
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                ]

                coNameList = [
                    'co.base',
                    'len(co.bins)',
                    'len(co.data)',
                    'co.domain',
                    'co.label',
                    'co.maxs',
                    'co.mean',
                    'co.mins',
                    'co.missing',
                    'co.ninfs',
                    'co.pctiles',
                    'co.pinfs',
                    'co.precision',
                    'co.sigma',
                    'co.str_data',
                    'co.stride',
                    'co.type',
                    'co.zeros',
                ]

                for c, n in zip(coList, coNameList):
                    print n + ":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                    co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (
                    co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        co.type, 'enum',
                        "Expecting co.type %s to be 'enum' for %s co label  %s"
                        % (co.type, i, co.label))

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(
                        cardinality,
                        len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" %
                        (trial, cardinality, len(enumChoices)))

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices)

                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal,
                                     numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows,
                                 numRowsCreated,
                                 msg="trial %s: numRows %s should be %s" %
                                 (trial, numRows, numRowsCreated))

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        nacnt, expectedNaCnt[i],
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                        (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
示例#21
0
    def test_summary2_exp(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
# parse setup error
#            (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
#            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
#            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
#            (1000000, 1, 'A.hex', 1, 100,          ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v

        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=30, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvFilename
            # column 0?
            summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # default_pctiles
            # isText
            # rows
            # off
            # key
            # checksum

            # only one column
            columns = summaryResult['frames'][0]['columns']
            default_pctiles = summaryResult['frames'][0]['default_pctiles']
            co = Column(columns[0])
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base,
                len(co.bins),
                len(co.data),
                co.domain,
                co.label,
                co.maxs,
                co.mean,
                co.mins,
                co.missing,
                co.ninfs,
                co.pctiles,
                co.pinfs,
                co.precision,
                co.sigma,
                co.str_data,
                co.stride,
                co.type,
                co.zeros,
                ]

            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)

            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            pctiles = [0] + co.pctiles + [0]
            
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(co.label, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            print "co.label:", co.label, "co.maxs (2 places):", mx
            print "co.label:", co.label, "co.mins (2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "h2oSummary2MaxErr", maxErr
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
示例#22
0
    def test_mixed_int_enum_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = [ 'abc', 'def', 'ghi']

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False),
            # colname, (min, COLS5th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0: ], expectedList[0: ], intList[0:1], True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0: ], expectedList[0: ], intList[0:2], True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?
        
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)
            
            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i+1))
                h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

                columns = summaryResult['frames'][0]['columns']
                co = Column(columns[0])
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                    ]

                coNameList = [
                    'co.base',
                    'len(co.bins)',
                    'len(co.data)',
                    'co.domain',
                    'co.label',
                    'co.maxs',
                    'co.mean',
                    'co.mins',
                    'co.missing',
                    'co.ninfs',
                    'co.pctiles',
                    'co.pinfs',
                    'co.precision',
                    'co.sigma',
                    'co.str_data',
                    'co.stride',
                    'co.type',
                    'co.zeros',
                    ]

                

                for c,n in zip(coList, coNameList):
                    print n+":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.type, 'Enum', "trial %s: Expecting type to be Enum for %s col colname %s" % (trial, i, colname))

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(cardinality, len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices))) 

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices) 

                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows, numRowsCreated,
                    msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated))

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(nacnt, expectedNaCnt[i], 
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
示例#23
0
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
                for execExpr in exprList:
                    for r in range(10):
        
                        if 1==0:
                            execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                            fpResult = execResult['scalar']
                        else:
                            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300)
                            # print dump_json(h2o.n0.frames(key="h"))

                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult!=expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum))
                            print "h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum)

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5*ROWS, 1, 'x.hex', 1, 20000,        ['C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5*ROWS, 1, 'x.hex', -5000, 0,        ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1*ROWS, 1, 'x.hex', -1, 1,           ['C1',  -1.05, -0.48, 0.0087, 0.50, 1.00]),

            (1*ROWS, 1, 'A.hex', 1, 100,          ['C1',   1.05, 26.00, 51.00, 76.00, 100.0]),
            (1*ROWS, 1, 'A.hex', -99, 99,         ['C1',  -99, -50.0, 0, 50.00, 99]),

            (1*ROWS, 1, 'B.hex', 1, 10000,        ['C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1*ROWS, 1, 'B.hex', -100, 100,       ['C1',  -100.10, -50.0, 0.85, 51.7, 100,00]),

            (1*ROWS, 1, 'C.hex', 1, 100000,       ['C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1*ROWS, 1, 'C.hex', -101, 101,       ['C1',  -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            #***************************
            # Inspect
            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(parseResult)

            #***************************
            # Summary
            summaryResult = h2o_cmd.runSummary(key=parse_key)
            columns = summaryResult['frames'][0]['columns']
            default_pctiles = summaryResult['frames'][0]['default_pctiles']

            co = OutputObj(columns[0], 'summary')

            coList = [ co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr  = "[%s]" % ",".join(map(str,probsList))
            parameters = {
                'destination_key': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'score_each_iteration': False,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(
                algo='quantile',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            # model output {
            #   "domains": [
            #     null
            #   ],
            #   "iters": 1.0,
            #   "model_category": null,
            #   "names": [
            #     "C1"
            #   ],
            #   "quantiles": [
            #     [
            #       10009.03502345
            #     ]
            #   ],
            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iters']", model.output['iters']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][0] # why is this a double array
            iters = model.output['iters']
            assert iters == 11, iters
            print "quantiles: ", quantiles
            print "iters: ", quantiles

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                    )
            h2o.nodes[0].remove_all_keys()