Example #1
    def test_50_nongz_fvec(self):
        avgMichalSize = 237270000 * 2
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "many_many"
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*.dat", "file_18_A.dat", 18 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket,
                                                            path=csvPathname,
                                                            schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(
                importFailList)

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
Example #2
    def test_50_nongz_fvec(self):
        h2o.beta_features = True
        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
        ]

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            importFullList = importResult['files']
            importFailList = importResult['fails']
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
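
Both examples above call h2o_cmd.columnInfoFromInspect only for the column summary it prints and discard its return value. The examples below unpack that return value as a five-tuple of dicts keyed by column index. A minimal sketch of the pattern, assuming an H2O cloud is already running and parseResult holds a parse result as in the examples above (no names beyond those already used here):

    (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)
    # each dict maps column index -> info, so .keys() gives the affected column indices
    print "cols with missing values:", missingValuesDict.keys()
    print "cols with constant values:", constantValuesDict.keys()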
Example #3
File: h2o_glm.py Project: amitkumaar/h2o
def findXFromColumnInfo(key=None, keepList=None, timeoutSecs=120, noPrint=False):

    (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, 
        max_column_display=99999999, timeoutSecs=timeoutSecs)

    num_cols = len(colNameDict)
    x = range(num_cols)

    # need to walk over a copy, cause we change x
    xOrig = x[:]
    for k in xOrig:
        name = colNameDict[k]
        if name not in keepList:
            if not noPrint:
                print "Removing %d because name: %s isn't in keepList %s" % (k, name, keepList)
            x.remove(k)

    # build strX unconditionally so the noPrint=True path doesn't raise a NameError
    strX = ",".join(map(str, x))
    if not noPrint:
        print "x has", len(x), "cols"
        print "\nmatching keepList x:", strX
    return strX
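
A usage sketch for findXFromColumnInfo above, assuming a parsed key named "a.hex" whose columns include C1..C6 (the key and column names are hypothetical, chosen only for illustration):

    # returns a comma-separated string of the indices of columns whose names are in keepList
    x = findXFromColumnInfo(key='a.hex', keepList=['C1', 'C2', 'C5'])
    kwargs = {'x': x, 'y': 'C6'}  # e.g. fed into GLM-style params as the predictor list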
Example #4
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key="A.hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # output 378 can't be in this
                ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'response': 'C379',
                    'max_iter': 10,
                    'n_folds': 1,
                    'family': 'binomial',
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # convert to binomial
                # execExpr="A.hex=%s" % parseResult['destination_key']
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)

                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #5
File: h2o_gbm.py Project: hardikk/h2o
def goodXFromColumnInfo(y, 
    num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None, 
    colTypeDict=None, colNameDict=None, keepPattern=None, key=None, 
    timeoutSecs=120, forRF=False, noPrint=False):

    y = str(y)

    # if we pass a key, means we want to get the info ourselves here
    if key is not None:
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, 
            max_column_display=99999999, timeoutSecs=timeoutSecs)
        num_cols = len(colNameDict)

    # now remove any whose names don't match the required keepPattern
    if keepPattern is not None:
        keepX = re.compile(keepPattern)
    else:
        keepX = None

    x = range(num_cols)
    # need to walk over a copy, cause we change x
    xOrig = x[:]
    ignore_x = [] # for use by RF
    for k in xOrig:
        name = colNameDict[k]
        # remove it if it has the same name as the y output
        if str(k) == y: # if they pass the col index as y
            if not noPrint:
                print "Removing %d because name: %s matches output %s" % (k, str(k), y)
            x.remove(k)
            # rf doesn't want it in ignore list
            # ignore_x.append(k)
        elif name == y: # if they pass the name as y 
            if not noPrint:
                print "Removing %d because name: %s matches output %s" % (k, name, y)
            x.remove(k)
            # rf doesn't want it in ignore list
            # ignore_x.append(k)

        elif keepX is not None and not keepX.match(name):
            if not noPrint:
                print "Removing %d because name: %s doesn't match desired keepPattern %s" % (k, name, keepPattern)
            x.remove(k)
            ignore_x.append(k)

        # missing values reports as constant also. so do missing first.
        # remove all cols with missing values
        # could change it against num_rows for a ratio
        elif k in missingValuesDict:
            value = missingValuesDict[k]
            if not noPrint:
                print "Removing %d with name: %s because it has %d missing values" % (k, name, value)
            x.remove(k)
            ignore_x.append(k)

        elif k in constantValuesDict:
            value = constantValuesDict[k]
            if not noPrint:
                print "Removing %d with name: %s because it has constant value: %s " % (k, name, str(value))
            x.remove(k)
            ignore_x.append(k)

        # this is extra pruning..
        # remove all cols with enums, if not already removed
        elif k in enumSizeDict:
            value = enumSizeDict[k]
            if not noPrint:
                print "Removing %d %s because it has enums of size: %d" % (k, name, value)
            x.remove(k)
            ignore_x.append(k)

    if not noPrint:
        print "x has", len(x), "cols"
        print "ignore_x has", len(ignore_x), "cols"
    x = ",".join(map(str,x))
    ignore_x = ",".join(map(str,ignore_x))

    if not noPrint:
        print "\nx:", x
        print "\nignore_x:", ignore_x

    if forRF:
        return ignore_x
    else:
        return x
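
goodXFromColumnInfo can fetch the five dicts itself when given key, or accept them precomputed; with forRF=True it returns the complementary ignore list instead of the keep list. A minimal sketch, again assuming a hypothetical parsed key "a.hex" with response column C54:

    # comma-separated keep list (x) for GLM-style params
    x = goodXFromColumnInfo('C54', key='a.hex', timeoutSecs=120)
    # comma-separated ignore list for RF-style params
    ignore_x = goodXFromColumnInfo('C54', key='a.hex', forRF=True, timeoutSecs=120)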
Example #6
    def test_rf_enums_mappings_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            # (n, 4, 'cG', 300), 
            # (n, 5, 'cH', 300), 
            # (n, 6, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            if SPEEDRF:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'num_trees': 1,
                    'max_depth': 100,
                    'oobee': 1,
                    'seed': 123456789,
                }
            else:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                }

            for r in range(4):
                start = time.time()
                
                if SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                        timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                
                print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                # print h2o.dump_json(rfResult)
                (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
                h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, doAUC=not MULTINOMIAL) # , expectedAuc=0.5)
                
                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
Example #7
    def test_GLM_enums_score_subset(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'case_mode': '=',
                'case': 0
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            parseKey = h2o_cmd.parseFile(None,
                                         csvScorePathname,
                                         key2="score_" + key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)

            start = time.time()
            # score with same dataset (will change to recreated dataset with one less enum)
            glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'],
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=timeoutSecs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc
Example #8
    def test_parse_time(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        colCount = 6
        rowCount = 10
        headerData = rand_header(colCount)
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(1):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsA = inspect['numRows']
            numColsA = inspect['numCols']

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               timeoutSecs=100,
                                               numCols=numColsA,
                                               numRows=numRowsA,
                                               noPrint=True)

            print summaryResult
            h2o_cmd.infoFromSummary(summaryResult)
            (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)

            if constantValuesDictA or enumSizeDictA:
                raise Exception(
                    "Should be empty?  constantValuesDictA %s enumSizeDictA %s"
                    % (constantValuesDictA, enumSizeDictA))

            print "missingValuesListA", missingValuesListA

            # self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key,
                                      csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname,
                                            schema='put',
                                            src_key=src_key,
                                            hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time(
            ) - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsB = inspect['numRows']
            numColsB = inspect['numCols']
            print "missingValuesListB", missingValuesListB
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               timeoutSecs=100,
                                               numCols=numColsB,
                                               numRows=numRowsB,
                                               noPrint=True)
            (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
            if constantValuesDictB or enumSizeDictB:
                raise Exception(
                    "Should be empty?  constantValuesDictB %s enumSizeDictB %s"
                    % (constantValuesDictB, enumSizeDictB))

            self.assertEqual(
                missingValuesDictA, missingValuesDictB,
                "missingValuesDict mismatches after re-parse of downloadCsv result"
            )
            self.assertEqual(
                numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # but in this dataset we have a header too, so the row counts should be equal
            # if not, maybe the parse of our dataset didn't detect a row
            self.assertEqual(
                numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result"
                % (numRowsA, numRowsB))

            # FIX! should do some comparison of values?
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
Example #9
    def test_exec_enums_rand_cut(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 3, 2, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create 100 possible cut expressions here, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    # cel = colEnumList[c]
                    cel = colEnumList
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' && '.join(cutExprList)
                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

                print "j:", j

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # print h2o.dump_json(inspect)

            rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(rSummary)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==0:
                    start = time.time()
                    e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))

                    elapsed = time.time() - start
                    print "exec 1 took", elapsed, "seconds."
                    execTime = elapsed

                if 1==1:
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                if 1==0:
                    gKey = random.choice(eKeys)
                    # do a 2nd random to see if things blow up
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                    elapsed = time.time() - start
                    print "exec 3 took", elapsed, "seconds."

                if 1==1:
                    inspect = h2o_cmd.runInspect(key=fKey)
                    h2o_cmd.infoFromInspect(inspect, fKey)
                    numRows = inspect['numRows']
                    numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #10
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if getpass.getuser() == 'kevin':
            tryList = [
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300),
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None,
                 'cE', 300),
            ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key,
             timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=200,
                                           doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not iColCount:
                iColCount = 0

            if not oColCount:
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range(0, numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i)

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            # print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict

            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key)
            l = levels['levels']
            for column in range(iColCount, iColCount + oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************

                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=1)
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile,
                              "interpolated:", q['interpolated'], "iterations",
                              q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                if 1 == 1:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column,  # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                    )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
Example #11
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            if DO_DOUBLE_IMPORT:
                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                importFullList = importResult['files']
                importFailList = importResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                hex_key="A.hex", timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult['destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            fileMBS = (totalBytes/1e6)/elapsed
            msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
            print msg
            h2o.cloudPerfH2O.message(msg)
            h2o_cmd.checkKeyDistribution()

            # are the unparsed keys slowing down exec?
            h2i.delete_keys_at_all_nodes(pattern="manyfile")

            execExpr = 'B.hex=A.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()

            execExpr = 'C.hex=B.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()

            execExpr = 'D.hex=C.hex'
            h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
            h2o_cmd.checkKeyDistribution()
Example #12
    def test_GLM2_enums_score_subset(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 500
        tryList = [
            # (n, 1, 'cD', 300), 
            # (n, 2, 'cE', 300), 
            # (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            # parse the scoring dataset into its own result, so the training parseResult isn't clobbered
            parseScoreResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key,
                timeoutSecs=30, separator=colSepInt)

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {'response': y, 'max_iter': 8, 'family': 'binomial', 'n_folds': 2, 'alpha': 0.2, 'lambda': 1e-5}
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # Score *******************************
            # this messes up if you use case_mode/case_value above
            predictKey = 'Predict.hex'
            modelKey = glm['glm_model']['_key']
            h2o_cmd.runScore(dataKey="score_" + hex_key, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
Example #13
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=testDataKey,
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y+1), 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 0, 
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList = [
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.5, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length

                print 'beta', beta
                print 'iteration', iteration
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception("Why take so many iterations: %s in this glm2 training?" % iteration)

                # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                h2o_cmd.runScore(dataKey=testDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
Example #14
    def test_GLM2_enums_score_superset(self):
        h2o.beta_features = True
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, "cD", 300),
            (n, 2, "cE", 300),
            (n, 3, "cF", 300),
            (n, 4, "cG", 300),
            (n, 5, "cH", 300),
            (n, 6, "cI", 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = "0a"  # newline
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            # add an extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(
                csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar
            )

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(
                csvScorePathname,
                enumListForScore,
                rowCount,
                colCount,
                SEEDPERFILE,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
            )

            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            print "\n" + csvFilename
            (
                missingValuesDict,
                constantValuesDict,
                enumSizeDict,
                colTypeDict,
                colNameDict,
            ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True)

            y = colCount
            modelKey = "enums"
            kwargs = {
                "destination_key": modelKey,
                "response": y,
                "max_iter": 1,
                "n_folds": 1,
                "alpha": 0.2,
                "lambda": 1e-5,
                "family": "binomial",
            }

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(
                path=csvScorePathname, schema="put", hex_key=scoreDataKey, timeoutSecs=30, separator=colSepInt
            )

            # Score *******************************
            # this messes up if you use case_mode/case_value above
            predictKey = "Predict.hex"
            start = time.time()

            predictResult = h2o_cmd.runPredict(
                data_key=scoreDataKey, model_key=modelKey, destination_key=predictKey, timeoutSecs=timeoutSecs
            )

            # just get a predict and AUC on the same data. has to be binomial result
            resultAUC = h2o.nodes[0].generate_auc(
                thresholds=None, actual=scoreDataKey, predict="Predict.hex", vactual=y, vpredict=1
            )
            auc = resultAUC["AUC"]
            self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=scoreDataKey, predict=predictKey, vactual="C" + str(y + 1), vpredict="predict"
            )

            cm = predictCMResult["cm"]

            # These will move into h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)
Example #15
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=testDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y + 1),
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()
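            # Sweep several elastic-net settings against the same parsed frame:
            # alpha mixes the L1/L2 penalties, and lambda scales the overall
            # regularization strength.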

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.5,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 0
                },
            ]

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     noPoll=True,
                                     **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300,
                                 pollTimeoutSecs=300,
                                 retryDelaySecs=5,
                                 errorIfCancelled=True)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
                print "glm2 end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                if not validation or 'avg_err' not in validation:
                    raise Exception("glm: %s" % h2o.dump_json(glm) + \
                        "\nNo avg_err in validation." + \
                        "\nLikely if you look back, the job was cancelled, so there's no cross validation.")

                avg_err = validation['avg_err']
                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
                print 'beta', beta
                print 'iteration', iteration
                print 'avg_err', avg_err
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception(
                        "Why take so many iterations: %s in this glm2 training?"
                        % iteration)

                # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                predictKey = 'Predict.hex'
                start = time.time()

                predictResult = h2o_cmd.runPredict(data_key=testDataKey,
                                                   model_key=modelKey,
                                                   destination_key=predictKey,
                                                   timeoutSecs=timeoutSecs)

                predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                    actual=testDataKey,
                    vactual='C' + str(y + 1),
                    predict=predictKey,
                    vpredict='predict',
                )

                cm = predictCMResult['cm']

                # These will move into h2o_gbm.py
                pctWrong = h2o_gbm.pp_cm_summary(cm)
                self.assertLess(
                    pctWrong, 8,
                    "Should see less than 8 pct error (class = 4): %s" %
                    pctWrong)

                print "\nTest\n==========\n"
                print h2o_gbm.pp_cm(cm)

                if 1 == 0:
                    # stuff from GLM1

                    classErr = glmScore['validation']['classErr']
                    auc = glmScore['validation']['auc']
                    err = glmScore['validation']['err']
                    nullDev = glmScore['validation']['nullDev']
                    resDev = glmScore['validation']['resDev']
                    h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                    print "score classErr:", classErr
                    print "score err:", err
                    print "score auc:", auc
                    print "score resDev:", resDev
                    print "score nullDev:", nullDev

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", resDev)
                        raise Exception(emsg)

                    # what is reasonable?
                    # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                    self.assertAlmostEqual(
                        auc,
                        0.5,
                        delta=0.15,
                        msg="actual auc: %s not close enough to 0.5" % auc)

                    if math.isnan(err):
                        emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t",
                                                                     err)
                        raise Exception(emsg)

                    if math.isnan(resDev):
                        emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                            "resDev:\t", resDev)
                        raise Exception(emsg)

                    if math.isnan(nullDev):
                        emsg = "Why is this nullDev = 'nan'?? %6s %s" % (
                            "nullDev:\t", nullDev)
                        raise Exception(emsg)
Example #16
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542)  # start with all 542 columns; enums and the output are removed below
                    # remove the output too! (378)
                    ignore_x = []
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                        x.remove(i)
                        ignore_x.append(i)
                    x.remove(378)

                    # add one since the C-column names are 1-based, not 0-based
                    x = ",".join(map(lambda x: "C" + str(x+1), x))
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x, 
                        'response': 'C379', 
                        'max_iter': 4, 
                        'n_folds': 1, 
                        'family': 'binomial',
                        'alpha': 0.2, 
                        'lambda': 1e-5
                    }

                    # convert to binomial
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)

                    execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
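                    # the exec expression above rewrites col 379 (1-based) in place
                    # as a 0/1 indicator (value > 15), giving family='binomial' a
                    # valid two-level response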

                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Example #17
    def sub_c3_fvec_long(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140
        bucket = "home-0xdiag-datasets"
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = "manyfiles-nflx-gz"
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes) == 1:
            csvFilenameList = [("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600)]
        else:
            csvFilenameList = [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 1800),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ["cpu", "disk", "network"]
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema="local")
            importFullList = importResult["files"]
            importFailList = importResult["fails"]
            print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket,
                path=csvPathname,
                schema="local",
                hex_key=csvFilename + ".hex",
                timeoutSecs=timeoutSecs,
                retryDelaySecs=retryDelaySecs,
                pollTimeoutSecs=pollTimeoutSecs,
                benchmarkLogging=benchmarkLogging,
            )
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            print "Parse result['destination_key']:", parseResult["destination_key"]
            h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = "{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # start with all 542 columns; enums and the output are removed below
                # remove the output too! (378)
                ignore_x = []
                for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # add one since the C-column names are 1-based, not 0-based
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    "ignored_cols": ignore_x,
                    "response": "C379",
                    "max_iter": 4,
                    "n_folds": 1,
                    "family": "binomial",
                    "alpha": 0.2,
                    "lambda": 1e-5,
                }

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult["destination_key"]
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ("C379", "C379", 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=60)
                aHack = {"destination_key": "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = "{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs".format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed
                )
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
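Example #18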
    def test_parse_multi_header_rand(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(20):
            l = random.randint(1, 64)  # random length headers
            headerName = ''.join(
                [random.choice(allowedLetters) for _ in range(l)])
            headerChoices.append(headerName)

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            # FIX! one fails count for now
            # (1, 5, 9, 'cA', 60, 0),
            (1, 5, 9, 'cA', 60, 0),
            (1, 5, 25, 'cA', 60, 0),

            # try with col mismatch on header.
            # FIX! causes exception? don't test for now
            # (7, 300, 10, 'cA', 60, 0),
            # (7, 300, 10, 'cB', 60, 1),
            # (7, 300, 10, 'cC', 60, 2),
            # (7, 300, 10, 'cD', 60, 3),

            # (7, 300, 8, 'cA', 60, 0),
            # (7, 300, 8, 'cB', 60, 1),
            # (7, 300, 8, 'cC', 60, 2),
            # (7, 300, 8, 'cD', 60, 3),
        ]

        # so many random combos..rather than walk tryList, just do random for some amount of time
        for trial in range(50):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1
            # DATA_HAS_HDR_ROW = random.randint(0,1)
            DATA_HAS_HDR_ROW = 0
            # PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = 0
            ## DATA_FIRST_IS_COMMENT = random.randint(0,1)
            ## HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            print "TEMPORARY: don't put any comments in"
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            # none is not legal
            # SEP_CHAR_GEN = random.choice(paramsDict['separator'])
            SEP_CHAR_GEN = "\t"

            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', SEP_CHAR_GEN

            # they need to both use the same separator (h2o rule)
            hh = [random.choice(headerChoices)
                  for h in range(colCount)] + ["output"]
            print hh
            print "UPDATE: always use comma (space legal also?) for header separator?? it should work no matter what separator the data uses?"
            headerForHeader = ",".join(hh)
            # make these different
            hh = [random.choice(headerChoices)
                  for h in range(colCount)] + ["output"]
            headerForData = SEP_CHAR_GEN.join(hh)

            # random selection of parse param choices
            kwargs = {}
            for k, v in paramsDict.items():
                aChoice = random.choice(v)
                # can tell h2o something different compared to what we actually used!
                if k == 'separator':
                    if aChoice:
                        sepChar = aChoice
                        sepCharInt = ord(aChoice)  # make it an integer for h2o
                    else:
                        sepChar = ','  # default char for None, need it for header/data file creation
                        sepCharInt = None
                    aChoice = sepCharInt

                kwargs[k] = aChoice

            # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found
            # match what was gen'ed if choice is not None
            if kwargs['separator']:
                if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":  # parse doesn't auto-detect tab. will autodetect space and comma
                    del kwargs['separator']
                else:
                    kwargs['separator'] = ord(SEP_CHAR_GEN)

            # create data files
            for fileN in range(fileNum):
                csvFilename = 'syn_data_' + str(fileN) + "_" + str(
                    SEED) + "_" + str(trial) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(
                    csvPathname,
                    rowCount,
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                    rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT,
                    sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone

            # create the header file
            hdrFilename = 'syn_header_' + str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '.csv'
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                hdrPathname,
                dataRowsWithHeader,
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            if PARSE_PATTERN_INCLUDES_HEADER:  # only include header file data rows if the parse pattern includes it
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
            xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys']
            headerKey = [x for x in xs if hdrFilename in x][0]
            dataKey = [x for x in xs if csvFilename not in x][0]

            # use a regex; the only files in the dir are the ones we just created, so *fileN* matches
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'syn_header':
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'syn_data':
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            print "If header_from_file= is used, we are currently required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] = 1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] = 1
            else:
                kwargs['header'] = 0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = '*syn_*' + str(trial) + "_" + rowxcol + '*'
            else:
                pattern = '*syn_data_*' + str(trial) + "_" + rowxcol + '*'
            parseResult = h2i.parse_only(pattern=pattern,
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)

            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header, causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None)
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1

            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \
                (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it
            h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and
                                  (kwargs['header_from_file']
                                   is not None)) or DATA_HAS_HDR_ROW
            if h2oShouldSeeHeader:
                kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'}
            else:
                kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}

            start = time.time()
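            # NOTE: the RF kwargs above are built but never used, and the elapsed
            # prints below time an empty span, so the model call was presumably
            # elided from this example. A hedged sketch of the missing step, using
            # the same helper the other RF tests in this file call:
            # rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)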
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
Example #19
    def test_GLM_many_enums(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if not localhost:
            n = 200
            tryList = [
                (n, 1, 'cD', 300), 
                (n, 2, 'cE', 300), 
                (n, 3, 'cF', 300), 
                (n, 4, 'cG', 300), 
                (n, 5, 'cH', 300), 
                (n, 6, 'cI', 300), 
                ]
        else:
            n = 150
            tryList = [
                (n, 1, 'cD', 300), 
                (n, 2, 'cE', 300), 
                (n, 3, 'cF', 300), 
                (n, 4, 'cG', 300), 
                (n, 5, 'cH', 300), 
                (n, 6, 'cI', 300), 
                (n, 7, 'cJ', 300), 
                (n, 9, 'cK', 300), 
                (n, 10, 'cLA', 300), 
                (n, 11, 'cDA', 300), 
                (n, 12, 'cEA', 300), 
                (n, 13, 'cFA', 300), 
                (n, 14, 'cGA', 300), 
                (n, 15, 'cHA', 300), 
                (n, 16, 'cIA', 300), 
                (n, 17, 'cJA', 300), 
                (n, 19, 'cKA', 300), 
                (n, 20, 'cLA', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # just randomly pick the row and col cases.
            colSepCase = random.randint(0,1)
            colSepCase = 1
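            # NOTE: the random choice above is immediately overridden; we always
            # take the comma case for now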
            # using the comma is nice to ensure no craziness
            if (colSepCase==0):
                colSepHexString = '01'
                quoteChars = ",\'\"" # more choices for the unquoted string
            else:
                colSepHexString = '2c' # comma
                quoteChars = ""

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0,1)
            # using this instead makes the file 'row-readable' in an editor
            if (rowSepCase==0):
                rowSepHexString = '0a' # newline
            else:
                rowSepHexString = '0d0a' # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars)

            # FIX! does 'separator=' take an int or the two-char hex format?
            # we pass the int form (colSepInt) here, matching the other tests
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 
                'case_mode': '=', 'case': 0}
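            # case_mode '=' with case 0 asks GLM1 to binarize the response on the
            # fly: rows where y == 0 become the positive class (an assumption based
            # on how case_mode/case are used elsewhere in these tests)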
            start = time.time()
            ### glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
    def test_rf_enums_score_superset_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            # add an extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for rf model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key=scoreDataKey, 
                timeoutSecs=30, separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            ntrees = 5
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': ntrees,
                'validation': scoreDataKey,
            }

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "rf end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)
            predictKey = 'Predict.hex'
            h2o_cmd.runScore(dataKey=scoreDataKey, modelKey=modelKey, vactual=y, vpredict=1, expectedAuc=0.5)
Example #21
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList= [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                # double import still causing problems?
                # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                # importFullList = importResult['files']
                # importFailList = importResult['fails']
                # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # remove the output too! (378)
                    ignore_x = [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'ignored_cols': ignore_x,
                        'family': 'binomial',
                        'response': 'C379',
                        'max_iter': 4,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }

                    # are the unparsed keys slowing down exec?
                    h2i.delete_keys_at_all_nodes(pattern="manyfile")

                    # convert to binomial
                    execExpr="A.hex=%s" % parseResult['destination_key']
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    execExpr="A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                    h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                    aHack = {'destination_key': "A.hex"}

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
Example #22
    def test_quant_cols(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if getpass.getuser()=='kevin':
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), 
                ]
        else:
            tryList = [
                ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), 
                ]

        # h2b.browseTheCloud()
        trial = 0
        for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            xList = []
            eList = []
            fList = []

            # PARSE*******************************************************
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False)
            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            if not iColCount:
                iColCount = 0

            if not oColCount:
                oColCount = numCols

            colCount = iColCount + oColCount
            for i in range(0, numCols):
                print "Column", i, "summary"
                h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i)

            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
                print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict
            
            # start after the last input col
            levels = h2o.nodes[0].levels(source=hex_key)
            l = levels['levels']
            for column in range(iColCount, iColCount+oColCount):
                if l[column]:
                    print "Skipping", column, "because it's enum (says levels)"
                    continue

                # QUANTILE*******************************************************
                
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                start = time.time()
                # file has headers. use col index
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
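                # multiple_pass=1 requests the iterative multi-pass quantile
                # computation (the exact result, cf. h2oQuantilesExact below)
                # rather than a single-pass approximation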
                qresult = q['result']
                h2p.red_print("result:", q['result'], "quantile", quantile, 
                    "interpolated:", q['interpolated'], "iterations", q['iterations'])
                elapsed = time.time() - start
                print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist())
                if 1==0:
                    h2o_summ.quantile_comparisons(
                        csvPathnameFull,
                        skipHeader=True,
                        col=column, # what col to extract from the csv
                        datatype='float',
                        quantile=0.5 if DO_MEDIAN else 0.999,
                        # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                        # h2oQuantilesApprox=qresult_single,
                        h2oQuantilesExact=qresult,
                        use_genfromtxt=True,
                        )

                trial += 1
                execTime = 0
                xList.append(column)
                eList.append(execTime)
                fList.append(quantileTime)

                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on took", elapsed, 'seconds.'

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'column (0 is first)'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
Example #23
    def sub_c2_nongz_fvec_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 237270000
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx'
        print "Using non-gz'ed files in", importFolderPath
        csvFilenameList = [
            ("*[1][0-4][0-9].dat", "file_50_A.dat", 50 * avgMichalSize, 1800),
            # ("*[1][0-9][0-9].dat", "file_100_A.dat", 100 * avgMichalSize, 3600),
        ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes,
                    timeoutSecs) in enumerate(csvFilenameList):
            csvPathname = importFolderPath + "/" + csvFilepattern

            # double import still causing problems?
            # (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
            # importFullList = importResult['files']
            # importFailList = importResult['fails']
            # print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

            # this accumulates performance stats into a benchmark log over multiple runs
            # good for tracking whether we're getting slower or faster
            h2o.cloudPerfH2O.change_logfile(csvFilename)
            h2o.cloudPerfH2O.message("")
            h2o.cloudPerfH2O.message("Parse " + csvFilename +
                                     " Start--------------------------------")

            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=csvFilename + ".hex",
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=retryDelaySecs,
                                           pollTimeoutSecs=pollTimeoutSecs,
                                           benchmarkLogging=benchmarkLogging)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            if totalBytes is not None:
                fileMBS = (totalBytes / 1e6) / elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            if DO_GLM:
                # these are all the columns that are enums in the dataset...too many for GLM!
                x = range(542)  # start with all 542 columns; enums and the output are removed below
                # remove the output too! (378)
                ignore_x = []
                for i in [
                        3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20,
                        424, 425, 426, 540, 541
                ]:
                    x.remove(i)
                    ignore_x.append(i)
                x.remove(378)

                # plus 1 because the C-column names are 1-based, not 0-based
                x = ",".join(map(lambda x: "C" + str(x + 1), x))
                ignore_x = ",".join(map(lambda x: "C" + str(x + 1), ignore_x))

                GLMkwargs = {
                    'ignored_cols': ignore_x,
                    'family': 'binomial',
                    'response': 'C379',
                    'max_iter': 4,
                    'n_folds': 1,
                    'alpha': 0.2,
                    'lambda': 1e-5
                }

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                # convert to binomial
                execExpr = "A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % ('379', '379', 15)
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                aHack = {'destination_key': "A.hex"}

                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=aHack,
                                     timeoutSecs=timeoutSecs,
                                     **GLMkwargs)
                elapsed = time.time() - start
                h2o.check_sandbox_for_errors()

                h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern,
                    csvFilename, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)

            h2o_cmd.checkKeyDistribution()
Example #24
    def test_GLM_enums_unbalanced(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            testDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=testDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y + 1),
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 0,
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.5,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-12
                },
                {
                    'alpha': 0.0,
                    'lambda': 0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm2 end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length

                print 'beta', beta
                print 'iteration', iteration
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iteration > 20:
                    raise Exception(
                        "Why take so many iterations:  %s in this glm2 training?"
                        % iteration)

                # Score **********************************************
                print "Problems with test data having different enums than train? just use train for now"
                testDataKey = hex_key
                h2o_cmd.runScore(dataKey=testDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 expectedAuc=0.5)
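The trial loop above reuses one kwargs dict and mutates it with kwargs.update(updateDict), which only stays correct because every updateDict sets both 'alpha' and 'lambda'. A minimal sketch of the safer copy-per-trial variant (all names and values here are illustrative, not from the h2o API):

baseKwargs = {
    'response': 'C2',
    'family': 'binomial',
    'max_iter': 200,
    'alpha': 0,
    'lambda': 0,
}
updateList = [
    {'alpha': 0.5, 'lambda': 1e-4},
    {'alpha': 0.25, 'lambda': 1e-6},
    {'alpha': 0.0, 'lambda': 0},
]
for updateDict in updateList:
    kwargs = dict(baseKwargs)  # fresh copy, so one trial's values can't leak into the next
    kwargs.update(updateDict)
    print "running with alpha=%s lambda=%s" % (kwargs['alpha'], kwargs['lambda'])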
Example #25
    def test_parse_time_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_time.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        colCount = 6
        rowCount = 10
        headerData = rand_header(colCount)
        write_syn_dataset(csvPathname, rowCount, colCount, headerData)

        for trial in range(1):
            rowData = rand_rowData()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResultA = h2i.import_parse(path=csvPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "\nA trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsA = inspect['numRows']
            numColsA = inspect['numCols']

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsA, numRows=numRowsA, noPrint=True)

            print summaryResult
            h2o_cmd.infoFromSummary(summaryResult)
            (missingValuesDictA, constantValuesDictA, enumSizeDictA, colTypeDictA, colNameDictA) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)


            if constantValuesDictA or enumSizeDictA:
                raise Exception("Should be empty?  constantValuesDictA %s enumSizeDictA %s" % (constantValuesDictA, enumSizeDictA))

            print "missingValuesListA", missingValuesListA

            # self.assertEqual(missingValuesListA, [], "missingValuesList should be empty")
            self.assertEqual(numColsA, colCount)
            self.assertEqual(numRowsA, rowCount)

            # do a little testing of saving the key as a csv
            csvDownloadPathname = SYNDATASETS_DIR + "/csvDownload.csv"
            h2o.nodes[0].csv_download(src_key=hex_key, csvPathname=csvDownloadPathname)

            # remove the original parsed key. source was already removed by h2o
            h2o.nodes[0].remove_key(hex_key)
            # interesting. what happens when we do csv download with time data?
            start = time.time()
            parseResultB = h2i.import_parse(path=csvDownloadPathname, schema='put', src_key=src_key, hex_key=hex_key)
            print "B trial #", trial, "parse end on ", csvFilename, 'took', time.time() - start, 'seconds'
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRowsB = inspect['numRows']
            numColsB = inspect['numCols']
            print "missingValuesListB", missingValuesListB
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=100,
                numCols=numColsB, numRows=numRowsB, noPrint=True)
            (missingValuesDictB, constantValuesDictB, enumSizeDictB, colTypeDictB, colNameDictB) = \
                h2o_cmd.columnInfoFromInspect(hex_key, exceptionOnMissingValues=False)
            if constantValuesDictB or enumSizeDictB:
                raise Exception("Should be empty?  constantValuesDictB %s enumSizeDictB %s" % (constantValuesDictB, enumSizeDictB))

            self.assertEqual(missingValuesDictA, missingValuesDictB,
                "missingValuesDict mismatches after re-parse of downloadCsv result")
            self.assertEqual(numColsA, numColsB,
                "numCols mismatches after re-parse of downloadCsv result")
            # H2O adds a header to the csv created. It puts quotes around the col numbers if no header
            # but in this dataset we have a header too, so the row counts should be equal
            # if not, maybe the parse of our dataset didn't detect a row
            self.assertEqual(numRowsA, numRowsB,
                "numRowsA: %s numRowsB: %s mismatch after re-parse of downloadCsv result" % (numRowsA, numRowsB) )

            # FIX! should do some comparison of values? 
            # maybe can use exec to checksum the columns and compare column list.
            # or compare to expected values? (what are the expected values for the number for time inside h2o?)

            # FIX! should compare the results of the two parses. The infoFromInspect result?
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
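The round-trip check above (parse, csv_download, re-parse, compare shapes) can be approximated locally with nothing but the csv module. A sketch under the assumption that both files are plain CSV on local disk (csv_shape is a hypothetical helper, not part of the h2o test utilities):

import csv

def csv_shape(pathname):
    # rows and the widest row seen; a poor man's Inspect numRows/numCols
    rows, cols = 0, 0
    with open(pathname, 'rb') as f:
        for row in csv.reader(f):
            rows += 1
            cols = max(cols, len(row))
    return rows, cols

# rowsA, colsA = csv_shape(csvPathname)          # the original synthetic file
# rowsB, colsB = csv_shape(csvDownloadPathname)  # h2o's csv_download output
# both files carry a header here, so the shapes should match exactly:
# assert (rowsA, colsA) == (rowsB, colsB)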
Example #26
    def test_GLM_many_rooz_enums(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if localhost:
            n = 4000
            tryList = [
                (n, 999, 'cI', 300), 
                ]
        else:
            n = 5
            tryList = [
                (n, 1, 'cD', 300), 
                (n, 2, 'cE', 300), 
                (n, 3, 'cF', 300), 
                (n, 4, 'cG', 300), 
                (n, 5, 'cH', 300), 
                (n, 6, 'cI', 300), 
                (n, 7, 'cJ', 300), 
                (n, 9, 'cK', 300), 
                (n, 10, 'cLA', 300), 
                (n, 11, 'cDA', 300), 
                (n, 12, 'cEA', 300), 
                (n, 13, 'cFA', 300), 
                (n, 14, 'cGA', 300), 
                (n, 15, 'cHA', 300), 
                (n, 16, 'cIA', 300), 
                (n, 17, 'cJA', 300), 
                (n, 19, 'cKA', 300), 
                (n, 20, 'cLA', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # can randomly pick the row and col cases.
            ### colSepCase = random.randint(0,1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase==0):
                colSepHexString = '01'
            else:
                colSepHexString = '2c' # comma

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0,1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase==0):
                rowSepHexString = '0a' # newline
            else:
                rowSepHexString = '0d0a' # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            if DO_TEN_INTEGERS:
                csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            else:
                csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            # we allow some NAs in the list above
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'],exceptionOnMissingValues=False)

            y = colCount
            x = range(colCount)
            x = ",".join(map(str,x))
            # kwargs = {'x': x, 'y': y, 'max_iter': 6, 'n_folds': 1, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'poisson', 'case_mode': '=', 'case': 0}
            kwargs = {'y': y, 'max_iter': 6, 'n_folds': 1, 'alpha': 0.1, 'lambda': 1e-5, 'family': 'poisson', 'case_mode': '=', 'case': 0}
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
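Several of these tests derive the separator in two forms from one hex string: a one-character string for writing the synthetic file, and an integer for the parse request. A minimal sketch of just that decoding (Python 2; values copied from the tests above):

colSepHexString = '2c'                      # comma
colSepChar = colSepHexString.decode('hex')  # Python 2 hex codec -> ','
colSepInt = int(colSepHexString, base=16)   # -> 44, the integer form passed as separator=
assert colSepChar == ',' and colSepInt == ord(colSepChar)
print "colSepChar:", colSepChar, "colSepInt:", colSepInt

rowSepHexString = '0d0a'                    # cr + newline (windows) \r\n
rowSepChar = rowSepHexString.decode('hex')  # multi-byte separators decode the same way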
Example #27
    def sub_c3_nongz_fvec_long(self, csvFilenameList):
        # a kludge
        h2o.setup_benchmark_log()

        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'manyfiles-nflx'
        print "Using nongz'ed files in", importFolderPath

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                if DO_DOUBLE_IMPORT:
                    (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')
                    importFullList = importResult['files']
                    importFailList = importResult['fails']
                    print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key="A.hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                fileMBS = (totalBytes/1e6)/elapsed
                msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                    len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                print msg
                h2o.cloudPerfH2O.message(msg)
                h2o_cmd.checkKeyDistribution()

                # are the unparsed keys slowing down exec?
                h2i.delete_keys_at_all_nodes(pattern="manyfile")

                execExpr = 'B.hex=A.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'C.hex=B.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()

                execExpr = 'D.hex=C.hex'
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=180)
                h2o_cmd.checkKeyDistribution()
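The MB/sec figures logged above come from one small calculation. Pulled out as a sketch (the helper name parse_mb_per_sec is hypothetical; the byte count and elapsed time are illustrative):

def parse_mb_per_sec(totalBytes, elapsedSecs):
    # MB is 1e6 bytes here, matching the '{:6.2f} MB/sec' benchmark lines
    return (totalBytes / 1e6) / elapsedSecs

avgMichalSize = 116561140
print "%6.2f MB/sec" % parse_mb_per_sec(50 * avgMichalSize, 120.0)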
Example #28
    def test_parse_full_rand(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 1000000

        # from command line arg -long
        if h2o.long_test_case:
            repeat = 1000
            scale = 10  # scale up the # of rows
            tryList = [
                (n * scale, 3, 'cI', 300),
            ]
        else:
            repeat = 1
            scale = 1
            tryList = [
                (n, 3, 'cI', 300),
            ]

        lastcolsHistory = []

        for r in range(repeat):
            SEED_PER_FILE = random.randint(0, sys.maxint)
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c'  # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a'  # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                lastcols = write_syn_dataset(csvPathname,
                                             rowCount,
                                             colCount,
                                             scale=1,
                                             colSepChar=colSepChar,
                                             rowSepChar=rowSepChar,
                                             SEED=SEED_PER_FILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)
                # why are we saving this?
                lastcolsHistory.append(lastcols)

                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='put',
                                               hex_key=hex_key,
                                               header=0,
                                               timeoutSecs=60,
                                               separator=colSepInt,
                                               doSummary=DO_SUMMARY)
                print "Parse result['destination_key']:", parseResult[
                    'destination_key']

                inspect = h2o_cmd.runInspect(
                    key=parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                # Each column should get .10 random NAs per iteration. Within 10%?
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                # print "missingValuesList", missingValuesList
                # for mv in missingValuesList:
                #     self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv,
                #        msg='mv %s is not approx. expected %s' % (mv, expectedNA))

                # might have extra rows
                if numRows < rowCount:
                    raise Exception(
                        "Expect numRows %s >= rowCount %s since we can have extra eols"
                        % (numRows, rowCount))
                # numCols should be right?
                self.assertEqual(colCount, numCols)

                (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                    exceptionOnMissingValues=DISABLE_ALL_NA)
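The commented-out assertion above tolerates NA counts within 10% of the expected injection rate. A runnable sketch of that tolerance check in isolation (check_na_counts and the sample numbers are illustrative; the delta mirrors the commented assertAlmostEqual):

def check_na_counts(missingValuesList, expectedNA):
    # each observed NA count must be within 10% (of itself) of the expected count
    for mv in missingValuesList:
        if abs(mv - expectedNA) > 0.1 * mv:
            raise Exception("mv %s is not approx. expected %s" % (mv, expectedNA))

check_na_counts([98, 103, 101], expectedNA=100)  # passes: all within 10%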
Example #29
    def test_GLM_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 10,
                'alpha': 0,
                'lambda': 0,
                'thresholds': 0.5,
                # 'case_mode': '=',
                # 'case': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-8
                },
                {
                    'alpha': 0.5,
                    'lambda': 0.0
                },
                {
                    'alpha': 0.0,
                    'lambda': 0.0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                GLMModel = glm['GLMModel']
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel['iterations']
                modelKey = GLMModel['model_key']

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                # if iterations > 20:
                #    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="score_" + hex_key,
                                               timeoutSecs=30,
                                               separator=colSepInt)

                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum
                glmScore = h2o_cmd.runGLMScore(
                    key=parseResult['destination_key'],
                    model_key=modelKey,
                    thresholds="0.5",
                    timeoutSecs=timeoutSecs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                        "resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % (
                        "nullDev:\t", nullDev)
                    raise Exception(emsg)
Example #30
    def test_GLM2_many_rooz_enums(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if 1 == 0 and localhost:
            n = 4000
            tryList = [
                (n, 999, 'cI', 300),
            ]
        else:
            n = 100
            tryList = [
                (n, 1, 'cD', 300),
                (n, 2, 'cE', 300),
                (n, 3, 'cF', 300),
                (n, 4, 'cG', 300),
                (n, 5, 'cH', 300),
                (n, 6, 'cI', 300),
                (n, 7, 'cJ', 300),
                (n, 9, 'cK', 300),
                (n, 10, 'cLA', 300),
                (n, 11, 'cDA', 300),
                (n, 12, 'cEA', 300),
                (n, 13, 'cFA', 300),
                (n, 14, 'cGA', 300),
                (n, 15, 'cHA', 300),
                (n, 16, 'cIA', 300),
                (n, 17, 'cJA', 300),
                (n, 19, 'cKA', 300),
                (n, 20, 'cLA', 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # can randomly pick the row and col cases.
            ### colSepCase = random.randint(0,1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase == 0):
                colSepHexString = '01'
            else:
                colSepHexString = '2c'  # comma

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0, 1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase == 0):
                rowSepHexString = '0a'  # newline
            else:
                rowSepHexString = '0d0a'  # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            if DO_TEN_INTEGERS:
                csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
            else:
                csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(
                    colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            # we allow some NAs in the list above
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],exceptionOnMissingValues=False)

            y = colCount
            kwargs = {
                'use_all_factor_levels': 1,
                'response': y,
                'max_iter': 6,
                'n_folds': 1,
                'alpha': 0.0,
                'lambda': 1e-5,
                'family': 'poisson'
            }
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Example #31
    def sub_c2_rel_long(self):
        # a kludge
        h2o.setup_benchmark_log()

        avgMichalSize = 116561140 
        bucket = 'home-0xdiag-datasets'
        ### importFolderPath = 'more1_1200_link'
        importFolderPath = 'manyfiles-nflx-gz'
        print "Using .gz'ed files in", importFolderPath
        if len(h2o.nodes)==1:
            csvFilenameList= [
                ("*[1][0][0-9].dat.gz", "file_10_A.dat.gz", 10 * avgMichalSize, 600),
            ]
        else:
            csvFilenameList= [
                ("*[1][0-4][0-9].dat.gz", "file_50_A.dat.gz", 50 * avgMichalSize, 1800),
                # ("*[1][0-9][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
            ]

        if LOG_MACHINE_STATS:
            benchmarkLogging = ['cpu', 'disk', 'network']
        else:
            benchmarkLogging = []

        pollTimeoutSecs = 120
        retryDelaySecs = 10

        for trial, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
                csvPathname = importFolderPath + "/" + csvFilepattern

                (importResult, importPattern) = h2i.import_only(bucket=bucket, path=csvPathname, schema='local')

                # this accumulates performance stats into a benchmark log over multiple runs 
                # good for tracking whether we're getting slower or faster
                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")

                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local',
                    hex_key=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    benchmarkLogging=benchmarkLogging)
                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                print "Parse result['destination_key']:", parseResult['destination_key']
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    msg = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # don't include the output column
                    # remove the output too! (378)
                    ignore_x = []
                    # for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541]:
                    for i in [3,4,5,6,7,8,9,10,11,14,16,17,18,19,20,424,425,426,540,541,378]:
                        x.remove(i)
                        ignore_x.append(i)

                    # increment by one, because we are no longer zero offset!
                    x = ",".join(map(lambda x: "C" + str(x+1), x))
                    ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))

                    GLMkwargs = {
                        'family': 'binomial',
                        'x': x,
                        'y': 'C379',
                        'case': 15,
                        'case_mode': '>',
                        'max_iter': 4,
                        'n_folds': 1,
                        'alpha': 0.2,
                        'lambda': 1e-5
                    }

                    start = time.time()
                    glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()

                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    msg = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print msg
                    h2o.cloudPerfH2O.message(msg)

                h2o_cmd.checkKeyDistribution()
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500):  # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1, 64)  # random length headers
                headerName = ''.join(
                    [random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
        ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1

            DATA_HAS_HDR_ROW = random.randint(0, 1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0

            GZIP_DATA = random.randint(0, 1)
            GZIP_HEADER = random.randint(0, 1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both data and header files have a header, then, just in case
            # the header and data files should have the same separator
            # if they don't, make header match data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','

            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k, v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)

            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0, 1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0, 1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "

            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER

            # they need to both use the same separator (h2o rule)
            # can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData = SEP_CHAR_GEN.join(hfdList)

            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(
                    trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(
                    csvPathname,
                    rowCount,
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                    rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT,
                    sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(
                        csvPathname, SYNDATASETS_DIR + "/not_used_data_" +
                        csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname

            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                hdrPathname,
                dataRowsWithHeader,
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER:
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(
                    hdrPathname,
                    SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathnameh

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use regex. the only files in the dir will be the ones we just created
            # with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file']
                                       == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] = 1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] = 1
            else:
                kwargs['header'] = 0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
            else:
                pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern,
                                         hex_key=hex_key,
                                         timeoutSecs=timeoutSecs,
                                         **kwargs)
            print "parseResult['destination_key']: " + parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting: (we can error here if extra col in header,
            # causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                          exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data rows as a header because of mismatch in gen/param
            h2oLosesOneData = (headerRowsDone
                               == 0) and (kwargs['header']
                                          == 1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            h2oGainsOneData = False  # FIX! forced off for now; the computed value above is kept for reference
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1

            if 1 == 0:  # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {
                'sample': 100,
                'depth': 25,
                'ntree': 2,
                'ignore': ignoreForRf
            }
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
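The header-name generation at the top of this test rejection-samples until every name is unique. The same loop as a standalone sketch (unique_header_names is a hypothetical helper; a set would make the membership test O(1), but a list keeps generation order as the test does):

import random

allowedLetters = 'abcdeABCDE01234[]'

def unique_header_names(count, maxLen=64):
    # keep drawing random-length names until `count` distinct ones exist
    names = []
    while len(names) < count:
        l = random.randint(1, maxLen)
        name = ''.join([random.choice(allowedLetters) for _ in range(l)])
        if name not in names:
            names.append(name)
    return names

print unique_header_names(5)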
    def test_parse_multi_header_rand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        allowedLetters = 'abcdeABCDE01234[]'
        headerChoices = []
        for n in range(500): # max # of cols below is 500
            done = False
            while not done:
                l = random.randint(1,64) # random length headers
                headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
                # we keep trying if we already have that header name. Has to be unique.
                done = headerName not in headerChoices
            headerChoices.append(headerName)

        tryList = [
            (3, 5, 9, 'cA', 60, 0),
            # (3, 5, 25, 'cA', 60, 0),
            # (10, 100, 500, 'cA', 60, 0),
            ]

        for trial in range(20):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0
            # random selection of parse param choices

            # HEADER_HAS_HDR_ROW = random.randint(0,1)
            HEADER_HAS_HDR_ROW = 1
            
            DATA_HAS_HDR_ROW = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            # DATA_FIRST_IS_COMMENT = random.randint(0,1)
            # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            # FIX! doesn't seem to like just comment in the header file
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            
            GZIP_DATA = random.randint(0,1)
            GZIP_HEADER = random.randint(0,1)
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])

            HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
            if HEADER_SEP_CHAR_GEN == 'same':
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # don't put a header in a data file with a different separator?
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # Hack: if both data and header files have a header, then, just in case
            # the header and data files should have the same separator
            # if they don't, make header match data
            if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
                HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            # New for fvec? if separators are not the same, then the header separator needs to be comma
            if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
                HEADER_SEP_CHAR_GEN = ','


            # screw it. make them always match
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

            if HEADER_SEP_CHAR_GEN in (',', ' '):
                pass
                # extra spaces? Don't add any
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
                # if random.randint(0,1):
                #    HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

            kwargs = {}
            for k,v in paramsDict.items():
                kwargs[k] = random.choice(v)

            kwargs['separator'] = SEP_CHAR_GEN
            # parse doesn't auto-detect tab. will autodetect space and comma
            if SEP_CHAR_GEN==" "  or SEP_CHAR_GEN==",": 
                del kwargs['separator']
            else:
                kwargs['separator'] = ord(SEP_CHAR_GEN)
            
            # randomly add leading and trailing white space
            # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
            if SEP_CHAR_GEN in (',', ' '):
                if random.randint(0,1):
                    SEP_CHAR_GEN = " " + SEP_CHAR_GEN
                if random.randint(0,1):
                    SEP_CHAR_GEN = SEP_CHAR_GEN + " "


            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
            print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
            print 'GZIP_DATA:', GZIP_DATA
            print 'GZIP_HEADER:', GZIP_HEADER 

            # they need to both use the same separator (h2o rule)
# can't have duplicates
            hfhList = random.sample(headerChoices, colCount) + ["output"]
            # UPDATE: always use comma or space for header separator?? it should work no matter what 
            # separator the data uses?

            headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
            print "headerForHeader:", headerForHeader

            
            # make these different
            # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
            hfdList = hfhList

            headerForData   = SEP_CHAR_GEN.join(hfdList)

        
            # create data files
            for fileN in range(fileNum):
                csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
                csvFilename = 'syn_data_' + csvFilenameSuffix
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, 
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone
                if GZIP_DATA:
                    csvPathnamegz = csvPathname + ".gz"
                    print "gzipping to", csvPathnamegz
                    h2o_util.file_gzip(csvPathname, csvPathnamegz)
                    os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                    # pattern match should find the right key with csvPathname


            # create the header file
            hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            hdrFilename = 'syn_header_' + hdrFilenameSuffix
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, 
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            # only include header file data rows if the parse pattern includes it
            if PARSE_PATTERN_INCLUDES_HEADER: 
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_HEADER:
                hdrPathnamegz = hdrPathname + ".gz"
                print "gzipping to", hdrPathnamegz
                h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
                os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
                # pattern match should find the right key with hdrPathname

            # make sure all key names are unique when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON'T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files).
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False instead?

            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)

            h2o_cmd.runStoreView()
            headerKey = h2i.find_key(hdrFilename)
            dataKey = h2i.find_key(csvFilename)

            # use a regex; the only files in the dir will be the ones we just created,
            # so a *fileN* pattern matches them
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'header':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'data':
                # do we need to add the .hex suffix we know h2o will append
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
                ignoreForRf = hfhList[0]
            elif DATA_HAS_HDR_ROW:
                ignoreForRf = hfdList[0]
            else:
                ignoreForRf = None

            print "If header_from_file= , required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] =  1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] =  1
            else:
                kwargs['header'] =  0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*'
            else:
                pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*'

            # don't pass to parse
            kwargs.pop('hdr_separator', None)
            parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            print "parseResult['destination_key']: " + parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # more reporting (we can error here if an extra col in the header
            # causes all-NA for the missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

            # do we end up parsing one data row as a header because of a mismatch between gen and params?
            h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
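            # disabled for now; force it off regardless of the calculation above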
            h2oGainsOneData = False
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
                
            if 1==0: # FIX! don't check for now
                self.assertEqual(inspect['numRows'], totalDataRows,
                    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                    (inspect['numRows'], totalDataRows))

            # put in an ignore param that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it

            kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
            start = time.time()
            # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
            h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
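
A note on the gzip-and-rename step above: compressing to a new .gz file and renaming the original out of the syn_* glob keeps the parse pattern from matching both copies. A minimal sketch of what a helper like h2o_util.file_gzip presumably does (the real helper lives in h2o_util; this is an assumption, not its actual source):

import gzip
import shutil

def file_gzip(src, dst):
    # stream-copy src into a gzip-compressed dst
    fin = open(src, 'rb')
    fout = gzip.open(dst, 'wb')
    shutil.copyfileobj(fin, fout)
    fout.close()
    fin.close()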
Example #34
    def test_GLM2_enums_score_subset(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 500
        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key="score_" + hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'response': y,
                'max_iter': 8,
                'family': 'binomial',
                'n_folds': 2,
                'alpha': 0.2,
                'lambda': 1e-5
            }
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            # Score *******************************
            # this messes up if you use case_mode/case above
            predictKey = 'Predict.hex'
            modelKey = glm['glm_model']['_key']
            h2o_cmd.runScore(dataKey="score_" + hex_key,
                             modelKey=modelKey,
                             vactual=y,
                             vpredict=1,
                             expectedAuc=0.6)
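
The separator plumbing in these tests always starts from a two-character hex string. A small worked example of the decoding (Python 2 str.decode('hex'); the values are the comma case used above):

colSepHexString = '2c'
colSepChar = colSepHexString.decode('hex')    # ','
colSepInt = int(colSepHexString, base=16)     # 44
assert colSepChar == ',' and colSepInt == ord(',')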
Example #35

                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                print csvFilepattern, 'parse time:', parseResult['response']['time']
                print "Parse result['destination_key']:", parseResult['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

                        
                # the nflx data doesn't have a small enough # of classes in any col
                # use exec to randomFilter out 200 rows for a quick RF. that should work for everyone?
                origKey = parseResult['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseResult directly
                newParseKey = {'destination_key': 'a'}

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
Example #36
    def test_find_numbers(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            (n, 7, 'cJ', 300), 
            (n, 9, 'cK', 300), 
            (n, 10, 'cLA', 300), 
            (n, 11, 'cDA', 300), 
            (n, 12, 'cEA', 300), 
            (n, 13, 'cFA', 300), 
            (n, 14, 'cGA', 300), 
            (n, 15, 'cHA', 300), 
            (n, 16, 'cIA', 300), 
            (n, 17, 'cJA', 300), 
            (n, 19, 'cKA', 300), 
            (n, 20, 'cLA', 300), 
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            if COL_SEP_HIVE:
                colSepHexString = '01'
                quoteChars = ",\'\"" # more choices for the unquoted string
            else:
                colSepHexString = '2c' # comma
                quoteChars = ""

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0,1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase==0):
                rowSepHexString = '0a' # newline
            else:
                rowSepHexString = '0d0a' # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar, quoteChars=quoteChars)

            # FIX! does 'separator=' take an int or the two-char hex string?
            # here we pass the decoded integer value (colSepInt)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
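
When COL_SEP_HIVE is set, the tests switch to Hive's default field delimiter, control-A (0x01); since that byte can't collide with printable data, commas and quotes become safe inside the unquoted enum strings. The decoding works the same way as the comma case:

assert '01'.decode('hex') == '\x01'     # Hive control-A column separator
assert '0d0a'.decode('hex') == '\r\n'   # the optional Windows row separator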
Example #37
    def test_rf_enums_score_superset_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            # add an extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for rf model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            ntrees = 5
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'classification': 1,
                'ntrees': ntrees,
                'validation': scoreDataKey,
            }

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "rf end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult,
                                                     ntree=ntrees)
            predictKey = 'Predict.hex'
            h2o_cmd.runScore(dataKey=scoreDataKey,
                             modelKey=modelKey,
                             vactual=y,
                             vpredict=1,
                             expectedAuc=0.5)
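
The superset trick above is the whole point of this test: the scoring file reuses half of the training enums plus one level, "xyzzy", that the model has never seen. In miniature (create_enum_list's real output is random; the list here is hypothetical):

import random
enumList = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
enumListForScore = random.sample(enumList, 5)
enumListForScore.append("xyzzy")
assert "xyzzy" not in enumList and "xyzzy" in enumListForScore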
Example #38
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, 
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            kwargs = {
                'y': y, 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 10, 
                'alpha': 0, 
                'lambda': 0, 
                'thresholds': 0.5,
                # 'case_mode': '=', 
                # 'case': 0,
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-8},
                {'alpha': 0.5, 'lambda': 0.0},
                {'alpha': 0.0, 'lambda': 0.0},
            ]

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                GLMModel = glm['GLMModel']
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel['iterations']
                modelKey = GLMModel['model_key']

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iterations > 20:
                    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)


                start = time.time()
                # score with the same dataset (will change to a recreated dataset with one less enum)
                glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
                    model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
                    raise Exception(emsg)
Example #39
    def test_GLM2_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'n_folds': 0,
                'destination_key': modelKey,
                'response': y,
                'max_iter': 200,
                'family': 'binomial',
                'alpha': 0,
                'lambda': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-5
                },
                # {'alpha': 0.25, 'lambda': 1e-4},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="B.hex",
                                               timeoutSecs=30,
                                               separator=colSepInt)

                h2o_cmd.runScore(dataKey="B.hex",
                                 modelKey=modelKey,
                                 vactual='C' + str(y + 1),
                                 vpredict=1,
                                 expectedAuc=0.45)
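
One detail worth calling out: runScore is given vactual='C' + str(y + 1) rather than the index y, because the parsed frame's default column names appear to be C1..Cn (1-based), so the response index maps to a name like this:

y = 32                       # response column index used above
vactual = 'C' + str(y + 1)   # -> 'C33', the default name of the (y+1)-th column
assert vactual == 'C33'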
Example #40
    def test_rf_many_rooz_enums_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if 1==0 and localhost:
            n = 4000
            tryList = [
                (n, 999, 'cI', 300), 
                ]
        else:
            n = 100
            tryList = [
                (n, 1, 'cD', 300), 
                (n, 2, 'cE', 300), 
                (n, 3, 'cF', 300), 
                (n, 4, 'cG', 300), 
                (n, 5, 'cH', 300), 
                (n, 6, 'cI', 300), 
                (n, 7, 'cJ', 300), 
                (n, 9, 'cK', 300), 
                (n, 10, 'cLA', 300), 
                (n, 11, 'cDA', 300), 
                (n, 12, 'cEA', 300), 
                (n, 13, 'cFA', 300), 
                (n, 14, 'cGA', 300), 
                (n, 15, 'cHA', 300), 
                (n, 16, 'cIA', 300), 
                (n, 17, 'cJA', 300), 
                (n, 19, 'cKA', 300), 
                (n, 20, 'cLA', 300), 
                ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # can randomly pick the row and col cases.
            ### colSepCase = random.randint(0,1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase==0):
                colSepHexString = '01'
            else:
                colSepHexString = '2c' # comma

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0,1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase==0):
                rowSepHexString = '0a' # newline
            else:
                rowSepHexString = '0d0a' # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            if DO_TEN_INTEGERS:
                csvFilename = 'syn_rooz_int10_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            else:
                csvFilename = 'syn_rooz_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            # FIX! does 'separator=' take an int or the two-char hex string?
            # here we pass the decoded integer value (colSepInt)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            # we allow some NAs in the list above
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],exceptionOnMissingValues=False)

            y = colCount
            ntrees = 5
            kwargs = {
                'response': y, 
                'classification': 1,
                'ntrees': ntrees,
            }
            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseResult, 
                timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "rf end on ", csvPathname, 'took', time.time() - start, 'seconds'
            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult, ntree=ntrees)
            modelKey = rfResult['drf_model']['_key']
            h2o_cmd.runScore(dataKey=parseResult['destination_key'], modelKey=modelKey,
                vactual=colCount, vpredict=1, expectedAuc=0.5, doAUC=False)
Example #41
    def test_parse_rand_enum_compress(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 1000000

        # from command line arg -long
        if h2o.long_test_case:
            repeat = 1000 
            scale = 10 # scale up the # of rows
            tryList = [
                (n*scale, 1, 'cI', 300), 
                (n*scale, 1, 'cI', 300), 
                (n*scale, 1, 'cI', 300), 
            ]
        else:
            repeat = 1
            scale = 1
            tryList = [
                (n, 3, 'cI', 300), 
                (n, 3, 'cI', 300), 
                (n, 3, 'cI', 300), 
            ]

        lastcolsHistory = []

        enumList = create_enum_list(listSize=ENUMS_NUM)

        for r in range(repeat):
            SEED_PER_FILE = random.randint(0, sys.maxint)
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c' # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a' # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                lastcols = write_syn_dataset(csvPathname, enumList, rowCount, colCount, scale=1,
                    colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEED_PER_FILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)
                # why are we saving this?
                lastcolsHistory.append(lastcols)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0,
                    timeoutSecs=30, separator=colSepInt, doSummary=DO_SUMMARY)
                print "Parse result['destination_key']:", parseResult['destination_key']
                
                inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                h2o_cmd.infoFromInspect(inspect)

                # Each column should get ~10% random NAs per iteration. Within 10%?
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                # print "missingValuesList", missingValuesList
                # for mv in missingValuesList:
                #     self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, 
                #        msg='mv %s is not approx. expected %s' % (mv, expectedNA))

                self.assertEqual(rowCount, numRows)
                self.assertEqual(colCount, numCols)

                (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], 
                    exceptionOnMissingValues=DISABLE_ALL_NA)
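
A sketch of the disabled NA assertion above, assuming expectedNA is the expected per-column missing count (the name comes from the commented-out code; the counts below are made up):

expectedNA = 0.10 * 1000000                    # assumed: ~10% of a million rows
missingValuesList = [99123, 101457, 98800]     # hypothetical counts from infoFromInspect
for mv in missingValuesList:
    assert abs(mv - expectedNA) <= 0.1 * mv, \
        'mv %s is not approx. expected %s' % (mv, expectedNA)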
Example #42
    def test_GLM_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, "cD", 300),
            (n, 2, "cE", 300),
            (n, 4, "cF", 300),
            (n, 8, "cG", 300),
            (n, 16, "cH", 300),
            (n, 32, "cI", 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = "0a"  # newline
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(
                csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar
            )

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(
                csvScorePathname,
                enumListForScore,
                rowCount,
                colCount,
                SEEDPERFILE,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
            )

            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse result['destination_key']:", parseResult["destination_key"]

            print "\n" + csvFilename
            (
                missingValuesDict,
                constantValuesDict,
                enumSizeDict,
                colTypeDict,
                colNameDict,
            ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                "y": y,
                "max_iter": 200,
                "family": "binomial",
                "n_folds": 10,
                "alpha": 0,
                "lambda": 0,
                "thresholds": 0.5,
                # 'case_mode': '=',
                # 'case': 0,
            }

            start = time.time()

            updateList = [
                {"alpha": 0.5, "lambda": 1e-4},
                {"alpha": 0.25, "lambda": 1e-6},
                {"alpha": 0.0, "lambda": 1e-8},
                {"alpha": 0.5, "lambda": 0.0},
                {"alpha": 0.0, "lambda": 0.0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"

                GLMModel = glm["GLMModel"]
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel["iterations"]
                modelKey = GLMModel["model_key"]

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                # if iterations > 20:
                #    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)

                parseResult = h2i.import_parse(
                    path=csvScorePathname, schema="put", hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt
                )

                start = time.time()
                # score with the same dataset (will change to a recreated dataset with one less enum)
                glmScore = h2o_cmd.runGLMScore(
                    key=parseResult["destination_key"], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs
                )
                print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                ### print h2o.dump_json(glmScore)
                classErr = glmScore["validation"]["classErr"]
                auc = glmScore["validation"]["auc"]
                err = glmScore["validation"]["err"]
                nullDev = glmScore["validation"]["nullDev"]
                resDev = glmScore["validation"]["resDev"]
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
                    raise Exception(emsg)
Example #43
    def test_find_numbers(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 3
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
            (n, 10, 'cLA', 300),
            (n, 11, 'cDA', 300),
            (n, 12, 'cEA', 300),
            (n, 13, 'cFA', 300),
            (n, 14, 'cGA', 300),
            (n, 15, 'cHA', 300),
            (n, 16, 'cIA', 300),
            (n, 17, 'cJA', 300),
            (n, 19, 'cKA', 300),
            (n, 20, 'cLA', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            if COL_SEP_HIVE:
                colSepHexString = '01'
                quoteChars = ",\'\""  # more choices for the unquoted string
            else:
                colSepHexString = '2c'  # comma
                quoteChars = ""

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0, 1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase == 0):
                rowSepHexString = '0a'  # newline
            else:
                rowSepHexString = '0d0a'  # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar,
                              quoteChars=quoteChars)

            # FIX! does 'separator=' take an int or the two-char hex string?
            # here we pass the decoded integer value (colSepInt)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)
Example #44
    def test_GLM_many_enums(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if not localhost:
            n = 200
            tryList = [
                (n, 1, 'cD', 300),
                (n, 2, 'cE', 300),
                (n, 3, 'cF', 300),
                (n, 4, 'cG', 300),
                (n, 5, 'cH', 300),
                (n, 6, 'cI', 300),
            ]
        else:
            n = 150
            tryList = [
                (n, 1, 'cD', 300),
                (n, 2, 'cE', 300),
                (n, 3, 'cF', 300),
                (n, 4, 'cG', 300),
                (n, 5, 'cH', 300),
                (n, 6, 'cI', 300),
                (n, 7, 'cJ', 300),
                (n, 9, 'cK', 300),
                (n, 10, 'cLA', 300),
                (n, 11, 'cDA', 300),
                (n, 12, 'cEA', 300),
                (n, 13, 'cFA', 300),
                (n, 14, 'cGA', 300),
                (n, 15, 'cHA', 300),
                (n, 16, 'cIA', 300),
                (n, 17, 'cJA', 300),
                (n, 19, 'cKA', 300),
                (n, 20, 'cLA', 300),
            ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # just randomly pick the row and col cases
            # (the random col choice is immediately overridden; always use the comma case)
            colSepCase = random.randint(0, 1)
            colSepCase = 1
            # using the comma is nice to ensure no craziness
            if (colSepCase == 0):
                colSepHexString = '01'
                quoteChars = ",\'\""  # more choices for the unquoted string
            else:
                colSepHexString = '2c'  # comma
                quoteChars = ""

            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar
            print "colSepInt", colSepInt

            rowSepCase = random.randint(0, 1)
            # using this instead, makes the file, 'row-readable' in an editor
            if (rowSepCase == 0):
                rowSepHexString = '0a'  # newline
            else:
                rowSepHexString = '0d0a'  # cr + newline (windows) \r\n

            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar,
                              quoteChars=quoteChars)

            # FIX! does 'separator=' take an int or the two-char hex string?
            # here we pass the decoded integer value (colSepInt)
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            # We should be able to see the parse result?
            ### inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'case_mode': '=',
                'case': 0
            }
            start = time.time()
            ### glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", csvPathname, 'took', time.time() - start, 'seconds'
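
The case_mode/case pair in the kwargs above presumably recodes the response into a binary outcome before the binomial fit; '=' with case 0 would mean (value == 0):

values = [0, 3, 0, 7]                          # hypothetical response column
binary = [1 if v == 0 else 0 for v in values]  # what case_mode '=' / case 0 would produce
assert binary == [1, 0, 1, 0]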
Example #45
    def test_rf_enums_mappings(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            # (n, 1, 'cD', 300),
            # (n, 2, 'cE', 300),
            # (n, 3, 'cF', 300),
            # (n, 4, 'cG', 300),
            # (n, 5, 'cH', 300),
            # (n, 6, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
            (ROWS, COLS, 'cI', 300),
        ]

        # SEED_FOR_TRAIN = random.randint(0, sys.maxint)
        SEED_FOR_TRAIN = 1234567890
        SEED_FOR_SCORE = 9876543210
        errorHistory = []
        enumHistory = []
        lastcolsTrainHistory = []
        lastcolsScoreHistory = []

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            enumList = create_enum_list(listSize=ENUMS)
            # reverse the list
            enumList.reverse()

            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            # use same enum List
            enumListForScore = enumList

            print "Creating random", csvPathname, "for rf model building"
            lastcols = write_syn_dataset(csvPathname,
                                         enumList,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_TRAIN)

            lastcolsTrainHistory.append(lastcols)

            print "Creating random", csvScorePathname, "for rf scoring with prior model (using same enum list)"
            # same enum list/mapping, but different dataset?
            lastcols = write_syn_dataset(csvScorePathname,
                                         enumListForScore,
                                         rowCount,
                                         colCount,
                                         colSepChar=colSepChar,
                                         rowSepChar=rowSepChar,
                                         SEED=SEED_FOR_SCORE)
            lastcolsScoreHistory.append(lastcols)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            # limit depth and number of trees to accentuate the issue with categorical split decisions

            # use mtries so both look at all cols at every split? it doesn't matter for speedrf:
            # does speedrf try one more time? with 3 cols and mtries=2, another try might
            # get a look at the missing col. it does matter for drf2: does it "just stop"?
            # trying mtries always looking at all columns, or at just 1 col, might be interesting
            if SPEEDRF:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'ntrees': 1,
                    'max_depth': 100,
                    # 'oobee': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'mtries': COLS,
                }
            elif GBM:
                kwargs = {
                    'destination_key': modelKey,
                    'response': y,
                    'validation': scoreDataKey,
                    'seed': 123456789,
                    # 'learn_rate': .1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'classification': 1,
                }
            else:
                kwargs = {
                    'sample_rate': 0.999,
                    'destination_key': modelKey,
                    'response': y,
                    'classification': 1,
                    'ntrees': 1,
                    'max_depth': 100,
                    'min_rows': 1,
                    'validation': hex_key,
                    # 'validation': scoreDataKey,
                    'seed': 123456789,
                    'nbins': 1024,
                    'mtries': COLS,
                }
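            # note on the three kwargs variants above: SPEEDRF and the plain-RF branch
            # validate against the training frame (hex_key), while GBM validates against
            # the holdout (scoreDataKey); all three grow a single deep tree so a bad
            # categorical split decision shows up directly in the error numbers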

            for r in range(2):
                start = time.time()

                if GBM:
                    gbmResult = h2o_cmd.runGBM(parseResult=parseResult,
                                               timeoutSecs=timeoutSecs,
                                               pollTimeoutSecs=180,
                                               **kwargs)

                    print "gbm end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    # print h2o.dump_json(gbmResult)
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_gbm.simpleCheckGBMView(gbmv=gbmResult)

                elif SPEEDRF:
                    rfResult = h2o_cmd.runSpeeDRF(parseResult=parseResult,
                                                  timeoutSecs=timeoutSecs,
                                                  pollTimeoutSecs=180,
                                                  **kwargs)
                    print "speedrf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                else:
                    rfResult = h2o_cmd.runRF(parseResult=parseResult,
                                             timeoutSecs=timeoutSecs,
                                             pollTimeoutSecs=180,
                                             **kwargs)
                    print "rf end on ", parseResult[
                        'destination_key'], 'took', time.time(
                        ) - start, 'seconds'
                    (classification_error, classErrorPctList,
                     totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)

                h2o_cmd.runScore(dataKey=scoreDataKey,
                                 modelKey=modelKey,
                                 vactual=y,
                                 vpredict=1,
                                 doAUC=not MULTINOMIAL)  # , expectedAuc=0.5)

                errorHistory.append(classification_error)
                enumHistory.append(enumList)

            print "error from all runs on this dataset (with different enum mappings)"
            print errorHistory
            for e in enumHistory:
                print e

            print "last row from all train datasets, as integer"
            for l in lastcolsTrainHistory:
                print l
            print "last row from all score datasets, as integer"
            for l in lastcolsScoreHistory:
                print l
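The hex-string separator setup used throughout these tests is easy to misread, so here is a minimal standalone sketch (Python 2) of just that piece: the same hex string drives both the file writer (as a character) and the parse request (as an integer code).

# minimal sketch of the separator handling above (Python 2 only: str.decode('hex'))
colSepHexString = '2c'                      # ',' in hex
colSepChar = colSepHexString.decode('hex')  # literal character, for writing the csv
colSepInt = int(colSepHexString, base=16)   # integer code, passed as separator= to parse
assert colSepChar == ',' and colSepInt == ord(',')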
Example #46
    def test_exec_enums_rand_cut2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            # (n, 10, 9, 'cE', 300), 
            (n, 1, 1, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create CUT_EXPR_CNT cut expressions up front, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                MAX_COLS_IN_EXPR = iColCount
                cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # randomly pick == or !=
                        if random.randint(0,1)==0:
                            cutExprList.append('p$C'+str(i+1)+'!='+c)
                        else:
                            cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' & '.join(cutExprList)
                # print "cutExpr:", cutExpr    

                # just extract one output col (the first one)
                rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1)
                # print "rowExpr:", rowExpr
                print rowExpr
                rowExprList.append(rowExpr)
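            # worked example (hypothetical values): with iColCount=3 and
            # cutValue == ['2', None, '5'], the loop above yields
            #   cutExpr: p$C1!=2 & p$C3==5     (== / != chosen at random per term)
            # and the row expression extracts the first output column (col 4):
            #   p[p$C1!=2 & p$C3==5,4];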


            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            src_key = csvFilename
            # stage the same csv under ten src keys so the glob parse below sees ten files
            for keyPrefix in 'ABCDEFGHIJ':
                parseResult = h2i.import_only(path=csvPathname, schema='put', src_key=keyPrefix + src_key, timeoutSecs=200)

            parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            pNumRows = inspect['numRows']
            pNumCols = inspect['numCols']
            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(CUT_LOOP_CNT):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."
            
                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                column = 0
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)



        #****************************************************************
        # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET
        print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col"
        quantile = 0.5 if DO_MEDIAN else .999
        # first output col. always fed by an exec cut, so 0?
        column = iColCount
        start = time.time()
        q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), 
            quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0)
        elapsed = time.time() - start
        h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result'])
        print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.'
        quantileTime = elapsed

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
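For reference, the quantiles call these loops are timing has this shape. The argument names come from the calls above; the literal values here are hypothetical stand-ins for fKey, DO_MEDIAN, MAX_QBINS, and MULTI_PASS.

# hedged sketch: 'e0', 0.5, 1000, 1 are made-up stand-ins for the module constants
q = h2o.nodes[0].quantiles(source_key='e0',   # frame produced by the exec cut
                           column=0,          # the single surviving output col
                           quantile=0.5,      # 0.5 when DO_MEDIAN else 0.999
                           max_qbins=1000,
                           multiple_pass=1)
print q['result']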
Example #47
def goodXFromColumnInfo(y, 
    num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None, 
    colTypeDict=None, colNameDict=None, keepPattern=None, key=None, 
    timeoutSecs=120, forRF=False, noPrint=False):

    y = str(y)

    # if we pass a key, means we want to get the info ourselves here
    if key is not None:
        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, 
            max_column_display=99999999, timeoutSecs=timeoutSecs)
        num_cols = len(colNameDict)

    # now remove any whose names don't match the required keepPattern
    if keepPattern is not None:
        keepX = re.compile(keepPattern)
    else:
        keepX = None

    x = range(num_cols)
    # need to walk over a copy, cause we change x
    xOrig = x[:]
    ignore_x = [] # for use by RF
    for k in xOrig:
        name = colNameDict[k]
        # remove it if it has the same name as the y output
        if str(k)== y: # if they pass the col index as y
            if not noPrint:
                print "Removing %d because name: %s matches output %s" % (k, str(k), y)
            x.remove(k)
            # rf doesn't want it in ignore list
            # ignore_x.append(k)
        elif name == y: # if they pass the name as y 
            if not noPrint:
                print "Removing %d because name: %s matches output %s" % (k, name, y)
            x.remove(k)
            # rf doesn't want it in ignore list
            # ignore_x.append(k)

        elif keepX is not None and not keepX.match(name):
            if not noPrint:
                print "Removing %d because name: %s doesn't match desired keepPattern %s" % (k, name, keepPattern)
            x.remove(k)
            ignore_x.append(k)

        # missing values reports as constant also. so do missing first.
        # remove all cols with missing values
        # could change it against num_rows for a ratio
        elif k in missingValuesDict:
            value = missingValuesDict[k]
            if not noPrint:
                print "Removing %d with name: %s because it has %d missing values" % (k, name, value)
            x.remove(k)
            ignore_x.append(k)

        elif k in constantValuesDict:
            value = constantValuesDict[k]
            if not noPrint:
                print "Removing %d with name: %s because it has constant value: %s " % (k, name, str(value))
            x.remove(k)
            ignore_x.append(k)

        # this is extra pruning..
        # remove all cols with enums, if not already removed
        elif k in enumSizeDict:
            value = enumSizeDict[k]
            if not noPrint:
                print "Removing %d %s because it has enums of size: %d" % (k, name, value)
            x.remove(k)
            ignore_x.append(k)

    if not noPrint:
        print "x has", len(x), "cols"
        print "ignore_x has", len(ignore_x), "cols"
    x = ",".join(map(str,x))
    ignore_x = ",".join(map(str,ignore_x))

    if not noPrint:
        print "\nx:", x
        print "\nignore_x:", ignore_x

    if forRF:
        return ignore_x
    else:
        return x
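A hypothetical usage sketch: the key name, the response column, and feeding the result into an RF 'ignore' param are all assumptions for illustration, not taken from this snippet.

# hedged sketch: build an RF ignore list from a parsed frame's column info
# ('myframe.hex' and response col '54' are made-up values)
ignore_x = goodXFromColumnInfo('54', key='myframe.hex', forRF=True, noPrint=True)
kwargs = {'ntree': 1, 'ignore': ignore_x}   # 'ignore' as used in the header test below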
Example #48
    def test_exec_enums_rand_cut(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 3, 2, 'cE', 300), 
            ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create CUT_EXPR_CNT cut expressions up front, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                cols = random.sample(range(iColCount), random.randint(1,iColCount))
                for c in cols:
                    # possible choices within the column
                    # cel = colEnumList[c]
                    cel = colEnumList
                    # for now the cutValues are numbers for the enum mappings
                    if 1==1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice
    
                cutExprList = []
                for i,c in enumerate(cutValue):
                    if c is None:   
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        cutExprList.append('p$C'+str(i+1)+'=='+c)

                cutExpr = ' && '.join(cutExprList)
                print "cutExpr:", cutExpr    

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                rowExpr = '%s[%s,];' % (hex_key, cutExpr)
                print "rowExpr:", rowExpr
                rowExprList.append(rowExpr)

                print "j:", j

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

            # PARSE*******************************************************

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0)

            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # print h2o.dump_json(inspect)

            rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            h2o_cmd.infoFromSummary(rSummary)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1==1:
                a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                    ## print h2o.dump_json(e)


            xList = []
            eList = []
            fList = []
            for repeat in range(200):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0,iColCount-1)
                randOCol = random.randint(iColCount, iColCount+oColCount-1)

                # should be two different keys in the sample
                e = random.sample(eKeys,2)
                fKey = e[0]
                eKey = e[1]

                if 1==0:
                    start = time.time()
                    e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))

                    elapsed = time.time() - start
                    print "exec 1 took", elapsed, "seconds."
                    execTime = elapsed

                if 1==1:
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                    elapsed = time.time() - start
                    execTime = elapsed
                    print "exec 2 took", elapsed, "seconds."
                
                if 1==0:
                    gKey = random.choice(eKeys)
                    # do a 2nd random to see if things blow up
                    start = time.time()
                    h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                    elapsed = time.time() - start
                    print "exec 3 took", elapsed, "seconds."

                if 1==1:
                    inspect = h2o_cmd.runInspect(key=fKey)
                    h2o_cmd.infoFromInspect(inspect, fKey)
                    numRows = inspect['numRows']
                    numCols = inspect['numCols']

                if numRows==0 or numCols!=colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, 
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed


                # remove all keys*******************************************************
                # what about hex_key?
                if 1==0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)


        # just get a plot of the last one (biggest)
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Example #49
    def test_exec_enums_rand_cut2(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = ROWS
        tryList = [
            # (n, 10, 9, 'cE', 300),
            (n, 1, 1, 'cE', 300),
        ]

        # create key names to use for exec
        eKeys = ['e%s' % i for i in range(10)]

        # h2b.browseTheCloud()
        trial = 0
        for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
            colCount = iColCount + oColCount

            hex_key = 'p'
            colEnumList = create_col_enum_list(iColCount)

            # create CUT_EXPR_CNT cut expressions up front, so we don't waste time below
            rowExprList = []
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            for j in range(CUT_EXPR_CNT):
                # init cutValue. None means no compare
                cutValue = [None for i in range(iColCount)]
                # build up a random cut expression
                MAX_COLS_IN_EXPR = iColCount
                cols = random.sample(range(MAX_COLS_IN_EXPR),
                                     random.randint(1, MAX_COLS_IN_EXPR))
                for c in cols:
                    # possible choices within the column
                    cel = colEnumList[c]
                    # for now the cutValues are numbers for the enum mappings
                    if 1 == 1:
                        # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                        celChoice = str(random.choice(range(len(cel))))
                    else:
                        celChoice = random.choice(cel)
                    cutValue[c] = celChoice

                cutExprList = []
                for i, c in enumerate(cutValue):
                    if c is None:
                        continue
                    else:
                        # new ...ability to reference cols
                        # src[ src$age<17 && src$zip=95120 && ... , ]
                        # randomly pick == or !=
                        if random.randint(0, 1) == 0:
                            cutExprList.append('p$C' + str(i + 1) + '!=' + c)
                        else:
                            cutExprList.append('p$C' + str(i + 1) + '==' + c)

                cutExpr = ' & '.join(cutExprList)
                # print "cutExpr:", cutExpr

                # just extract one output col (the first one)
                rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1)
                # print "rowExpr:", rowExpr
                print rowExpr
                rowExprList.append(rowExpr)

            # CREATE DATASET*******************************************
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname,
                              rowCount,
                              iColCount,
                              oColCount,
                              SEEDPERFILE,
                              colEnumList=colEnumList)

            # PARSE*******************************************************

            src_key = csvFilename
            # stage the same csv under ten src keys so the glob parse below sees ten files
            for keyPrefix in 'ABCDEFGHIJ':
                parseResult = h2i.import_only(path=csvPathname,
                                              schema='put',
                                              src_key=keyPrefix + src_key,
                                              timeoutSecs=200)

            parseResult = h2i.parse_only(pattern='*' + src_key,
                                         hex_key=hex_key,
                                         timeoutSecs=800)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            pNumRows = inspect['numRows']
            pNumCols = inspect['numCols']
            # print h2o.dump_json(inspect)
            levels = h2o.nodes[0].levels(source=hex_key)
            print "levels result:", h2o.dump_json(levels)

            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # error if any col has constant values
            if len(constantValuesDict) != 0:
                raise Exception(
                    "Probably got a col NA'ed and constant values as a result %s"
                    % constantValuesDict)

            # INIT all possible key names used***************************
            # remember. 1 indexing!

            # is this needed?
            if 1 == 1:
                a = 'a=c(1,2,3);' + ';'.join(
                    ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)])
                print a
                for eKey in eKeys:
                    # build up the columns
                    e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey),
                                                print_params=False)
                    ## print h2o.dump_json(e)

            xList = []
            eList = []
            fList = []
            for repeat in range(CUT_LOOP_CNT):
                # EXEC*******************************************************
                # don't use exec_expr to avoid issues with Inspect following etc.
                randICol = random.randint(0, iColCount - 1)
                randOCol = random.randint(iColCount, iColCount + oColCount - 1)

                # should be two different keys in the sample
                e = random.sample(eKeys, 2)
                fKey = e[0]
                eKey = e[1]

                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" %
                                        (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

                if numRows == 0 or numCols != colCount:
                    h2p.red_print("Warning: Cut resulted in", numRows,
                                  "rows and", numCols,
                                  "cols. Quantile will abort")

                # QUANTILE*******************************************************
                quantile = 0.5 if DO_MEDIAN else .999
                # first output col. always fed by an exec cut, so 0?
                column = iColCount
                column = 0
                start = time.time()
                q = h2o.nodes[0].quantiles(source_key=fKey,
                                           column=column,
                                           quantile=quantile,
                                           max_qbins=MAX_QBINS,
                                           multiple_pass=MULTI_PASS)
                h2p.red_print("quantile", quantile, q['result'])
                elapsed = time.time() - start
                print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
                quantileTime = elapsed

                # remove all keys*******************************************************
                # what about hex_key?
                if 1 == 0:
                    start = time.time()
                    h2o.nodes[0].remove_all_keys()
                    elapsed = time.time() - start
                    print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

                trial += 1
                xList.append(trial)
                eList.append(execTime)
                fList.append(quantileTime)

        #****************************************************************
        # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET
        print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col"
        quantile = 0.5 if DO_MEDIAN else .999
        # first output col. always fed by an exec cut, so 0?
        column = iColCount
        start = time.time()
        q = h2o.nodes[0].quantiles(source_key=hex_key,
                                   column='C' + str(iColCount + 1),
                                   quantile=quantile,
                                   max_qbins=MAX_QBINS,
                                   multiple_pass=0)
        elapsed = time.time() - start
        h2p.red_print(
            hex_key, pNumRows,
            "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")",
            "one iteration", elapsed, "secs. threshold:", quantile,
            q['result'])
        print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.'
        quantileTime = elapsed

        #****************************************************************
        # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
        if DO_PLOT:
            xLabel = 'trial'
            eLabel = 'exec cut time'
            fLabel = 'quantile time'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList,
                              xLabel,
                              eListTitle,
                              eList,
                              eLabel,
                              fListTitle,
                              fList,
                              fLabel,
                              server=True)
Example #50
    def test_parse_multi_header_rand(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        headerChoices = ['aA','aB','aC','aD','aE','aF','aG','aH','aI', 'tomas']

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            # FIX! one fails count for now
            # (1, 5, 9, 'cA', 60, 0),
            (1, 5, 9, 'cA', 60, 0),
            (1, 5, 25, 'cA', 60, 0),

            # try with col mismatch on header. 
            # FIX! causes exception? don't test for now
            # (7, 300, 10, 'cA', 60, 0),
            # (7, 300, 10, 'cB', 60, 1),
            # (7, 300, 10, 'cC', 60, 2),
            # (7, 300, 10, 'cD', 60, 3),

            # (7, 300, 8, 'cA', 60, 0),
            # (7, 300, 8, 'cB', 60, 1),
            # (7, 300, 8, 'cC', 60, 2),
            # (7, 300, 8, 'cD', 60, 3),
            ]

        # so many random combos..rather than walk tryList, just do random for some amount of time
        for trial in range(50):
            (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
            print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            totalHeaderRows = 0

            HEADER_HAS_HDR_ROW = random.randint(0,1)
            DATA_HAS_HDR_ROW = random.randint(0,1)
            PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
            ## DATA_FIRST_IS_COMMENT = random.randint(0,1)
            ## HEADER_FIRST_IS_COMMENT = random.randint(0,1)
            print "TEMPORARY: don't put any comments in"
            DATA_FIRST_IS_COMMENT = 0
            HEADER_FIRST_IS_COMMENT = 0
            # none is not legal
            SEP_CHAR_GEN = random.choice(paramsDict['separator'])
            
            print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
            print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
            print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
            print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
            print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
            print 'SEP_CHAR_GEN:', SEP_CHAR_GEN

            # they need to both use the same separator (h2o rule)
            hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            print hh
            headerForHeader = SEP_CHAR_GEN.join(hh)
            # make these different
            hh = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
            headerForData   = SEP_CHAR_GEN.join(hh)

            # random selection of parse param choices
            kwargs = {}
            for k,v in paramsDict.items():
                aChoice = random.choice(v)
                # can tell h2o something different compared to what we actually used!
                if k == 'separator':
                    if aChoice: 
                        sepChar = aChoice
                        sepCharInt = ord(aChoice) # make it an integer for h2o
                    else:
                        sepChar = ',' # default char for None, need it for header/data file creation
                        sepCharInt = None
                    aChoice = sepCharInt

                kwargs[k] = aChoice

            # FOR NOW: ..override the rand choice if it exists, so we can parse and expect 'A' to be found
            # match what was gen'ed if choice is not None
            if kwargs['separator']:
                if SEP_CHAR_GEN==" "  or SEP_CHAR_GEN==",": # parse doesn't auto-detect tab. will autodetect space and comma
                    del kwargs['separator']
                else:
                    kwargs['separator'] = ord(SEP_CHAR_GEN)
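            # e.g. if SEP_CHAR_GEN is '\t', h2o is told ord('\t') == 9 explicitly;
            # space and comma are dropped from kwargs so h2o's autodetect handles them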
        
            # create data files
            for fileN in range(fileNum):
                csvFilename = 'syn_data_' + str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
                (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount, 
                    headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                    commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
                totalDataRows += dataRowsDone
                totalHeaderRows += headerRowsDone

            # create the header file
            hdrFilename = 'syn_header_' + str(SEED) + "_" + str(trial) + "_" + rowxcol + '.csv'
            hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
            # dataRowsWithHeader = 0 # temp hack
            (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader, 
                headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
                commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            if PARSE_PATTERN_INCLUDES_HEADER: # only include header file data rows if the parse pattern includes it
                totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = "syn_dst" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
            xs = h2o.nodes[0].import_files(SYNDATASETS_DIR)['keys']
            headerKey = [x for x in xs if hdrFilename in x][0]
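            # note: this picks a key whose name does NOT contain the *last* data file's
            # name; with a single data file (fileNum==1) that is the header file's key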
            dataKey = [x for x in xs if csvFilename not in x][0]

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key =", headerKey

            # put the right name in
            if kwargs['header_from_file'] == 'syn_header':
                kwargs['header_from_file'] = headerKey
            # use one of the data files?
            elif kwargs['header_from_file'] == 'syn_data':
                kwargs['header_from_file'] = dataKey

            # if there's no header in the header file, turn off the header_from_file
            if not HEADER_HAS_HDR_ROW:
                kwargs['header_from_file'] = None

            print "If header_from_file= is used, we are currently required to force header=1 for h2o"
            if kwargs['header_from_file']:
                kwargs['header'] =  1
            # if we have a header in a data file, tell h2o (for now)
            elif DATA_HAS_HDR_ROW:
                kwargs['header'] =  1
            else:
                kwargs['header'] =  0

            # may have error if h2o doesn't get anything!
            start = time.time()
            if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
                pattern = '*syn_*'+str(trial)+"_"+rowxcol+'*'
            else:
                pattern = '*syn_data_*'+str(trial)+"_"+rowxcol+'*'
            parseResult = h2o.nodes[0].parse(pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)

            print "parseResult['destination_key']: " + parseResult['destination_key']
            print 'parse time:', parseResult['response']['time']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # more reporting: (we can error here if extra col in header, causes all NA for missing col of data)
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], totalCols, \
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], totalCols))

            # do we end up parsing one data row as a header because of mismatch in gen/param?
            h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
            # header in data file gets treated as data
            h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
                DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
            print "h2oLosesOneData:", h2oLosesOneData
            print "h2oGainsOneData:", h2oGainsOneData
            ### print (headerRowsDone!=0), (kwargs['header']==1), DATA_HAS_HDR_ROW, (kwargs['header_from_file'] is not None)
            if h2oLosesOneData:
                totalDataRows -= 1
            if h2oGainsOneData:
                totalDataRows += 1
                
            self.assertEqual(inspect['num_rows'], totalDataRows,
                "parse created result with the wrong number of rows (header rows don't count) h2o: %s gen'ed: %s" % \
                (inspect['num_rows'], totalDataRows))

            # put in an ignore param, that will fail unless headers were parsed correctly
            # doesn't matter if the header got a comment, should see it
            h2oShouldSeeHeader = (HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)) or DATA_HAS_HDR_ROW
            if h2oShouldSeeHeader:
                kwargs = {'sample': 75, 'depth': 25, 'ntree': 1, 'ignore': 'A'}
            else:
                kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}

            start = time.time()
            # assumption: the RF model build was elided from this snippet; without it the
            # kwargs above go unused and elapsed is ~0, so run RF as the other tests do
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
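A worked example of the row accounting above, with hypothetical values: one data file (rowCount=5) written with a header row (DATA_HAS_HDR_ROW=1) and a header file contributing dataRowsWithHeader=0 data rows. totalDataRows starts at 5. If h2o treats the data file's header row as data (h2oGainsOneData), the expectation becomes 6 before the assertEqual on inspect['num_rows']; in the opposite scenario, where no header row was written but header=1 (h2oLosesOneData), h2o consumes a data row as the header and the expectation drops by one instead.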
Example #51
    def test_GLM2_ints_unbalanced(self):
        h2o.beta_features = True
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'xyz'
            kwargs = {
                'n_folds': 0,
                'destination_key': modelKey,
                'response': y, 
                'max_iter': 200, 
                'family': 'binomial',
                'alpha': 0, 
                'lambda': 0, 
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-5},
                # {'alpha': 0.25, 'lambda': 1e-4},
            ]


            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

                parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="B.hex",
                    timeoutSecs=30, separator=colSepInt)

                h2o_cmd.runScore(dataKey="B.hex", modelKey=modelKey, 
                    vactual='C' + str(y+1), vpredict=1, expectedAuc=0.6)
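The kwargs.update sweep above is the idiom these GLM tests use for trying hyperparameter variants; a minimal standalone sketch of just that pattern:

# base settings persist; each update overlays only the keys it names
kwargs = {'alpha': 0, 'lambda': 0, 'family': 'binomial'}
for updateDict in [{'alpha': 0.5, 'lambda': 1e-5}, {'alpha': 0.25, 'lambda': 1e-4}]:
    kwargs.update(updateDict)
    print kwargs   # family stays 'binomial' while alpha/lambda change each pass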
Example #52
    def test_parse_utf8_3(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if DEBUG:
            n = 20
        else:
            n = 10000
            n = 1000
            n = 500

        # from command line arg -long
        if h2o.long_test_case:
            repeat = 1000 
        else:
            repeat = 50

        scale = 1
        tryList = [
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
            (n, 3, 'cI', 300), 
        ]

        for r in range(repeat):
            for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # using the comma is nice to ensure no craziness
                colSepHexString = '2c' # comma
                colSepChar = colSepHexString.decode('hex')
                colSepInt = int(colSepHexString, base=16)
                print "colSepChar:", colSepChar

                rowSepHexString = '0a' # newline
                rowSepChar = rowSepHexString.decode('hex')
                print "rowSepChar:", rowSepChar

                csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # same enum list/mapping, but different dataset?
                start = time.time()
                write_syn_dataset(csvPathname, rowCount, colCount, scale=1,
                    colSepChar=colSepChar, rowSepChar=rowSepChar, SEED=SEEDPERFILE)
                elapsed = time.time() - start
                print "took %s seconds to create %s" % (elapsed, csvPathname)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, header=0,
                    timeoutSecs=60, separator=colSepInt, doSummary=DO_SUMMARY)
                print "Parse result['destination_key']:", parseResult['destination_key']
                
                inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                # Each column should get .10 random NAs per iteration. Within 10%?
                missingValuesList = h2o_cmd.infoFromInspect(inspect)
                # print "missingValuesList", missingValuesList
                # for mv in missingValuesList:
                #     self.assertAlmostEqual(mv, expectedNA, delta=0.1 * mv, 
                #        msg='mv %s is not approx. expected %s' % (mv, expectedNA))
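                # if re-enabled, note expectedNA is not defined anywhere in this snippet;
                # a plausible value (an assumption) given the ".10 random NAs" note above:
                # expectedNA = 0.10 * rowCount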

                # might have extra rows
                if numRows!=rowCount:
                    raise Exception("Expect numRows %s = rowCount %s because guaranteed not to have extra eols" % \
                        (numRows, rowCount))
                # numCols should be right?
                self.assertEqual(colCount, numCols)

                (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                    h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], 
                    exceptionOnMissingValues=False)
Example #53
    def test_benchmark_import(self):
        # typical size of the michal files
        avgMichalSizeUncompressed = 237270000 
        avgMichalSize = 116561140 
        avgSynSize = 4020000
        covtype200xSize = 15033863400
        synSize =  183
        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*.dat.gz", "file_200.dat.gz", 1200 * avgMichalSize, 1800),
                # ("*[1][0-2][0-9].dat.gz", "file_30.dat.gz", 50 * avgMichalSize, 1800), 
                ("*file_[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                ("*file_[12][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[34][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[56][0-9][0-9].dat.gz", "file_200_C.dat.gz", 200 * avgMichalSize, 1800), 
                ("*file_[78][0-9][0-9].dat.gz", "file_200_D.dat.gz", 200 * avgMichalSize, 1800), 
                # ("*.dat.gz", "file_1200.dat.gz", 1200 * avgMichalSize, 3600),
            ]

        if 1==1:
            importFolderPath = '/home/0xdiag/datasets/more1_1200_link'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                # ("*10[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 3600), 
                # ("*1[0-4][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 3600), 
                # ("*[1][0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600), 
                # ("*3[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 3600),
                # ("*1[0-9][0-9].dat.gz", "file_100.dat.gz", 100 * avgMichalSize, 1800), 
                #("*[1-2][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600), 
                # ("*[3-4][0-9][0-9].dat.gz", "file_200.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_A.dat.gz", 100 * avgMichalSize, 3600),
                ("*[3-4][0-4][0-9].dat.gz", "file_100_B.dat.gz", 100 * avgMichalSize, 3600),

                ("*[3-4][0-5][0-9].dat.gz", "file_120_A.dat.gz", 120 * avgMichalSize, 3600),
                ("*[3-4][0-5][0-9].dat.gz", "file_120_B.dat.gz", 120 * avgMichalSize, 3600),

                ("*[3-4][0-6][0-9].dat.gz", "file_140_A.dat.gz", 140 * avgMichalSize, 3600),
                ("*[3-4][0-6][0-9].dat.gz", "file_140_B.dat.gz", 140 * avgMichalSize, 3600),

                ("*[3-4][0-7][0-9].dat.gz", "file_160_A.dat.gz", 160 * avgMichalSize, 3600),
                ("*[3-4][0-7][0-9].dat.gz", "file_160_B.dat.gz", 160 * avgMichalSize, 3600),

                ("*[3-4][0-8][0-9].dat.gz", "file_180_A.dat.gz", 180 * avgMichalSize, 3600),
                ("*[3-4][0-8][0-9].dat.gz", "file_180_B.dat.gz", 180 * avgMichalSize, 3600),

                ("*[3-4][0-9][0-9].dat.gz", "file_200_A.dat.gz", 200 * avgMichalSize, 3600),
                ("*[3-4][0-9][0-9].dat.gz", "file_200_B.dat.gz", 200 * avgMichalSize, 3600),

                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*[3-5][0-9][0-9].dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                # for now, take too long on 2x100GB heap on 164
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
                # ("*[3-6][0-9][0-9].dat.gz", "file_400.dat.gz", 400 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home/0xdiag/datasets/manyfiles-nflx-gz'
            print "Using .gz'ed files in", importFolderPath
            csvFilenameAll = [
                # this should hit the "more" files too?
                ("*_[123][0-9][0-9]*.dat.gz", "file_300.dat.gz", 300 * avgMichalSize, 3600),
                ("*_[1][5-9][0-9]*.dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 3600),
            ]

        if 1==0:
            importFolderPath = '/home2/0xdiag/datasets'
            print "Using non-.gz'ed files in", importFolderPath
            csvFilenameAll = [
                # I use different files to avoid OS caching effects
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                ("manyfiles-nflx/file_[0-9][0-9]*.dat", "file_100.dat", 100 * avgMichalSizeUncompressed, 700),
                # ("onefile-nflx/file_1_to_100.dat", "file_single.dat", 100 * avgMichalSizeUncompressed, 1200),
                # ("manyfiles-nflx/file_1.dat", "file_1.dat", 1 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[2][0-9].dat", "file_10.dat", 10 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[34][0-9].dat", "file_20.dat", 20 * avgMichalSizeUncompressed, 700),
                # ("manyfiles-nflx/file_[5-9][0-9].dat", "file_50.dat", 50 * avgMichalSizeUncompressed, 700),
            ]
        if 1==0: 
            importFolderPath = '/home/0xdiag/datasets/standard'
            print "Using .gz'ed files in", importFolderPath
            # all exactly the same prior to gzip!
            # could use this, but remember import folder -> import folder s3 for jenkins?
            # how would it get it right?
            # os.path.getsize(f)
            csvFilenameAll = [
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 700),
                # 100 files takes too long on two machines?
                # ("covtype200x.data", "covtype200x.data", 15033863400, 700),
                # I use different files to avoid OS caching effects
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[0-9][0-9]", "syn_100.csv", 100 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_00000", "syn_1.csv", avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_0001[0-9]", "syn_10.csv", 10 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[23][0-9]", "syn_20.csv", 20 * avgSynSize, 700),
                # ("syn_datasets/syn_7350063254201195578_10000x200.csv_000[45678][0-9]", "syn_50.csv", 50 * avgSynSize, 700),
                # ("manyfiles-nflx-gz/file_10.dat.gz", "file_10_1.dat.gz", 1 * avgMichalSize, 700),
                # ("manyfiles-nflx-gz/file_1[0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),

                ("manyfiles-nflx-gz/file_1.dat.gz", "file_1.dat.gz", 1 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[2][0-9].dat.gz", "file_10.dat.gz", 10 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[34][0-9].dat.gz", "file_20.dat.gz", 20 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[5-9][0-9].dat.gz", "file_50.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_1[0-9][0-9].dat.gz", "file_100.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12][0-9][0-9].dat.gz", "file_200.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_[12]?[0-9][0-9].dat.gz", "file_300.dat.gz", 50 * avgMichalSize, 700),
                ("manyfiles-nflx-gz/file_*.dat.gz", "file_384.dat.gz", 100 * avgMichalSize, 1200),
                ("covtype200x.data", "covtype200x.data", covtype200xSize, 700),

                # do it twice
                # ("covtype.data", "covtype.data"),
                # ("covtype20x.data", "covtype20x.data"),
                # "covtype200x.data",
                # "100million_rows.csv",
                # "200million_rows.csv",
                # "a5m.csv",
                # "a10m.csv",
                # "a100m.csv",
                # "a200m.csv",
                # "a400m.csv",
                # "a600m.csv",
                # "billion_rows.csv.gz",
                # "new-poker-hand.full.311M.txt.gz",
                ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
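        # Added sketch (assumption: Python's fnmatch approximates the server-side
        # glob semantics): preview how many files each pattern would select.
        # Disabled by default, like the alternative dataset blocks above.
        if 1==0:
            import fnmatch
            candidates = ['file_%d.dat.gz' % n for n in range(1, 385)]
            for (pat, dummyName, dummyBytes, dummySecs) in csvFilenameList:
                matchCount = len([f for f in candidates if fnmatch.fnmatch(f, pat)])
                print pat, "would match", matchCount, "of", len(candidates), "candidate names"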

        # each tuple above splits out the pattern match and the filename used for the hex key
        trialMax = 1
        # rebuild the cloud for each file
        base_port = 54321
        tryHeap = 28
        # can fire a parse off and go wait on the jobs queue (inspect afterwards is enough?)
        DO_GLM = False
        noPoll = False
        # benchmarkLogging = ['cpu','disk', 'iostats', 'jstack']
        # benchmarkLogging = None
        benchmarkLogging = ['cpu','disk', 'network', 'iostats', 'jstack']
        benchmarkLogging = ['cpu','disk', 'network', 'iostats']
        # IOStatus can hang?
        benchmarkLogging = ['cpu', 'disk', 'network']
        pollTimeoutSecs = 120
        retryDelaySecs = 10

        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails' + ' -Dh2o.find-ByteBuffer-leaks'
        jea = '-XX:MaxDirectMemorySize=512m -XX:+PrintGCDetails'
        jea = "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC"
        jea = ' -Dcom.sun.management.jmxremote.port=54330' + \
              ' -Dcom.sun.management.jmxremote.authenticate=false' + \
              ' -Dcom.sun.management.jmxremote.ssl=false'  + \
              ' -Dcom.sun.management.jmxremote' + \
              ' -Dcom.sun.management.jmxremote.local.only=false'
        jea = ' -Dlog.printAll=true'
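        # NOTE: only the last jea assignment above is live; the earlier ones are
        # kept as ready-made alternatives. java_extra_args=jea is commented out in
        # the build_cloud calls below, so none of them is passed at the moment.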


        for i, (csvFilepattern, csvFilename, totalBytes, timeoutSecs) in enumerate(csvFilenameList):
            localhost = h2o.decide_if_localhost()
            if (localhost):
                h2o.build_cloud(2, java_heap_GB=tryHeap, base_port=base_port,
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            else:
                h2o_hosts.build_cloud_with_hosts(base_port=base_port, 
                    # java_extra_args=jea,
                    enable_benchmark_log=True)

            # pop open a browser on the cloud
            ### h2b.browseTheCloud()

            # to avoid sticky ports?
            ### base_port += 2

            for trial in range(trialMax):
                importFolderResult = h2i.setupImportFolder(None, importFolderPath)
                importFullList = importFolderResult['files']
                importFailList = importFolderResult['fails']
                print "\n Problem if this is not empty: importFailList:", h2o.dump_json(importFailList)
                # creates csvFilename.hex from file in importFolder dir 

                h2o.cloudPerfH2O.change_logfile(csvFilename)
                h2o.cloudPerfH2O.message("")
                h2o.cloudPerfH2O.message("Parse " + csvFilename + " Start--------------------------------")
                start = time.time()
                parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                    key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                    retryDelaySecs=retryDelaySecs,
                    pollTimeoutSecs=pollTimeoutSecs,
                    noPoll=noPoll,
                    benchmarkLogging=benchmarkLogging)

                if noPoll:
                    # default the extra byte counts so the MB/sec accounting below
                    # can't hit a NameError when fewer than two more files remain
                    totalBytes2 = 0
                    totalBytes3 = 0
                    if (i+1) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes2, timeoutSecs) = csvFilenameList[i+1]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                    if (i+2) < len(csvFilenameList):
                        time.sleep(1)
                        h2o.check_sandbox_for_errors()
                        (csvFilepattern, csvFilename, totalBytes3, timeoutSecs) = csvFilenameList[i+2]
                        parseKey = h2i.parseImportFolderFile(None, csvFilepattern, importFolderPath, 
                            key2=csvFilename + ".hex", timeoutSecs=timeoutSecs, 
                            retryDelaySecs=retryDelaySecs,
                            pollTimeoutSecs=pollTimeoutSecs,
                            noPoll=noPoll,
                            benchmarkLogging=benchmarkLogging)

                elapsed = time.time() - start
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                # print stats on all three if noPoll
                if noPoll:
                    # does it take a little while to show up in Jobs, from where we issued the parse?
                    time.sleep(2)
                    # FIX! use the last (biggest?) timeoutSecs? maybe should increase since parallel
                    h2o_jobs.pollWaitJobs(pattern=csvFilename,
                        timeoutSecs=timeoutSecs, benchmarkLogging=benchmarkLogging)
                    # for getting the MB/sec closer to 'right'
                    if totalBytes is not None:
                        totalBytes += totalBytes2 + totalBytes3
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()


                if totalBytes is not None:
                    fileMBS = (totalBytes/1e6)/elapsed
                    l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, fileMBS, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)
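                    # e.g. 100 gz'ed files ~ 100 * 116561140 bytes ~ 11656 MB;
                    # a 120 sec parse would then report ~97 MB/sec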

                print csvFilepattern, 'parse time:', parseKey['response']['time']
                print "Parse result['destination_key']:", parseKey['destination_key']

                # BUG here?
                if not noPoll:
                    # We should be able to see the parse result?
                    h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=False)

                        
                # the nflx data doesn't have a small enough number of classes in any col
                # use exec to slice out 200 rows for a quick RF; that should work for everyone
                origKey = parseKey['destination_key']
                # execExpr = 'a = randomFilter('+origKey+',200,12345678)' 
                execExpr = 'a = slice('+origKey+',1,200)' 
                h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
                # runRFOnly takes the parseKey directly
                newParseKey = {'destination_key': 'a'}
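                # (a minimal stand-in parse result that points RF at the sliced key 'a')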

                print "\n" + csvFilepattern
                # poker and the water.UDP.set3(UDP.java) fail issue..
                # constrain depth to 25
                print "Temporarily hacking to do nothing instead of RF on the parsed file"
                ### RFview = h2o_cmd.runRFOnly(trees=1,depth=25,parseKey=newParseKey, timeoutSecs=timeoutSecs)
                ### h2b.browseJsonHistoryAsUrlLastMatch("RFView")

                #**********************************************************************************
                # Do GLM too
                # Argument case error: Value 0.0 is not between 12.0 and 9987.0 (inclusive)
                if DO_GLM:
                    # these are all the columns that are enums in the dataset...too many for GLM!
                    x = range(542) # all columns; enums and the output are removed below
                    # remove the output too! (378)
                    # use a name other than 'i' so the enumerate() index of the
                    # outer file loop isn't clobbered
                    for c in [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 16, 17, 18, 19, 20, 424, 425, 426, 540, 541, 378]:
                        x.remove(c)
                    x = ",".join(map(str,x))

                    GLMkwargs = {'x': x, 'y': 378, 'case': 15, 'case_mode': '>',
                        'max_iter': 10, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5}
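                    # case/case_mode binarize the response for the old binomial GLM:
                    # rows where column 378 > 15 become the positive class (our
                    # reading of the GLM1 case semantics; an assumption)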
                    start = time.time()
                    glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **GLMkwargs)
                    h2o_glm.simpleCheckGLM(self, glm, None, **GLMkwargs)
                    elapsed = time.time() - start
                    h2o.check_sandbox_for_errors()
                    l = '{:d} jvms, {:d}GB heap, {:s} {:s} GLM: {:6.2f} secs'.format(
                        len(h2o.nodes), h2o.nodes[0].java_heap_GB, csvFilepattern, csvFilename, elapsed)
                    print l
                    h2o.cloudPerfH2O.message(l)

                #**********************************************************************************

                h2o_cmd.checkKeyDistribution()
                h2o_cmd.deleteCsvKey(csvFilename, importFolderResult)
                ### time.sleep(3600)
                h2o.tear_down_cloud()
                if not localhost:
                    print "Waiting 30 secs before building cloud again (sticky ports?)"
                    ### time.sleep(30)

                sys.stdout.write('.')
                sys.stdout.flush() 
Example #54
0
    def test_GLM2_enums_score_superset(self):
        h2o.beta_features = True
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
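            # '2c'.decode('hex') yields ',' in Python 2; colSepInt (0x2c == 44) is
            # the numeric form passed to the parser's separator argument below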
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
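            # create_enum_list is a helper defined elsewhere in this test module;
            # a minimal sketch (an assumption, for illustration only):
            #     def create_enum_list(listSize=10):
            #         return [''.join(random.choice(string.letters) for _ in range(4))
            #                 for _ in range(listSize)]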
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            # add an extra enum for scoring that's not in the model enumList
            enumListForScore.append("xyzzy")

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            modelKey = 'enums'
            kwargs = {
                'destination_key': modelKey,
                'response': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'family': 'binomial'
            }

            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=180,
                                 **kwargs)
            print "glm end on ", parseResult[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            scoreDataKey = "score_" + hex_key
            parseResult = h2i.import_parse(path=csvScorePathname,
                                           schema='put',
                                           hex_key=scoreDataKey,
                                           timeoutSecs=30,
                                           separator=colSepInt)

            # Score *******************************
            # this messes up if you use case_mode/case_value above
            predictKey = 'Predict.hex'
            start = time.time()

            predictResult = h2o_cmd.runPredict(data_key=scoreDataKey,
                                               model_key=modelKey,
                                               destination_key=predictKey,
                                               timeoutSecs=timeoutSecs)

            # just get a predict and AUC on the same data. has to be binomial result
            resultAUC = h2o.nodes[0].generate_auc(thresholds=None,
                                                  actual=scoreDataKey,
                                                  predict='Predict.hex',
                                                  vactual=y,
                                                  vpredict=1)
            auc = resultAUC['AUC']
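            # the synthetic enum data is random, so the model should carry no real
            # signal; AUC is expected to land near 0.5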
            self.assertAlmostEqual(
                auc,
                0.5,
                delta=0.15,
                msg="actual auc: %s not close enough to 0.5" % auc)

            predictCMResult = h2o.nodes[0].predict_confusion_matrix(
                actual=scoreDataKey,
                predict=predictKey,
                vactual='C' + str(y + 1),
                vpredict='predict',
            )
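            # fvec parses name columns C1..Cn (1-based), which is why the response
            # column index y maps to vactual='C' + str(y + 1) above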

            cm = predictCMResult['cm']

            # These will move into the h2o_gbm.py
            pctWrong = h2o_gbm.pp_cm_summary(cm)

            print "\nTest\n==========\n"
            print h2o_gbm.pp_cm(cm)