Пример #1
0
def glm_score(self, csvFilename, csvPathname, modelKey, thresholds="0.5",
    timeoutSecs=30, pollTimeoutSecs=30):
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, 
        timeoutSecs=timeoutSecs, pollTimeoutSecs=pollTimeoutSecs)
    y = "10"
    x = ""
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, thresholds="0.5",
        timeoutSecs=timeoutSecs)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, 
    # the results
    # should be similar?
    # UPDATE: format for returning results is slightly different than normal GLM
    validation = glmScore['validation']
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validation, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validation)
Пример #2
0
def glm_score(self, csvFilename, csvPathname, modelKey, timeoutSecs=3):
    print "\nStarting GLM score of", csvFilename
    key2 = csvFilename + ".hex"
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=10)
    y = "10"
    x = ""
    kwargs = {'x': x, 'y':  y, 'case': -1, 'thresholds': 0.5}

    start = time.time()
    glmScore = h2o_cmd.runGLMScore(key=key2, model_key=modelKey, timeoutSecs=timeoutSecs)
    print "GLMScore in",  (time.time() - start), "secs (python)"
    h2o.verboseprint(h2o.dump_json(glmScore))
    ### h2o_glm.simpleCheckGLM(self, glm, 7, **kwargs)

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    GLMModel = glmScore['GLMModel']
    validationsList = glmScore['GLMModel']['validations']
    validations = validationsList[0]
    if self.validations1:
        h2o_glm.compareToFirstGlm(self, 'err', validations, self.validations1)
    else:
        self.validations1 = copy.deepcopy(validations)
Пример #3
0
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, 
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            kwargs = {
                'y': y, 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 10, 
                'alpha': 0, 
                'lambda': 0, 
                'thresholds': 0.5,
                # 'case_mode': '=', 
                # 'case': 0,
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-8},
                {'alpha': 0.5, 'lambda': 0.0},
                {'alpha': 0.0, 'lambda': 0.0},
            ]

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                GLMModel = glm['GLMModel']
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel['iterations']
                modelKey = GLMModel['model_key']

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iterations > 20:
                    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)


                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum
                glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
                    model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
                print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
Пример #4
0
    def test_c10_rel_glm(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        csvFilename = 'classification1Train.txt'
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices
        
        # since we're no long zero based, increment by 1
        x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]

        x = ['C' + str(i + 1) for i in x_from_zero]
        y = 0
        # GLM Train***********************************************************
        keepPattern = None
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "from goodX (not used) x:", x
        print "y:", y

        # have to use named cols, and they start with 1
        


        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'thresholds': 0.5,
            'n_folds': 1,
            'weight': 100,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Parse Test***********************************************************
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        csvFilename = 'classification1Test.txt'
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # GLMScore Test***********************************************************
        start = time.time()
        # score with same dataset (will change to recreated dataset with one less enum
        glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
            model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
Пример #5
0
    def test_GLM_enums_score_subset(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 1,
                'n_folds': 1,
                'alpha': 0.2,
                'lambda': 1e-5,
                'case_mode': '=',
                'case': 0
            }
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            parseKey = h2o_cmd.parseFile(None,
                                         csvScorePathname,
                                         key2="score_" + key2,
                                         timeoutSecs=30,
                                         separator=colSepInt)

            start = time.time()
            # score with same dataset (will change to recreated dataset with one less enum
            glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'],
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=timeoutSecs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc
Пример #6
0
    def test_GLM_mnist_reals(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz",    600), 
        ]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + testCsvFilename, schema='put', 
                hex_key=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="mnist/" + trainCsvFilename,
                hex_key=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x, 
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }

            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Пример #7
0
    def test_GLM_mnist_s3n(self):
        URI = "s3n://home-0xdiag-datasets/mnist/"
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_testing.csv.gz",  "mnist_training.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_training.csv.gz",    600), 
        ]
        # IMPORT**********************************************
        importHDFSResult = h2o.nodes[0].import_hdfs(URI)
        ### print "importHDFSResult:", h2o.dump_json(importHDFSResult)
        s3nFullList = importHDFSResult['succeeded']
        ### print "s3nFullList:", h2o.dump_json(s3nFullList)

        self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?")

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            s3nKey = URI + testCsvFilename
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey, testKey2,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # PARSE train****************************************
            s3nKey = URI + trainCsvFilename
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey, trainKey2,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']


            # GLM****************************************
            y = 0 # first column is pixel value
            print "y:"
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x, 
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            kwargs = {'x': x, 'y':  y, 'thresholds': 0.5}
            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                timeoutSecs=60)
            print "GLMScore in",  (time.time() - start), "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o.verboseprint(h2o.dump_json(glmScore))
    def test_GLM_enums_score_superset(self):
        """Build a GLM on a synthetic enum dataset, then score a dataset whose
        enums include a value ("xyzzy") the model never saw.

        Runs once per (rows, cols, key, timeout) case and prints classErr,
        err and auc from the scoring validation.

        Raises:
            Exception: if the enum check on the parsed training data returns
                a non-empty result.
        """
        print "FIX!: this should cause an error. We should detect that it's not causing an error/warning?"
        SYNDATASETS_DIR = h2o.make_syn_dir()

        rows = 200
        cases = [
            (rows, 1, 'cD', 300),
            (rows, 2, 'cE', 300),
            (rows, 3, 'cF', 300),
            (rows, 4, 'cG', 300),
            (rows, 5, 'cH', 300),
            (rows, 6, 'cI', 300),
        ]

        # Comma column separator and newline row separator, spelled as hex;
        # a plain comma keeps the parse unambiguous.
        colSepHex = '2c'
        rowSepHex = '0a'
        colSepChar = colSepHex.decode('hex')
        colSepInt = int(colSepHex, base=16)
        rowSepChar = rowSepHex.decode('hex')

        for (numRows, numCols, hexKey, timeoutSecs) in cases:
            print "colSepChar:", colSepChar
            print "rowSepChar:", rowSepChar

            seed = random.randint(0, sys.maxint)
            dims = str(numRows) + 'x' + str(numCols)
            trainName = 'syn_enums_' + dims + '.csv'
            trainPath = SYNDATASETS_DIR + '/' + trainName
            scoreName = 'syn_enums_score_' + dims + '.csv'
            scorePath = SYNDATASETS_DIR + '/' + scoreName

            allEnums = create_enum_list(listSize=10)
            # half of the model's enums plus one extra value the model has
            # never seen
            scoreEnums = random.sample(allEnums, 5) + ["xyzzy"]

            print "Creating random", trainPath, "for glm model building"
            write_syn_dataset(trainPath, allEnums, numRows, numCols, seed,
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", scorePath, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(scorePath, scoreEnums, numRows, numCols, seed,
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            trainParse = h2o_cmd.parseFile(None, trainPath, key2=hexKey,
                timeoutSecs=30, separator=colSepInt)
            print trainName, 'parse time:', trainParse['response']['time']
            print "Parse result['destination_key']:", trainParse['destination_key']

            print "\n" + trainName
            naColumns = h2o_cmd.check_enums_from_inspect(trainParse)
            if naColumns:
                described = ", ".join(
                    str(col) + ":" + str(count) for (col, count) in naColumns.iteritems())
                raise Exception("Looks like columns got flipped to NAs: " + described)

            glmArgs = {
                'case': 0,
                'case_mode': '=',
                'lambda': 1e-5,
                'alpha': 0.2,
                'n_folds': 1,
                'max_iter': 1,
                'y': numCols,
            }
            began = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=trainParse, timeoutSecs=timeoutSecs,
                pollTimeoutSecs=180, **glmArgs)
            print "glm end on ", trainParse['destination_key'], 'took', time.time() - began, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **glmArgs)

            modelKey = glm['GLMModel']['model_key']

            scoreParse = h2o_cmd.parseFile(None, scorePath, key2="score_" + hexKey,
                timeoutSecs=30, separator=colSepInt)

            began = time.time()
            # score the superset dataset with the model built above
            glmScore = h2o_cmd.runGLMScore(key=scoreParse['destination_key'],
                model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
            print "glm end on ", scoreParse['destination_key'], 'took', time.time() - began, 'seconds'
            ### print h2o.dump_json(glmScore)
            validation = glmScore['validation']
            classErr = validation['classErr']
            auc = validation['auc']
            err = validation['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc
Пример #9
0
    def test_GLM_mnist_s3n(self):
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
            ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600),
            ("mnist_training.csv.gz", "mnist_training.csv.gz", 600),
        ]

        importFolderPath = "mnist"
        csvPathname = importFolderPath + "/*"
        (importHDFSResult,
         importPattern) = h2i.import_only(bucket='home-0xdiag-datasets',
                                          path=csvPathname,
                                          schema='s3n',
                                          timeoutSecs=120)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1,
                           "Should see more than 1 files in s3n?")

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testHexKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='s3n',
                                           hex_key=testHexKey,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + trainCsvFilename
            trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='s3n',
                                           hex_key=trainHexKey,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # GLM****************************************
            y = 0  # first column is pixel value
            print "y:"
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseResult['destination_key'],
                                            timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x,
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=60,
                                 **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            kwargs = {'x': x, 'y': y, 'thresholds': 0.5}
            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testHexKey,
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=60)
            print "GLMScore in",  (time.time() - start), "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o.verboseprint(h2o.dump_json(glmScore))
Пример #10
0
    def test_GLM_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList), 1,
                           "Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 testCsvFilename,
                                                 importFolderPath,
                                                 key2=testKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None,
                                                 trainCsvFilename,
                                                 importFolderPath,
                                                 key2=trainKey2,
                                                 timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)
            print "x:", x

            params = {
                'x': x,
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }

            for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                         timeoutSecs=timeoutSecs,
                                         pollTimeoutsecs=60,
                                         **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2,
                                               model_key=modelKey,
                                               thresholds="0.5",
                                               timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Пример #11
0
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Parse Test***********************************************************
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        csvFilename = 'classification1Test.txt'
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # GLMScore Test***********************************************************
        start = time.time()
        # score with same dataset (will change to recreated dataset with one less enum
        glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
            model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
         

# Call h2o.unit_main() only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    h2o.unit_main()
Пример #12
0
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x,
          testFilehex, row):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug: pre = "DEBUG"
    glmbenchcsv = 'benchmarks/' + build + '/' + pre + 'glmbench.csv'
    if not os.path.exists(glmbenchcsv):
        output = open(glmbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(glmbenchcsv, 'a')
    csvWrt = csv.DictWriter(output,
                            fieldnames=csv_header,
                            restval=None,
                            dialect='excel',
                            extrasaction='ignore',
                            delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in [
                'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x',
                'AllBedroomsTrain100x'
        ]):
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=5,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)

        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'],
                                             timeoutSecs=7200)
        inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)

        nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
        row.update({
            'h2o_build': build,
            'nMachines': nMachines,
            'nJVMs': len(h2o.nodes),
            'Xmx/JVM': java_heap_GB,
            'dataset': f,
            'nTrainRows': inspect_train['num_rows'],
            'nTestRows': inspect_test['num_rows'],
            'nCols': inspect_train['num_cols'],
            'trainParseWallTime': parseWallTime,
            'nfolds': nfolds,
            'family': family,
        })

        params = {
            'y': y,
            'x': x,
            'family': family,
            'link': link,
            'lambda': lambda_,
            'alpha': alpha,
            'n_folds': nfolds,
            'case_mode': "n/a",
            'destination_key': "GLM(" + f + ")",
            'expert_settings': 0,
        }

        kwargs = params.copy()
        glmStart = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult,
                             timeoutSecs=7200,
                             **kwargs)
        glmTime = time.time() - glmStart
        row.update({
            'glmBuildTime': glmTime,
            #'AverageErrorOver10Folds'    : glm['GLMModel']['validations'][0]['err'],
        })

        glmScoreStart = time.time()
        glmScore = h2o_cmd.runGLMScore(key=testFilehex,
                                       model_key=params['destination_key'],
                                       timeoutSecs=1800)
        scoreTime = time.time() - glmScoreStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        if family == "binomial":
            row.update({
                'scoreTime': scoreTime,
                'AUC': glmScore['validation']['auc'],
                'AIC': glmScore['validation']['aic'],
                'error': glmScore['validation']['err'],
            })
        else:
            row.update({
                'scoreTime': scoreTime,
                'AIC': glmScore['validation']['aic'],
                'AUC': 'NA',
                'error': glmScore['validation']['err'],
            })
        csvWrt.writerow(row)
    finally:
        output.close()
Пример #13
0
    def test_GLM2_mnist(self):
        h2o.beta_features = True
        if DO_HDFS:
            importFolderPath = "mnist"
            bucket = None
            schema = 'hdfs'
        else:
            importFolderPath = "mnist"
            bucket = 'home-0xdiag-datasets'
            schema = 'local'

        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()

            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=testKey, timeoutSecs=timeoutSecs)
            
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 'C0' # first column is pixel value
            print "y:"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema=schema, hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            ignoreX = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
            print "ignoreX:", ignoreX 

            params = {
                'ignored_cols': ignoreX, 
                'response': y,
                'case_mode': '=',
                'case_val': 0,
                'family': 'binomial',
                'lambda': 0.5,
                'alpha': 1e-4,
                'max_iter': 5,
                ## 'thresholds': 0.5,
                ## 'weight': 1.0,
                'n_folds': 1,
                'beta_epsilon': 1.0E-4,
                }

            if DO_ALL_DIGITS:
                cases = [0,1,2,3,4,5,6,7,8,9]
            else:
                cases = [8]

            for c in cases:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case_val'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
    def test_GLM_enums_unbalanced(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 4, 'cF', 300), 
            (n, 8, 'cG', 300), 
            (n, 16, 'cH', 300), 
            (n, 32, 'cI', 300), 
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm2 model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating another random", csvScorePathname, "for glm2 scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            parseResult = h2i.import_parse(path=csvScorePathname, schema='put', hex_key="score_" + hex_key, 
                timeoutSecs=30, separator=colSepInt)

            y = colCount
            modelKey = 'glm_model'
            kwargs = {
                'standardize': 0,
                'destination_key': modelKey,
                'response': 'C' + str(y+1), 
                'max_iter': 200, 
                'family': 'binomial',
                'n_folds': 10, 
                'alpha': 0, 
                'lambda': 0, 
                # 'case_mode': '=', 
                # 'case': 0,
                }

            start = time.time()

            updateList= [ 
                {'alpha': 0.5, 'lambda': 1e-4},
                {'alpha': 0.25, 'lambda': 1e-6},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.5, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 1e-12},
                {'alpha': 0.0, 'lambda': 0},
            ]

            # Try each one
            h2o.beta_features = True
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                print "If we poll, we get a message saying it was cancelled by user??"
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5, errorIfCancelled=True)
                glm = h2o.nodes[0].glm_view(_modelKey=modelKey)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'

                glm_model = glm['glm_model']
                _names = glm_model['_names']
                modelKey = glm_model['_key']
                coefficients_names = glm_model['coefficients_names']
                submodels = glm_model['submodels'][0]

                beta = submodels['beta']
                norm_beta = submodels['norm_beta']
                iteration = submodels['iteration']

                validation = submodels['validation']

                if not validation or 'avg_err' not in validation:
                    raise Exception("glm: %s" % h2o.dump_json(glm) + \
                        "\nNo avg_err in validation." + \
                        "\nLikely if you look back, the job was cancelled, so there's no cross validation.")
        
                avg_err = validation['avg_err']
                auc = validation['auc']
                aic = validation['aic']
                null_deviance = validation['null_deviance']
                residual_deviance = validation['residual_deviance']

                print '_names', _names
                print 'coefficients_names', coefficients_names
                # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
                print 'beta', beta
                print 'iteration', iteration
                print 'avg_err', avg_err
                print 'auc', auc

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                if iterations > 20:
                    raise Exception("Why take so many iterations:  %s in this glm2 training?" % iterations)

                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum
                glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
                    model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
                print "glm2 end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "score classErr:", classErr
                print "score err:", err
                print "score auc:", auc
                print "score resDev:", resDev
                print "score nullDev:", nullDev

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
Пример #15
0
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre              = ""
    if debug: pre    = "DEBUG"
    glmbenchcsv      = 'benchmarks/'+build+'/'+pre+'glmbench.csv'
    if not os.path.exists(glmbenchcsv):
        output = open(glmbenchcsv,'w')
        output.write(','.join(csv_header)+'\n')
    else:
        output = open(glmbenchcsv,'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, 
                    dialect='excel', extrasaction='ignore',delimiter=',')
    try:
        java_heap_GB     = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
            csvPathname = importFolderPath + "/" + f + '.csv'
        else: 
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key         = f + '.hex'
        hK              = folderPath + "Header.csv"    
        headerPathname  = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey       = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets', 
                                       path             = csvPathname, 
                                       schema           = 'local', 
                                       hex_key          = hex_key,  
                                       header           = 1, 
                                       header_from_file = headerKey, 
                                       separator        = 44,
                                       timeoutSecs      = 7200,
                                       retryDelaySecs   = 5,
                                       pollTimeoutSecs  = 7200,
                                       doSummary        = False
                                      )

        parseWallTime  = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime ," seconds." 
        inspect_train  = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        inspect_test   = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)

        nMachines      = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
        row.update( {'h2o_build'          : build,
                     'nMachines'          : nMachines,
                     'nJVMs'              : len(h2o.nodes),
                     'Xmx/JVM'            : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['num_rows'],
                     'nTestRows'          : inspect_test['num_rows'],
                     'nCols'              : inspect_train['num_cols'],
                     'trainParseWallTime' : parseWallTime,
                     'nfolds'             : nfolds,
                     'family'             : family,
                    })

        params   =  {'y'                  : y,
                     'x'                  : x,
                     'family'             : family,
                     'link'               : link,
                     'lambda'             : lambda_,
                     'alpha'              : alpha,
                     'n_folds'            : nfolds,
                     'case_mode'          : "n/a",
                     'destination_key'    : "GLM("+f+")",
                     'expert_settings'    : 0,
                    }

        kwargs    = params.copy()
        glmStart  = time.time()
        glm       = h2o_cmd.runGLM(parseResult = parseResult, 
                                   timeoutSecs = 7200, 
                                   **kwargs)
        glmTime   = time.time() - glmStart
        row.update( {'glmBuildTime'       : glmTime,
                     #'AverageErrorOver10Folds'    : glm['GLMModel']['validations'][0]['err'],
                    })
        
        glmScoreStart = time.time()
        glmScore      = h2o_cmd.runGLMScore(key         = testFilehex,
                                            model_key   = params['destination_key'],
                                            timeoutSecs = 1800)
        scoreTime     = time.time() - glmScoreStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        if family == "binomial":
            row.update( {'scoreTime'          : scoreTime,
                         'AUC'                : glmScore['validation']['auc'],
                         'AIC'                : glmScore['validation']['aic'],
                         'error'              : glmScore['validation']['err'],
                        })
        else:
            row.update( {'scoreTime'          : scoreTime,
                         'AIC'                : glmScore['validation']['aic'],
                         'AUC'                : 'NA',
                         'error'              : glmScore['validation']['err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
Пример #16
0
    def test_GLM_mnist_reals(self):
        importFolderPath = "mnist"
        csvFilelist = [("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600)]
        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                path=csvPathname, schema="hdfs", hex_key=testKey, key2=testKey, timeoutSecs=timeoutSecs
            )
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0  # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            csvPathname = importFolderPath + "/" + trainCsvFilename
            parseResult = h2i.import_parse(
                path=csvPathname, schema="hdfs", hex_key=trainKey, key2=trainKey, timeoutSecs=timeoutSecs
            )
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )
            print "parse result:", parseResult["destination_key"]

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult["destination_key"], timeoutSecs=300)
            print "x:", x

            params = {
                "x": x,
                "y": y,
                "case_mode": "=",
                "case": 0,
                "family": "binomial",
                "lambda": 1.0e-5,
                "alpha": 0.0,
                "max_iter": 5,
                "thresholds": 0.5,
                "n_folds": 1,
                "weight": 1,
                "beta_epsilon": 1.0e-4,
            }

            for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs["case"] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm["GLMModel"]
                modelKey = GLMModel["model_key"]

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5", timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in", elapsed, "secs", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Пример #17
0
    def test_GLM_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, "cD", 300),
            (n, 2, "cE", 300),
            (n, 4, "cF", 300),
            (n, 8, "cG", 300),
            (n, 16, "cH", 300),
            (n, 32, "cI", 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = "2c"  # comma
            colSepChar = colSepHexString.decode("hex")
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = "0a"  # newline
            rowSepChar = rowSepHexString.decode("hex")
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_enums_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvScoreFilename = "syn_enums_score_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvScorePathname = SYNDATASETS_DIR + "/" + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(
                csvPathname, enumList, rowCount, colCount, SEEDPERFILE, colSepChar=colSepChar, rowSepChar=rowSepChar
            )

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(
                csvScorePathname,
                enumListForScore,
                rowCount,
                colCount,
                SEEDPERFILE,
                colSepChar=colSepChar,
                rowSepChar=rowSepChar,
            )

            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, separator=colSepInt
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse result['destination_key']:", parseResult["destination_key"]

            print "\n" + csvFilename
            (
                missingValuesDict,
                constantValuesDict,
                enumSizeDict,
                colTypeDict,
                colNameDict,
            ) = h2o_cmd.columnInfoFromInspect(parseResult["destination_key"], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                "y": y,
                "max_iter": 200,
                "family": "binomial",
                "n_folds": 10,
                "alpha": 0,
                "lambda": 0,
                "thresholds": 0.5,
                # 'case_mode': '=',
                # 'case': 0,
            }

            start = time.time()

            updateList = [
                {"alpha": 0.5, "lambda": 1e-4},
                {"alpha": 0.25, "lambda": 1e-6},
                {"alpha": 0.0, "lambda": 1e-8},
                {"alpha": 0.5, "lambda": 0.0},
                {"alpha": 0.0, "lambda": 0.0},
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
                print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"

                GLMModel = glm["GLMModel"]
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel["iterations"]
                modelKey = GLMModel["model_key"]

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                # if iterations > 20:
                #    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)

                parseResult = h2i.import_parse(
                    path=csvScorePathname, schema="put", hex_key="score_" + hex_key, timeoutSecs=30, separator=colSepInt
                )

                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum
                glmScore = h2o_cmd.runGLMScore(
                    key=parseResult["destination_key"], model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs
                )
                print "glm end on ", parseResult["destination_key"], "took", time.time() - start, "seconds"
                ### print h2o.dump_json(glmScore)
                classErr = glmScore["validation"]["classErr"]
                auc = glmScore["validation"]["auc"]
                err = glmScore["validation"]["err"]
                nullDev = glmScore["validation"]["nullDev"]
                resDev = glmScore["validation"]["resDev"]
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", nullDev)
Пример #18
0
    def test_GLM_mnist(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
        ]

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey = testCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=testKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey = trainCsvFilename + "_" + str(trial) + ".hex"
            csvPathname = importFolderPath + "/" + testCsvFilename
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=trainKey, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x, 
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }

            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Пример #19
0
    def test_c10_rel_glm(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        # Parse Train***********************************************************
        importFolderPath = '/mnt/0xcustomer-datasets/c3'
        csvFilename = 'classification1Train.txt'
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']
        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        # keepList = []
        # h2o_glm.findXFromColumnInfo(key=parseResult['destination_key'], keepList=keepList)
        # see README.txt in 0xcustomer-datasets/c3 for the col names to use in keepList above, to get the indices
        
        # since we're no long zero based, increment by 1
        x_from_zero = [6,7,8,10,12,31,32,33,34,35,36,37,40,41,42,43,44,45,46,47,49,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]

        x = ['C' + str(i + 1) for i in x_from_zero]
        y = 0
        # GLM Train***********************************************************
        keepPattern = None
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "from goodX (not used) x:", x
        print "y:", y

        # have to use named cols, and they start with 1
        


        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 10,
            'thresholds': 0.5,
            'n_folds': 1,
            'weight': 100,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
        elapsed = time.time() - start
        print "glm completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # Parse Test***********************************************************
        GLMModel = glm['GLMModel']
        modelKey = GLMModel['model_key']

        csvFilename = 'classification1Test.txt'
        csvPathname = importFolderPath + "/" + csvFilename
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        # GLMScore Test***********************************************************
        start = time.time()
        # score with same dataset (will change to recreated dataset with one less enum
        glmScore = h2o_cmd.runGLMScore(key=parseResult['destination_key'],
            model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
        print "glmScore end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
Пример #20
0
    def test_GLM_covtype_train(self):
        print "\nMichal will hate me for another file needed: covtype.shuffled.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None,
                                             csvFilename,
                                             importFolderPath,
                                             key2=key2,
                                             header=0,
                                             timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1

        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] +
                                                                1) + ")"
        h2o_exec.exec_expr(None,
                           execExpr,
                           resultKey=dataKeyTest,
                           timeoutSecs=10)

        kwargs = {
            'y': 54,
            'max_iter': 20,
            'n_folds': 0,
            'thresholds': 0.5,
            'alpha': 0.1,
            'lambda': 1e-5,
            'family': 'binomial',
            'case_mode': '=',
            'case': 2
        }
        timeoutSecs = 60

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(
                rowsToUse) + ")"
            h2o_exec.exec_expr(None,
                               execExpr,
                               resultKey=resultKey,
                               timeoutSecs=10)
            parseKey['destination_key'] = resultKey
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
            print "glm end on ", parseKey[
                'destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=dataKeyTest,
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=timeoutSecs)
            print "glmScore end on ", dataKeyTest, 'took', time.time(
            ) - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / num_rows), "pct. of all rows"
Пример #21
0
    def test_GLM_ints_unbalanced(self):
        ### h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 2000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 4, 'cF', 300),
            (n, 8, 'cG', 300),
            (n, 16, 'cH', 300),
            (n, 32, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c'  # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a'  # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list()
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList, 5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname,
                              enumList,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname,
                              enumListForScore,
                              rowCount,
                              colCount,
                              SEEDPERFILE,
                              colSepChar=colSepChar,
                              rowSepChar=rowSepChar)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           separator=colSepInt)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {
                'y': y,
                'max_iter': 200,
                'family': 'binomial',
                'n_folds': 10,
                'alpha': 0,
                'lambda': 0,
                'thresholds': 0.5,
                # 'case_mode': '=',
                # 'case': 0,
            }

            start = time.time()

            updateList = [
                {
                    'alpha': 0.5,
                    'lambda': 1e-4
                },
                {
                    'alpha': 0.25,
                    'lambda': 1e-6
                },
                {
                    'alpha': 0.0,
                    'lambda': 1e-8
                },
                {
                    'alpha': 0.5,
                    'lambda': 0.0
                },
                {
                    'alpha': 0.0,
                    'lambda': 0.0
                },
            ]

            # Try each one
            for updateDict in updateList:
                print "\n#################################################################"
                print updateDict
                kwargs.update(updateDict)
                glm = h2o_cmd.runGLM(parseResult=parseResult,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutSecs=180,
                                     **kwargs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'

                GLMModel = glm['GLMModel']
                # submodels0 = GLMModel['submodels'][0]
                iterations = GLMModel['iterations']
                modelKey = GLMModel['model_key']

                h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
                # if iterations > 20:
                #    raise Exception("Why take so many iterations:  %s in this glm training?" % iterations)

                parseResult = h2i.import_parse(path=csvScorePathname,
                                               schema='put',
                                               hex_key="score_" + hex_key,
                                               timeoutSecs=30,
                                               separator=colSepInt)

                start = time.time()
                # score with same dataset (will change to recreated dataset with one less enum
                glmScore = h2o_cmd.runGLMScore(
                    key=parseResult['destination_key'],
                    model_key=modelKey,
                    thresholds="0.5",
                    timeoutSecs=timeoutSecs)
                print "glm end on ", parseResult[
                    'destination_key'], 'took', time.time() - start, 'seconds'
                ### print h2o.dump_json(glmScore)
                classErr = glmScore['validation']['classErr']
                auc = glmScore['validation']['auc']
                err = glmScore['validation']['err']
                nullDev = glmScore['validation']['nullDev']
                resDev = glmScore['validation']['resDev']
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

                print "classErr:", classErr
                print "err:", err
                print "auc:", auc
                print "resDev:", resDev
                print "nullDev:", nullDev
                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                        "resDev:\t", validation['resDev'])
                    raise Exception(emsg)

                # what is reasonable?
                # self.assertAlmostEqual(err, 0.3, delta=0.15, msg="actual err: %s not close enough to 0.3" % err)
                # self.assertAlmostEqual(auc, 0.5, delta=0.15, msg="actual auc: %s not close enough to 0.5" % auc)

                if math.isnan(err):
                    emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", err)
                    raise Exception(emsg)

                if math.isnan(resDev):
                    emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                        "resDev:\t", resDev)
                    raise Exception(emsg)

                if math.isnan(nullDev):
                    emsg = "Why is this nullDev = 'nan'?? %6s %s" % (
                        "nullDev:\t", nullDev)
Пример #22
0
    def test_GLM_mnist_s3n(self):
        URI = "s3n://home-0xdiag-datasets/mnist/"
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600),
            ("mnist_testing.csv.gz", "mnist_training.csv.gz", 600),
            ("mnist_training.csv.gz", "mnist_training.csv.gz", 600),
        ]
        # IMPORT**********************************************
        importHDFSResult = h2o.nodes[0].import_hdfs(URI)
        ### print "importHDFSResult:", h2o.dump_json(importHDFSResult)
        s3nFullList = importHDFSResult['succeeded']
        ### print "s3nFullList:", h2o.dump_json(s3nFullList)

        self.assertGreater(len(s3nFullList), 1,
                           "Should see more than 1 files in s3n?")

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            s3nKey = URI + testCsvFilename
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey,
                                          testKey2,
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # PARSE train****************************************
            s3nKey = URI + trainCsvFilename
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            print "Loading s3n key: ", s3nKey, 'thru HDFS'
            start = time.time()
            parseKey = h2o.nodes[0].parse(s3nKey,
                                          trainKey2,
                                          timeoutSecs=timeoutSecs,
                                          retryDelaySecs=10,
                                          pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", s3nKey, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            y = 0  # first column is pixel value
            print "y:"
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y,
                                            key=parseKey['destination_key'],
                                            timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x,
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey,
                                     timeoutSecs=timeoutSecs,
                                     pollTimeoutsecs=60,
                                     **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            kwargs = {'x': x, 'y': y, 'thresholds': 0.5}
            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testKey2,
                                           model_key=modelKey,
                                           thresholds="0.5",
                                           timeoutSecs=60)
            print "GLMScore in",  (time.time() - start), "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o.verboseprint(h2o.dump_json(glmScore))
Пример #23
0
    def test_GLM_mnist_s3n(self):
        csvFilelist = [
            ("mnist_training.csv.gz", "mnist_testing.csv.gz",    600), 
            ("mnist_testing.csv.gz",  "mnist_training.csv.gz",    600), 
            ("mnist_training.csv.gz", "mnist_training.csv.gz",    600), 
        ]

        importFolderPath = "mnist"
        csvPathname = importFolderPath + "/*"
        (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=120)
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList),1,"Should see more than 1 files in s3n?")

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            # PARSE test****************************************
            csvPathname = importFolderPath + "/" + testCsvFilename
            testHexKey = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=testHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + trainCsvFilename
            trainHexKey = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=trainHexKey,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # GLM****************************************
            y = 0 # first column is pixel value
            print "y:"
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x, 
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'gaussian',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "GLM completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            kwargs = {'x': x, 'y':  y, 'thresholds': 0.5}
            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=testHexKey, model_key=modelKey, thresholds="0.5",
                timeoutSecs=60)
            print "GLMScore in",  (time.time() - start), "secs", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o.verboseprint(h2o.dump_json(glmScore))
Пример #24
0
    def test_GLM_mnist_reals(self):
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz",    600), 
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
                key2=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
                key2=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x, 
                'y': y,
                'case_mode': '=',
                'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
                }

            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in",  elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)
Пример #25
0
    def test_GLM_covtype_train(self):
        print "\nMichal will hate me for another file needed: covtype.shuffled.data"
        importFolderPath = "/home/0xdiag/datasets/standard"
        csvFilename = 'covtype.shuffled.data'
        csvPathname = importFolderPath + "/" + csvFilename
        key2 = csvFilename + ".hex"
        h2i.setupImportFolder(None, importFolderPath)

        print "\nUsing header=0 on the normal covtype.data"
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
            header=0, timeoutSecs=180)

        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname, \
            "    num_rows:", "{:,}".format(inspect['num_rows']), \
            "    num_cols:", "{:,}".format(inspect['num_cols'])

        # how many rows for each pct?
        num_rows = inspect['num_rows']
        pct10 = int(num_rows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = num_rows - rowsForPct[9]
        rowsForPct[10] = last10
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use 10 if 0 just to see (we copied 10 to 0 above)
        rowsForPct[0] = rowsForPct[10]

        print "Creating the key of the last 10% data, for scoring"
        dataKeyTest = "rTest"
        # start at 90% rows + 1
        
        execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

        kwargs = {
            'y': 54, 
            'max_iter': 20, 
            'n_folds': 0, 
            'thresholds': 0.5,
            'alpha': 0.1, 
            'lambda': 1e-5, 
            'family': 'binomial',
            'case_mode': '=', 
            'case': 2
        }
        timeoutSecs = 60

        for trial in range(10):
            # always slice from the beginning
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r" + str(trial)
            execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
            h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
            parseKey['destination_key'] = resultKey
            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow

            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            start = time.time()
            glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
            print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
Пример #26
0
    def test_GLM_enums_score_subset(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        n = 200
        tryList = [
            (n, 1, 'cD', 300), 
            (n, 2, 'cE', 300), 
            (n, 3, 'cF', 300), 
            (n, 4, 'cG', 300), 
            (n, 5, 'cH', 300), 
            (n, 6, 'cI', 300), 
            ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            # using the comma is nice to ensure no craziness
            colSepHexString = '2c' # comma
            colSepChar = colSepHexString.decode('hex')
            colSepInt = int(colSepHexString, base=16)
            print "colSepChar:", colSepChar

            rowSepHexString = '0a' # newline
            rowSepChar = rowSepHexString.decode('hex')
            print "rowSepChar:", rowSepChar

            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvScoreFilename = 'syn_enums_score_' + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvScorePathname = SYNDATASETS_DIR + '/' + csvScoreFilename

            enumList = create_enum_list(listSize=10)
            # use half of the enums for creating the scoring dataset
            enumListForScore = random.sample(enumList,5)

            print "Creating random", csvPathname, "for glm model building"
            write_syn_dataset(csvPathname, enumList, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            print "Creating random", csvScorePathname, "for glm scoring with prior model (using enum subset)"
            write_syn_dataset(csvScorePathname, enumListForScore, rowCount, colCount, SEEDPERFILE, 
                colSepChar=colSepChar, rowSepChar=rowSepChar)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, 
                timeoutSecs=30, separator=colSepInt)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            print "\n" + csvFilename
            (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
                h2o_cmd.columnInfoFromInspect(parseKey['destination_key'], exceptionOnMissingValues=True)

            y = colCount
            kwargs = {'y': y, 'max_iter': 1, 'n_folds': 1, 'alpha': 0.2, 'lambda': 1e-5, 
                'case_mode': '=', 'case': 0}
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
            print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

            GLMModel = glm['GLMModel']
            modelKey = GLMModel['model_key']

            parseKey = h2o_cmd.parseFile(None, csvScorePathname, key2="score_" + key2, 
                timeoutSecs=30, separator=colSepInt)

            start = time.time()
            # score with same dataset (will change to recreated dataset with one less enum
            glmScore = h2o_cmd.runGLMScore(key=parseKey['destination_key'],
                model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
            print "glm end on ", parseKey['destination_key'], 'took', time.time() - start, 'seconds'
            ### print h2o.dump_json(glmScore)
            classErr = glmScore['validation']['classErr']
            auc = glmScore['validation']['auc']
            err = glmScore['validation']['err']
            print "classErr:", classErr
            print "err:", err
            print "auc:", auc