def test_frame_split(self):

        csvFilename = 'iris.csv'
        csvPathname = 'iris/' + csvFilename
        hex_key = "iris.hex"

        parseResultA = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        hex_key=hex_key,
                                        timeoutSecs=10)

        print "Just split away and see if anything blows up"
        splitMe = hex_key

        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        iA = h2o_cmd.InspectObj(splitMe)
        origNumRows = iA.numRows
        origNumCols = iA.numCols
        for s in range(10):
            iA = h2o_cmd.InspectObj(splitMe)
            numRows = iA.numRows

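            # split the current frame 50/50; frame_split runs as a job, and the job's dest key
            # is looked up via h2o.n0.models() below to get the two split keys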
            fsResult = h2o.n0.frame_split(training_frame=splitMe,
                                          ratios='[0.5]')
            fs = OutputObj(fsResult, 'frame_split')
            model_key = fs.jobs[0].dest.name

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'],
                              'frame_split')
            # print "model:", dump_json(model)
            split_keys = [split._key.name for split in model.splits]

            iB = h2o_cmd.InspectObj(split_keys[0])
            iC = h2o_cmd.InspectObj(split_keys[1])

            numCols = iB.numCols
            split0_rows = iB.numRows
            split1_rows = iC.numRows

            # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split_keys[1]
            # the two splits should agree to within 1 row
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
Example #2
    def test_split_frame(self):

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResultA = h2i.import_parse(bucket='home-0xdiag-datasets',
                                        path=csvPathname,
                                        hex_key=hex_key,
                                        timeoutSecs=20)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        iA = h2o_cmd.InspectObj(splitMe)
        origNumRows = iA.numRows
        origNumCols = iA.numCols
        for s in range(20):
            iA = h2o_cmd.InspectObj(splitMe)
            numRows = iA.numRows

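            # split_frame returns destination_frames directly in the job result,
            # so no model lookup is needed (compare frame_split in the previous example)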
            fsResult = h2o.n0.split_frame(dataset=splitMe, ratios='[0.5]')
            fs = OutputObj(fsResult, 'split_frame')
            d = fs.jobs[0].destination_frames

            # modelResult = h2o.n0.models(key=model_key)
            # model = OutputObj(modelResult['models'][0]['output'], 'split_frame')
            # print "model:", dump_json(model)
            split_keys = [split.name for split in d]

            iB = h2o_cmd.InspectObj(split_keys[0])
            iC = h2o_cmd.InspectObj(split_keys[1])

            numCols = iB.numCols
            split0_rows = iB.numRows
            split1_rows = iC.numRows

            # print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split_keys[1]
            # the two splits should agree to within 1 row
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split1_rows <= 1:
                break
Example #3
    def test_GLM_covtype(self):
        importFolderPath = "standard"
        csvFilename = "covtype.data"
        hex_key = "covtype.hex"
        bucket = "home-0xdiag-datasets"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, hex_key=hex_key, 
            check_header=1, timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        labelListUsed.remove('C54')
        numColsUsed = numCols - 1
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                # FIX! for now just use a column that's binomial
                'response_column': 'C54',
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial', 
                'link': None, 
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5,0.25, 0.1]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }

            model_key = 'covtype_glm.hex'
            bmResult = h2o.n0.build_model(
                algo='glm',
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')


            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

            if 1==0:
                print ""
                for i,c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""
                

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #4
    def test_w2v_basic_2(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 100
        tryList = [
            # (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)
            hex_key = "not_used.hex"

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           check_header=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            src_key = h2i.find_key('syn_.*csv')

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'minWordFreq': 1,  # int 5 []
                    'wordModel': 'CBOW',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'NegSampling',  # enum [u'HSM', u'NegSampling']
                    'negSampleCnt': 1,  # int 5 []
                    'vecSize': 10,  # int 100
                    'windowSize': 2,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=10)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')

                h2o_cmd.runStoreView()
Example #5
    def test_billion_rows(self):
        # just do the import folder once
        timeoutSecs = 1500

        csvFilenameAll = [
            # quick test first
            # "covtype.data",
            # then the real thing
            "billion_rows.csv.gz",
        ]
        # csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        ### h2b.browseTheCloud()

        for csvFilename in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path='standard/' + csvFilename,
                                           timeoutSecs=timeoutSecs,
                                           pollTimeoutSecs=60)
            elapsed = time.time() - start
            print csvFilename, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList
            labelListUsed = labelList  # define before first use by simpleCheckGLM below

            parameters = {
                'response_column': 'C2',
                'alpha': '[0]',
                'lambda': '[0]',
            }
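            # lambda=[0] disables regularization (alpha is then irrelevant), so this is a plain GLM on response C2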
            model_key = 'B.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=300)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()

            labelListUsed = labelList
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)
Example #6
    def test_DL_mnist(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'laptop/mnist/train.csv.gz'
        csvPathname_test = 'laptop/mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='bigdata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        numCols = iA.numCols
        labelList = iA.labelList
        parseResultV = h2i.import_parse(bucket='bigdata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)

        response = numCols - 1
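        # the response is the last column; labelList[response] below turns the index into the column name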

        #Making random id
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': None,  # string[] None
            'response_column': labelList[response],  # string None
            'balance_classes': None,  # boolean false
            'max_after_balance_size': None,  # float Infinity
            'keep_cross_validation_splits': None,  # boolean false
            'checkpoint': None,  # Key None
            'overwrite_with_best_model': None,  # boolean true
            'expert_mode': None,  # boolean false
            'autoencoder': None,  # boolean false
            'use_all_factor_levels': None,  # boolean true
            # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
            'activation': 'RectifierWithDropout',  # enum Rectifier 
            'hidden': '[117,131,129]',  # int[] [200, 200]
            'epochs': 2.0,  # double 10.0
            'train_samples_per_iteration': None,  # long -2
            'target_ratio_comm_to_comp': None,  # double 0.02
            'seed': None,  # long 1679194146842485659
            'adaptive_rate': False,  # boolean true
            'rho': None,  # double 0.99
            'epsilon': None,  # double 1.0E-8
            'rate': None,  # double 0.005
            'rate_annealing': None,  # double 1.0E-6
            'rate_decay': None,  # double 1.0
            'momentum_start': 0.5,  # double 0.0
            'momentum_ramp': 100000,  # double 1000000.0
            'momentum_stable': 0.9,  # double 0.0
            'nesterov_accelerated_gradient': None,  # boolean true
            'input_dropout_ratio': 0.2,  # double 0.0
            'hidden_dropout_ratios': None,  # double[] None (this can grid?)
            'l1': 1e-5,  # double 0.0
            'l2': 1e-7,  # double 0.0
            'max_w2': 15,  # float Infinity
            'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
            'initial_weight_scale': None,  # double 1.0
            'loss': 'CrossEntropy',  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
            'score_interval': None,  # double 5.0
            'score_training_samples': None,  # long 10000
            'score_validation_samples': None,  # long 0
            'score_duty_cycle': None,  # double 0.1
            'classification_stop': None,  # double 0.0
            'regression_stop': None,  # double 1.0E-6
            'quiet_mode': None,  # boolean false
            'max_confusion_matrix_size': None,  # int 20
            'max_hit_ratio_k': None,  # int 10
            'class_sampling_factors': None,  # float[] None
            'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
            'diagnostics': None,  # boolean true
            'variable_importances': None,  # boolean false
            'fast_mode': None,  # boolean true
            'ignore_const_cols': None,  # boolean true
            'force_load_balance': None,  # boolean true
            'replicate_training_data': None,  # boolean false
            'single_node_mode': None,  # boolean false
            'shuffle_training_data': None,  # boolean false
            'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
            'sparse': None,  # boolean false
            'col_major': None,  # boolean false
            'average_activation': None,  # double 0.0
            'sparsity_beta': None,  # double 0.0
        }
        expectedErr = 0.057  ## expected validation error for the above model
        relTol = 0.20  ## 20% rel. error tolerance due to Hogwild!

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
Example #7
    def test_GLM_basic_1(self):
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        labelListUsed.remove('STR')
        labelListUsed.remove('FNDX')  # response removed also
        numColsUsed = numCols - 2
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # glm parameters:

            # model_id Key<Model> False None []
            # training_frame Key<Frame> False None []
            # validation_frame Key<Frame> False None []
            # ignored_columns string[] False None []
            # drop_na20_cols boolean False False []
            # score_each_iteration boolean False False []
            # response_column VecSpecifier False None []
            # balance_classes boolean False False []
            # class_sampling_factors float[] False None []
            # max_after_balance_size float False 5.0 []
            # max_confusion_matrix_size int False 20 []
            # max_hit_ratio_k int False 10 []
            # family enum False gaussian [u'gaussian', u'binomial', u'poisson', u'gamma']
            # solver enum False IRLSM [u'AUTO', u'IRLSM', u'L_BFGS']

            # alpha double[] False None []

            # lambda double[] False None []
            # lambda_search boolean False False []
            # lambda_min_ratio double False -1.0 []
            # nlambdas int False -1 []

            # standardize boolean False True []
            # max_iterations int False -1 []
            # beta_epsilon double False 0.0001 []
            # link enum False family_default [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # prior double False -1.0 []
            # use_all_factor_levels boolean False False []
            # beta_constraints Key<Frame> False None []
            # max_active_predictors int False -1 []

            parameters = {
                'ignored_columns': '["STR"]',
                'response_column': 'FNDX',
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                # 'use_all_factor_levels': False,
            }

            model_key = 'benign_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data},
                             'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            if 1 == 0:
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')
                print ""
                for i, c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #8
    def test_bayes_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        b = Key(train_key)

        model_key = 'bayesModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename

        # FIX! do I need to force enum for classification? what if I do regression after this?
        columnTypeDict = {54: 'Enum'}
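        # column index 54 (C55) is the class label; forcing it to Enum makes naive bayes treat it as categorical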
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       columnTypeDict=columnTypeDict,
                                       schema='local',
                                       chunk_size=4194304,
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        # make 1 thru 6 go to 1
        # change columnTypeDict to None above if I do this
        # Assign(b[:,54], b[:,54]-1)
        # Assign(b[:,54], b[:,54]!=0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        # run through a couple of parameter sets
        parameters = []
        parameters.append({
            'response_column': 'C55',  # still 1-55 on colnames
        })  # just default

        model_key = 'covtype_bayes.hex'

        for p in parameters:
            bmResult = h2o.n0.build_model(algo='naivebayes',
                                          destination_key=model_key,
                                          training_frame=train_key,
                                          validation_frame=train_key,
                                          parameters=p,
                                          timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GLM_error1(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(hex_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        for trial in range(5):
            parameters = {
                'response_column': 'C55',
                'max_iterations': 3,
                'solver': 'L_BFGS',
                'ignored_columns': '["C1"]',
                'alpha': '[0.1]',
                'max_after_balance_size': 1000.0,
                'class_sampling_factors': '[0.2]',
                # 'use_all_factor_levels': None,
                'lambda': '[0]',
            }

            bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #10
    def test_kmeans_prostate(self):
        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, 
            timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        # loop, to see if we get same centers

        expected = [
            (None, [0.37, 65.77, 1.07, 2.23, 1.11, 10.49, 4.24, 6.31],   215,  36955),  
            (None, [0.36, 66.44, 1.09, 2.21, 1.06, 10.84, 34.16, 6.31],  136,  46045), 
            (None, [0.83, 66.17, 1.21, 2.86, 1.34, 73.30, 15.57, 7.31],   29,  33412), 
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.02, 0.02, 0.02)

        labelListUsed = list(labelList)
        labelListUsed.remove('ID')
        numColsUsed = numCols - 1

        for trial in range(5):
            # kmeansSeed = random.randint(0, sys.maxint)
            # actually can get a slightly better error sum with a different seed
            # this seed gets the same result as scikit (at least in h2o1)
            # kmeansSeed = 6655548259421773879
            kmeansSeed = 7037878434240420762
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': "['ID']",
                'k': 3,
                'max_iterations': 500,
                'standardize': False,
                'seed': kmeansSeed,
                # PlusPlus init seems bad here..should investigate
                'init': 'Furthest',
            }

            model_key = 'prostate_k.hex'
            bmResult = h2o.n0.build_model(
                algo='kmeans', 
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters, 
                timeoutSecs=10) 

            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)
            h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
Example #11
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5 * ROWS, 1, 'x.hex', 1, 20000,
             ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5 * ROWS, 1, 'x.hex', -5000, 0,
             ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1 * ROWS, 1, 'x.hex', -100000, 100000,
             ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1 * ROWS, 1, 'x.hex', -1, 1,
             ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
            (1 * ROWS, 1, 'A.hex', 1, 100,
             ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
            (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
            (1 * ROWS, 1, 'B.hex', 1, 10000,
             ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1 * ROWS, 1, 'B.hex', -100, 100,
             ['C1', -100.10, -50.0, 0.85, 51.7, 100.00]),
            (1 * ROWS, 1, 'C.hex', 1, 100000,
             ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1 * ROWS, 1, 'C.hex', -101, 101,
             ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0
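            # (the 1000.0 presumably corresponds to the number of summary bins, so half a bin width bounds the error)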

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(algo='quantile',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][0]  # why is this a double array?
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
            h2o.nodes[0].remove_all_keys()
Example #12
    def test_GLM_basic_2(self):
        importFolderPath = "logreg"
        csvFilename = "prostate.csv"
        hex_key = "prostate.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        labelListUsed = list(labelList)
        labelListUsed.remove('ID')
        labelListUsed.remove('CAPSULE')
        numColsUsed = numCols - 2
        for trial in range(1):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': '["ID"]',
                'score_each_iteration': True,
                'response_column': 'CAPSULE',
                # FIX! when is this needed? redundant for binomial?
                'do_classification': True,
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[1e-4]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'higher_accuracy': True,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }

            model_key = 'prostate_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          destination_key=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
    def test_PCA_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (10000, 10, 'cA', 300),
            (10000, 50, 'cB', 300),
            (10000, 100, 'cC', 300),
            # (10000, 500, 'cH', 300),
            # (10000, 1000, 'cI', 300),
        ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            print(rowCount, colCount, hex_key, timeoutSecs)
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            # PARSE ****************************************
            modelKey = 'PCAModelKey'
            scoreKey = 'PCAScoreKey'

            # Parse ****************************************
            parseResult = h2i.import_parse(bucket=None,
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(numRows), \
                "    numCols:", "{:,}".format(numCols)

            # PCA(tolerance iterate)****************************************
            for tolerance in [i / 10.0 for i in range(11)]:
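                # tolerance is currently unused (commented out in the parameters below); each pass just rebuilds the k=1 PCA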
                parameters = {
                    # 'tolerance': tolerance,
                    # 'standardize': 1,
                    'k': 1,
                }
                model_key = 'pca.hex'
                bmResult = h2o.n0.build_model(algo='pca',
                                              model_id=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=10)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')

                h2o_cmd.runStoreView()
Example #14
    def test_GBM_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        model_key = 'GBMModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename

        # FIX! do I need to force enum for classification? what if I do regression after this?
        columnTypeDict = {54: 'Enum'}
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
            schema='local', chunk_size=4194304, hex_key=train_key, timeoutSecs=timeoutSecs)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        # run through a couple of parameter sets
        parameters = []
        parameters.append({
            'response_column': 'C55',
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            'loss': 'multinomial',
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed': 
        })

        parameters.append({
            'response_column': 'C55', 
            'loss': 'multinomial',
            # This does nothing! intent is solely based on type of response col
            'ntrees': 1, 
            'max_depth': 20, 
            'min_rows': 3, 
            'nbins': 40, 
            'learn_rate': 0.2, 
            })

        model_key = 'covtype_gbm.hex'

        for p in parameters:
            bmResult = h2o.n0.build_model(
                algo='gbm',
                model_id=model_key,
                training_frame=train_key,
                validation_frame=train_key,
                parameters=p,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

            vis = OutputObj(model.variable_importances, 'vis')

            # just the first 10 columns
            visDataChopped = [v[0:10] for v in vis.data]
            names = visDataChopped[0]
            relativeImportance = visDataChopped[1]
            print "names:", names
            print "relativeImportance:", relativeImportance
            scaledImportance = visDataChopped[2]
            percentage = visDataChopped[3]
            print "\nvis\n", tabulate(visDataChopped[1:], headers=names)
            # print "\nrelativeImportance (10)\n", tabulate(relativeImportance, headers=names)
            # print "\nscaledImportance (10)\n", tabulate(scaledImportance, headers=names)
            # print "\npercentage (10)\n", tabulate(percentage, headers=names)

            print "will say Regression or Classification. no Multinomial?"
            print "model.model_category", model.model_category
            assert model.model_category=='Multinomial', model.model_category

            print "FIX! why is mse 0 and mes_train Nan?"
            print "model.mse:", model.mse
            print "model.mse_train:", model.mse_train


            if 1==1:
                print ""
                for i,c in enumerate(cmm.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""



            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame'] # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GBM_airlines(self):
        files = [
            ('datasets', 'airlines_all.05p.csv', 'airlines_all.05p.hex', 1800,
             'IsDepDelayed'),
            # ('datasets', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
        ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            model_key = 'GBMModelKey'
            # IsDepDelayed might already be enum, but just to be sure
            parseResult = h2i.import_parse(
                path=csvPathname,
                schema='hdfs',
                hex_key=trainKey,
                columnTypeDict={'IsDepDelayed': 'Enum'},
                timeoutSecs=timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            labelListUsed = list(labelList)
            numColsUsed = numCols

            parameters = {
                'validation_frame': trainKey,
                # 'ignored_columns': '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]',
                'response_column': response,
                # 'balance_classes':
                # 'max_after_balance_size':
                'ntrees': 2,
                'max_depth': 10,
                'min_rows': 3,
                'nbins': 40,
                'learn_rate': 0.2,
                # 'loss': 'multinomial',
                # FIX! doesn't like it?
                # 'loss': 'Bernoulli',
                # FIX..no variable importance for GBM yet?
                # 'variable_importance': False,
                # 'seed':
            }

            bmResult = h2o.n0.build_model(algo='gbm',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=360)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            # print "\nLook!, can use dot notation: cmm.cm.confusion_matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame']  # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
    def test_GLM_many_cols_4(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
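        # translateList is presumably consumed by write_syn_dataset (a helper defined
        # elsewhere in this file) to map generated numeric values onto these letter
        # categories when writing the synthetic CSV.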
        tryList = [
            (100000, 10, 'cA', 600),
            (100000, 100, 'cA', 600),
        ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE,
                              translateList)

            parseResult = h2i.import_parse(path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            expected = []
            allowedDelta = 0

            labelListUsed = list(labelList)
            print "labelListUsed", labelListUsed
            response = labelListUsed[-1]
            labelListUsed.remove(response)
            numColsUsed = numCols - 1
            for trial in range(1):
                # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
                # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
                # can we do classification with probabilities?
                # are only lambda and alpha grid searchable?
                parameters = {
                    'validation_frame': parse_key,
                    'ignored_columns': None,
                    # FIX! for now just use a column that's binomial
                    'response_column': response,  # can't take index now?
                    # FIX! when is this needed? redundant for binomial?
                    'balance_classes': False,
                    'max_after_balance_size': None,
                    'standardize': False,
                    'family': 'binomial',
                    'link': None,
                    'tweedie_variance_power': None,
                    'tweedie_link_power': None,
                    'alpha': '[1e-4]',
                    'lambda': '[0.5,0.25, 0.1]',
                    'prior1': None,
                    'lambda_search': None,
                    'nlambdas': None,
                    'lambda_min_ratio': None,
                    'use_all_factor_levels': False,
                    'n_folds': 1,
                }
                model_key = 'many_cols_glm.hex'
                bmResult = h2o.n0.build_model(algo='glm',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                       labelListUsed)

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult, 'mm')

                prResult = h2o.n0.predict(model=model_key,
                                          frame=parse_key,
                                          timeoutSecs=60)
                pr = OutputObj(prResult['model_metrics'][0]['predictions'],
                               'pr')
Example #17
    def notest_kmeans_benign(self):
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, check_header=1, 
            timeoutSecs=180, doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = [
            (None, [8.86, 2.43, 35.53, 0.31, 13.22, 1.47, 1.33, 20.06, 13.08, 0.53, 2.12, 128.61, 35.33, 1.57], 49, None), 
            (None, [33.47, 2.29, 50.92, 0.34, 12.82, 1.33, 1.36, 21.43, 13.30, 0.37, 2.52, 125.40, 43.91, 1.79], 87, None), 
            (None, [27.64, 2.87, 48.11, 0.09, 11.80, 0.98, 1.51, 21.02, 12.53, 0.58, 2.89, 171.27, 42.73, 1.53], 55, None), 
            (None, [26.00, 2.67, 46.67, 0.00, 13.00, 1.33, 1.67, 21.56, 11.44, 0.22, 2.89, 234.56, 39.22, 1.56], 9, None), 
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01, 0.01)
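        # Sketch of the intended tolerance (the real comparison lives in
        # h2o_kmeans.compareResultsToExpected): a multiplier of 0.01 is taken to mean
        # "within 1% of the expected value", roughly
        #   abs(actual - expected) <= 0.01 * expected
        # for each expected tuple entry it checks.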

        # loop, to see if we get same centers

        # no cols ignored
        labelListUsed = list(labelList)
        numColsUsed = numCols
        for trial in range(5):
            kmeansSeed = random.randint(0, sys.maxint)
            # kmeansSeed = 6655548259421773879
            parameters = {
                'validation_frame': parse_key,
                'ignored_columns': None,
                'k': 4,
                'max_iterations': 50,
                'standardize': False,
                'seed': kmeansSeed,
                'init': 'Furthest',
            }

            model_key = 'benign_k.hex'
            kmeansResult = h2o.n0.build_model(
                algo='kmeans', 
                destination_key=model_key,
                training_frame=parse_key,
                parameters=parameters, 
                timeoutSecs=10) 

            modelResult = h2o.n0.models(key=model_key)
            km = h2o_kmeans.KMeansObj(modelResult, parameters, numRows, numColsUsed, labelListUsed)
            # zip with * is its own inverse here. It's sorted by centers for easy comparisons
            # changed..old order: ids, mses, rows, centers = zip(*km.tuplesSorted)
            # new order:
            # ids, centers, rows, errors = zip(*km.tuplesSorted)
            # create a tuple for each cluster, then sort by row

            # old. this was going to do a predict and a summary (histogram) (old h2o1 needed this for more info)
            # (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeansResult, csvPathname, parseResult, 'd', parameters)
            h2o_kmeans.compareResultsToExpected(km.tuplesSorted, expected, allowedDelta)

            # Not seeing any scoring results yet?
            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
def glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30):
    print "\nStarting GLM of", csvFilename
    # we can force a col type to enum now? with param columnTypes
    # "Numeric"
    # make the last column enum
    # Instead of string for parse, make this a dictionary, with column index, value
    # that's used for updating the ColumnTypes array before making it a string for parse
    columnTypeDict = {10: 'Enum'}
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, columnTypeDict=columnTypeDict,
        hex_key=csvFilename + ".hex", schema='put', timeoutSecs=30)
    pA = h2o_cmd.ParseObj(parseResult)
    iA = h2o_cmd.InspectObj(pA.parse_key)
    parse_key = pA.parse_key
    numRows = iA.numRows
    numCols = iA.numCols
    labelList = iA.labelList
    for i in range(10):
        print "Summary on column", i
        # FIX! how come only 0 works here for column
        co = h2o_cmd.runSummary(key=parse_key, column=i)
        for k,v in co:
            print k, v

    expected = []
    allowedDelta = 0

    labelListUsed = list(labelList)
    labelListUsed.remove('C11')
    numColsUsed = numCols - 1

    parameters = {
        'validation_frame': parse_key,
        'ignored_columns': None,
        # FIX! for now just use a column that's binomial
        'response_column': 'C11',
        # FIX! when is this needed? redundant for binomial?
        'balance_classes': False,
        'max_after_balance_size': None,
        'standardize': False,
        'family': 'binomial', 
        'link': None, 
        'tweedie_variance_power': None,
        'tweedie_link_power': None,
        'alpha': '[1e-4]',
        'lambda': '[0.5,0.25, 0.1]',
        'prior1': None,
        'lambda_search': None,
        'nlambdas': None,
        'lambda_min_ratio': None,
        'use_all_factor_levels': False,
        'n_folds': 1,
    }


    start = time.time()
    model_key = 'hastie_glm.hex'
    bmResult = h2o.n0.build_model(
        algo='glm',
        destination_key=model_key,
        training_frame=parse_key,
        parameters=parameters,
        timeoutSecs=60)
    bm = OutputObj(bmResult, 'bm')

    modelResult = h2o.n0.models(key=model_key)
    model = OutputObj(modelResult['models'][0]['output'], 'model')

    h2o_glm.simpleCheckGLM(self, model, parameters, labelList, labelListUsed)

    cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    cmm = OutputObj(cmmResult, 'cmm')

    mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
    mm = OutputObj(mmResult, 'mm')

    prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
    pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

    # compare this glm to the first one. since the files are replications, the results
    # should be similar?
    if self.validation1:
        h2o_glm.compareToFirstGlm(self, 'AUC', validation, self.validation1)
    else:
        # self.validation1 = copy.deepcopy(validation)
        self.validation1 = None
Example #19
    def test_GLM_airlines(self):
        files = [
            # ('airlines', 'airlines_all.csv', 'airlines_all.hex', 1800, 'IsDepDelayed')
            ('airlines', 'year2013.csv', 'airlines_all.hex', 1800,
             'IsDepDelayed')
        ]

        for importFolderPath, csvFilename, trainKey, timeoutSecs, response in files:
            # PARSE train****************************************
            csvPathname = importFolderPath + "/" + csvFilename

            model_key = 'GLMModelKey'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=trainKey,
                                           timeoutSecs=timeoutSecs)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            labelListUsed = list(labelList)
            numColsUsed = numCols

            parameters = {
                'validation_frame': parse_key,
                'ignored_columns':
                '[CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed]',
                'response_column': response,
                # FIX! when is this needed? redundant for binomial?
                'balance_classes': False,
                'max_after_balance_size': None,
                'standardize': False,
                'family': 'binomial',
                'link': None,
                'tweedie_variance_power': None,
                'tweedie_link_power': None,
                'alpha': '[0]',
                'lambda': '[0.5]',
                'prior1': None,
                'lambda_search': None,
                'nlambdas': None,
                'lambda_min_ratio': None,
                'use_all_factor_levels': False,
                # NPE with n_folds 2?
                'n_folds': 1,
            }

            model_key = 'airlines_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          destination_key=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=300)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            t0 = len(model.coefficients_table.data[0])
            t1 = len(model.coefficients_table.data[1])

            # not sure what the exact number should be, but it's gotta be less than the cols in the dataset?
            # Whoa! forgot GLM expands enums to individual coefficients. Would really need to look at all the domains and sum plus other cols?
            # assert t0 <= numColsUsed, "%s %s" % (t0, numColsUsed)
            # assert t1 <= numColsUsed, "%s %s" % (t1, numColsUsed)

            h2o_glm.simpleCheckGLM(self, model, parameters, labelList,
                                   labelListUsed)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mcms = OutputObj({'data': cmm.max_criteria_and_metric_scores.data},
                             'mcms')
            m1 = mcms.data[1:]
            h0 = mcms.data[0]
            print "\nmcms", tabulate(m1, headers=h0)

            thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
            cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

            if 1 == 0:
                print ""
                for i, c in enumerate(cmms.cm):
                    print "\ncmms.cm[%s]" % i, tabulate(c)
                print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #20
    def test_w2v_basic_1(self):
        global SYNDATASETS_DIR
        SYNDATASETS_DIR = h2o.make_syn_dir()
        n = 500000
        tryList = [
            (n, 1, 'cD', 300),
            (n, 2, 'cE', 300),
            (n, 3, 'cF', 300),
            (n, 4, 'cG', 300),
            (n, 5, 'cH', 300),
            (n, 6, 'cI', 300),
            (n, 7, 'cJ', 300),
            (n, 9, 'cK', 300),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:

            csvPathname = create_file_with_seps(rowCount, colCount)

            # just parse to make sure it's good
            parseResult = h2i.import_parse(path=csvPathname,
                                           checkHeader=1,
                                           delete_on_done=0,
                                           timeoutSecs=180,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            cA = h2o_test.OutputObj(iA.columns[0], "inspect_column")

            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            for i in range(colCount):
                print cA.type, cA.missing
                self.assertEqual(
                    0, cA.missing,
                    "Column %s Expected %s. missing: %s is incorrect" %
                    (i, 0, cA.missing))
                self.assertEqual(
                    'string', cA.type,
                    "Column %s Expected %s. type: %s is incorrect" %
                    (i, 'string', cA.type))

            if DO_SUMMARY:
                for i in range(colCount):
                    co = h2o_cmd.runSummary(key=parse_key, column=i)
                    print co.label, co.type, co.missing, co.domain, sum(
                        co.bins)
                    self.assertEqual(
                        0, co.missing,
                        "Column %s Expected %s. missing: %s is incorrect" %
                        (i, 0, co.missing))
                    self.assertEqual(
                        'String', co.type,
                        "Column %s Expected %s. type: %s is incorrect" %
                        (i, 'String', co.type))

            # no cols ignored
            labelListUsed = list(labelList)
            numColsUsed = numCols
            for trial in range(1):

                parameters = {
                    'validation_frame': parse_key,  # KeyIndexed False []
                    'ignored_columns': None,  # string[] None []
                    'score_each_iteration': None,  # boolean false []
                    'minWordFreq': 5,  # int 5 []
                    'wordModel': 'SkipGram',  # enum [u'CBOW', u'SkipGram']
                    'normModel': 'HSM',  # enum # [u'HSM', u'NegSampling']
                    'negSampleCnt': 5,  # int 5 []
                    'vecSize': 100,  # int 100
                    'windowSize': 5,  # int 5
                    'sentSampleRate': 0.001,  # float 0.001
                    'initLearningRate': 0.05,  # float 0.05
                    'epochs': 1,  # int 5
                }

                model_key = 'benign_w2v.hex'
                bmResult = h2o.n0.build_model(algo='word2vec',
                                              destination_key=model_key,
                                              training_frame=parse_key,
                                              parameters=parameters,
                                              timeoutSecs=60)
                bm = OutputObj(bmResult, 'bm')

                modelResult = h2o.n0.models(key=model_key)
                model = OutputObj(modelResult['models'][0]['output'], 'model')

                cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                         frame=parse_key,
                                                         timeoutSecs=60)
                cmm = OutputObj(cmmResult, 'cmm')

                mmResult = h2o.n0.model_metrics(model=model_key,
                                                frame=parse_key,
                                                timeoutSecs=60)
                mm = OutputObj(mmResult['model_metrics'][0], 'mm')

                # not implemented?

                # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
                # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

                h2o_cmd.runStoreView()
Example #21
    def test_GLM_params_rand2(self):
        importFolderPath = "covtype"
        csvFilename = "covtype.20k.data"
        hex_key = "covtype20k.hex"
        binomial_key = "covtype20k.b.hex"
        b = Key(binomial_key)
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        ## columnTypeDict = {54: 'Enum'}
        columnTypeDict = None
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=binomial_key,
                                       columnTypeDict=columnTypeDict,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)

        # don't have to make it enum, if 0/1 (can't operate on enums like this)
        # make 1-7 go to 0-6. 0 isn't there.
        Assign(b[:, 54], b[:, 54] - 1)
        # make 1 thru 6 go to 1
        Assign(b[:, 54], b[:, 54] != 0)
        # now we have just 0 and 1
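        # Plain-Python sketch of the relabeling done by the two Assigns above
        # (illustration only; Assign runs server-side on the frame column):
        #   [1, 2, 7, 1]  -> subtract 1 ->  [0, 1, 6, 0]  -> != 0 ->  [0, 1, 1, 0]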

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # loop, to see if we get same centers

        labelListUsed = list(labelList)
        numColsUsed = numCols

        paramDict = define_params()
        for trial in range(5):
            # family [u'gaussian', u'binomial', u'poisson', u'gamma', u'tweedie']
            # link [u'family_default', u'identity', u'logit', u'log', u'inverse', u'tweedie']
            # can we do classification with probabilities?
            # are only lambda and alpha grid searchable?

            # parameters is mutated in place by pickRandGlmParams below; these are the defaults.
            parameters = {
                'response_column': 'C55',
                'alpha': 0.1,
                # 'lambda': 1e-4,
                'lambda': 0,
            }
            h2o_glm.pickRandGlmParams(paramDict, parameters)

            if 'family' not in parameters or parameters['family'] == 'binomial':
                bHack = binomial_key
            else:
                bHack = hex_key

            co = h2o_cmd.runSummary(key=binomial_key, column=54)
            print "binomial_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)
            co = h2o_cmd.runSummary(key=hex_key, column=54)
            print "hex_key summary:", co.label, co.type, co.missing_count, co.domain, sum(
                co.histogram_bins)

            # wrap scalar picks in brackets so they match the list-valued string form
            # used for these parameters elsewhere in this file
            fixList = [
                'alpha', 'lambda', 'ignored_columns', 'class_sampling_factors'
            ]
            for f in fixList:
                if f in parameters:
                    parameters[f] = "[%s]" % parameters[f]

            model_key = 'rand_glm.hex'
            bmResult = h2o.n0.build_model(algo='glm',
                                          model_id=model_key,
                                          training_frame=bHack,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            h2o_glm.simpleCheckGLM(self,
                                   model,
                                   parameters,
                                   labelList,
                                   labelListUsed,
                                   allowNaN=True)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            # FIX! when is this legal
            doClassification = False
            if doClassification:
                mcms = OutputObj(
                    {'data': cmm.max_criteria_and_metric_scores.data}, 'mcms')
                m1 = mcms.data[1:]
                h0 = mcms.data[0]
                print "\nmcms", tabulate(m1, headers=h0)

            if doClassification:
                thms = OutputObj(cmm.thresholds_and_metric_scores, 'thms')
                cmms = OutputObj({'cm': cmm.confusion_matrices}, 'cmms')

                if 1 == 0:
                    print ""
                    for i, c in enumerate(cmms.cm):
                        print "\ncmms.cm[%s]" % i, tabulate(c)
                    print ""

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #22
    def test_GBM_basic(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'standard'
        trainFilename = 'covtype.shuffled.90pct.data'
        train_key = 'covtype.train.hex'
        model_key = 'GBMModelKey'
        timeoutSecs = 1800
        csvPathname = importFolderPath + "/" + trainFilename
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=train_key,
                                       timeoutSecs=timeoutSecs)

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': None,
            'score_each_iteration': True,
            'response_column': 'C55',
            'do_classification': True,
            # 'balance_classes':
            # 'max_after_balance_size':
            'ntrees': 2,
            'max_depth': 10,
            'min_rows': 3,
            'nbins': 40,
            'learn_rate': 0.2,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            'variable_importance': False,
            # 'seed':
        }

        model_key = 'covtype_gbm.hex'
        bmResult = h2o.n0.build_model(algo='gbm',
                                      destination_key=model_key,
                                      training_frame=parse_key,
                                      parameters=parameters,
                                      timeoutSecs=60)
        bm = OutputObj(bmResult, 'bm')

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=parse_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')
        print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=parse_key,
                                        timeoutSecs=60)
        mmResultShort = mmResult['model_metrics'][0]
        del mmResultShort['frame']  # too much!
        mm = OutputObj(mmResultShort, 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=parse_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #23
    def test_DL_basic(self):
        h2o.nodes[0].remove_all_keys()
        importFolderPath = "logreg"
        csvFilename = "benign.csv"
        hex_key = "benign.hex"
        csvPathname = importFolderPath + "/" + csvFilename

        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       check_header=1,
                                       timeoutSecs=180,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        expected = []
        allowedDelta = 0

        # no cols ignored
        labelListUsed = list(labelList)
        labelListUsed.remove('STR')
        numColsUsed = numCols - 1
        for trial in range(1):
            parameters = {
                # required now
                # loss enum True None [u'MeanSquare', u'CrossEntropy']
                'loss': 'CrossEntropy',
                'validation_frame': parse_key,  # KeyIndexed None
                'ignored_columns': '["STR"]',  # string[] None
                'response_column': 'FNDX',  # string None
                'balance_classes': None,  # boolean false
                'max_after_balance_size': None,  # float Infinity
                'keep_cross_validation_splits': None,  # boolean false
                'checkpoint': None,  # Key None
                'overwrite_with_best_model': None,  # boolean true
                'expert_mode': None,  # boolean false
                'autoencoder': None,  # boolean false
                # 'use_all_factor_levels': None, # boolean true
                # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout']
                'activation': None,  # enum Rectifier 
                'hidden': None,  # int[] [200, 200]
                'epochs': None,  # double 10.0
                'train_samples_per_iteration': None,  # long -2
                'target_ratio_comm_to_comp': None,  # double 0.02
                'seed': None,  # long 1679194146842485659
                'adaptive_rate': None,  # boolean true
                'rho': None,  # double 0.99
                'epsilon': None,  # double 1.0E-8
                'rate': None,  # double 0.005
                'rate_annealing': None,  # double 1.0E-6
                'rate_decay': None,  # double 1.0
                'momentum_start': None,  # double 0.0
                'momentum_ramp': None,  # double 1000000.0
                'momentum_stable': None,  # double 0.0
                'nesterov_accelerated_gradient': None,  # boolean true
                'input_dropout_ratio': None,  # double 0.0
                'hidden_dropout_ratios': None,  # double[] None (this can grid?)
                'l1': None,  # double 0.0
                'l2': None,  # double 0.0
                'max_w2': None,  # float Infinity
                'initial_weight_distribution': None,  # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal']
                'initial_weight_scale': None,  # double 1.0
                # 'loss' is already set to 'CrossEntropy' at the top of this dict;
                # a second entry here would silently override it with None
                # 'loss': None,  # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy']
                'score_interval': None,  # double 5.0
                'score_training_samples': None,  # long 10000
                'score_validation_samples': None,  # long 0
                'score_duty_cycle': None,  # double 0.1
                'classification_stop': None,  # double 0.0
                'regression_stop': None,  # double 1.0E-6
                'quiet_mode': None,  # boolean false
                'max_confusion_matrix_size': None,  # int 20
                'max_hit_ratio_k': None,  # int 10
                # duplicate entries (already set above), commented out:
                # 'balance_classes': None,  # boolean false
                'class_sampling_factors': None,  # float[] None
                # 'max_after_balance_size': None,  # float Infinity
                'score_validation_sampling': None,  # enum Uniform [u'Uniform', u'Stratified']
                'diagnostics': None,  # boolean true
                'variable_importances': None,  # boolean false
                'fast_mode': None,  # boolean true
                'ignore_const_cols': None,  # boolean true
                'force_load_balance': None,  # boolean true
                'replicate_training_data': None,  # boolean false
                'single_node_mode': None,  # boolean false
                'shuffle_training_data': None,  # boolean false
                'missing_values_handling': None,  # enum MeanImputation [u'Skip', u'MeanImputation']
                'sparse': None,  # boolean false
                'col_major': None,  # boolean false
                'average_activation': None,  # double 0.0
                'sparsity_beta': None,  # double 0.0
            }

            model_key = 'benign_dl.hex'
            bmResult = h2o.n0.build_model(algo='deeplearning',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            print "bmResult:", dump_json(bmResult)
            bm = OutputObj(bmResult, 'bm')

            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')
            print "model:", dump_json(model)

            cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                     frame=parse_key,
                                                     timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')

            mmResult = h2o.n0.model_metrics(model=model_key,
                                            frame=parse_key,
                                            timeoutSecs=60)
            mm = OutputObj(mmResult['model_metrics'][0], 'mm')

            prResult = h2o.n0.predict(model=model_key,
                                      frame=parse_key,
                                      timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

            h2o_cmd.runStoreView()
Example #24
    def test_DL_airlines_small(self):
        h2o.nodes[0].remove_all_keys()
        csvPathname_train = 'airlines/AirlinesTrain.csv.zip'
        csvPathname_test = 'airlines/AirlinesTest.csv.zip'
        hex_key = 'train.hex'
        validation_key = 'validation.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       doSummary=False)
        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)

        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs,
                                        doSummary=False)
        pAV = h2o_cmd.ParseObj(parseResultV)
        iAV = h2o_cmd.InspectObj(pAV.parse_key)

        # make a random id for the model key
        identifier = ''.join(
            random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'deeplearning_' + identifier + '.hex'
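        # e.g. identifier might come out as 'q7k2x0ab9z' (random.sample draws 10 distinct
        # characters), giving a model key like 'deeplearning_q7k2x0ab9z.hex'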

        parameters = {
            'validation_frame': validation_key,  # KeyIndexed None
            'ignored_columns': "['IsDepDelayed_REC']",  # string[] None
            'response_column': 'IsDepDelayed',  # string None
            'loss': 'CrossEntropy'
        }
        expectedErr = 0.32  ## expected validation error for the above model
        relTol = 0.15  ## 15% rel. error tolerance due to Hogwild!
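        # i.e. with expectedErr = 0.32 and relTol = 0.15, any actualErr in roughly
        # [0.272, 0.368] passes the relative-error check at the bottom of this test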

        timeoutSecs = 60
        start = time.time()

        bmResult = h2o.n0.build_model(algo='deeplearning',
                                      model_id=model_key,
                                      training_frame=hex_key,
                                      parameters=parameters,
                                      timeoutSecs=timeoutSecs)
        bm = OutputObj(bmResult, 'bm')

        print 'deep learning took', time.time() - start, 'seconds'

        modelResult = h2o.n0.models(key=model_key)
        model = OutputObj(modelResult['models'][0]['output'], 'model')
        #        print "model:", dump_json(model)

        cmmResult = h2o.n0.compute_model_metrics(model=model_key,
                                                 frame=validation_key,
                                                 timeoutSecs=60)
        cmm = OutputObj(cmmResult, 'cmm')

        mmResult = h2o.n0.model_metrics(model=model_key,
                                        frame=validation_key,
                                        timeoutSecs=60)
        mm = OutputObj(mmResult['model_metrics'][0], 'mm')

        prResult = h2o.n0.predict(model=model_key,
                                  frame=validation_key,
                                  timeoutSecs=60)
        pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')

        h2o_cmd.runStoreView()

        actualErr = model['errors']['valid_err']
        print "expected classification error: " + format(expectedErr)
        print "actual   classification error: " + format(actualErr)

        if actualErr != expectedErr and abs(
            (expectedErr - actualErr) / expectedErr) > relTol:
            raise Exception(
                "Scored classification error of %s is not within %s %% relative error of %s"
                % (actualErr, float(relTol) * 100, expectedErr))
    def test_GBMGrid_basic_many(self):
        trainFilename = 'prostate.csv'
        train_key = 'prostate.hex'
        timeoutSecs = 300
        csvPathname = "logreg/" + trainFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put')

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': "['ID']", # this has to have []
            'response_column': 'CAPSULE',
            # 'balance_classes':
            # 'max_after_balance_size':
            # ??
            # 'ntrees': '[8, 10]',
            'ntrees': 8,
            # 'max_depth': '[8, 9]',
            'max_depth': 8,
            # ??
            # 'min_rows': '[1, 2]',
            'min_rows': 1,
            'nbins': 40,
            # ??
            # 'learn_rate': "[0.1, 0.2]",
            'learn_rate': 0.1,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed': 
        }

        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0

        for i in range(5):
            modelKey = 'GBMGrid_prostate_%s' % i
            bmResult = h2o.n0.build_model(
                algo='gbm',
                destination_key=modelKey,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')
            print "GBMResult:", h2o.dump_json(bm)

            # FIX! is this right for gridded? 
            job_key = bm.jobs[0].key.name
            # FIX! this isn't a full formed name (%)
            model_key = bm.jobs[0].dest.name
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start
        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs

        for job_key, model_key in jobs:
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame'] # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')