Example #1
    def test_rf_covtype_train_oobe_fvec(self):
        h2o.beta_features = True
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95)
        (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
        # since we created a binomial output class, look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95)
        (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95)
        (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(ce1, ce2, delta=0.5, msg="classification error %s isn't close to that when shuffled %s" % (ce1, ce2))
        self.assertAlmostEqual(ce1, ce3, delta=0.5, msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))


        # we're doing separate test/train splits, so we're going to get some variance.
        # really we should skip the test/train split and use all the data if we're comparing sorted vs. non-sorted,
        # but then the splits themselves would need sorted and non-sorted versions. I think I have those files.
        self.assertAlmostEqual(ce1pct1, ce2pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when shuffled %s" % (ce1pct1, ce2pct1))
        self.assertAlmostEqual(ce1pct1, ce3pct1, delta=7.0, msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
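The tolerance checks above use unittest's delta form of assertAlmostEqual: with delta=d the assertion passes when abs(first - second) <= d, regardless of decimal places. A minimal self-contained sketch (the error values are made up, standing in for ce1/ce2):

import unittest

class ToleranceSketch(unittest.TestCase):
    def test_delta_tolerance(self):
        ce1, ce2 = 3.2, 3.5  # hypothetical classification error percentages
        # passes: abs(3.2 - 3.5) = 0.3 <= 0.5
        self.assertAlmostEqual(ce1, ce2, delta=0.5,
            msg="classification error %s isn't close to %s" % (ce1, ce2))

if __name__ == '__main__':
    unittest.main()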
Example #2
    def test_RF(self):
        h2o.beta_features = True

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            'sample_rate': 1.0,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Example #3
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data' : 1,
                'ntree'      : 10,
                'depth'      : 300,
                'bin_limit'  : 20000,
                'stat_type'  : 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit'    : 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable; it defaults to the last column,
                # so normally we don't need to specify it. It's included here (and above, if used)
                # in case a dataset doesn't use the last column as the response
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }


        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Example #4
    def test_RF(self):

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            'sample_rate': 1.0,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception(
                "Too many diffs in JsonDiff sorted vs non-sorted %s" %
                len(df.difference))
Example #5
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                "ntrees": 3,
                "max_depth": 10,
                "nbins": 50,
                "timeoutSecs": 600,
                "response": "C55",
                "classification": 1,
            }

            paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}

        else:
            paramsTrainRF = {
                "use_non_local_data": 1,
                "ntree": 10,
                "depth": 300,
                "bin_limit": 20000,
                "stat_type": "ENTROPY",
                "out_of_bag_error_estimate": 1,
                "exclusive_split_limit": 0,
                "timeoutSecs": 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable; it defaults to the last column,
                # so normally we don't need to specify it. It's included here (and above, if used)
                # in case a dataset doesn't use the last column as the response
                "response_variable": None,
                "timeoutSecs": 60,
                "out_of_bag_error_estimate": 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key="scoreDS1.hex", verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key="Predict.hex", verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key="scoreDS2.hex", verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key="Predict.hex", verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Example #6
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable; it defaults to the last column,
                # so normally we don't need to specify it. It's included here (and above, if used)
                # in case a dataset doesn't use the last column as the response
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1,
                                   trainResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1,
                                   scoreResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Example #7
    def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
        # the expected results are only for the shuffled version,
        # since taking 10% samples etc. of this smallish dataset will vary between
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
            hex_key=hex_key, timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0,11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use entry 10 when the index is 0 (we copy entry 10 into entry 0 below)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
        expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]
        
        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial%10] 
            resultKey = "r_" + csvFilename + "_" + str(trial)
            
            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'

            response = "C55"
            h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90, outputClass=4, 
                outputCol=numCols-1, changeToBinomial=not DO_MULTINOMIAL)
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # have to pass the validation= param, otherwise we get no error results (DRF2 doesn't like a 100% sample)
            rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, validation=dataKeyTest, **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest, modelKey=model_key, 
                vactual=response, vpredict=1, expectedAuc=expectedAuc)
            print h2o.dump_json(rfvScoring)
            h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            print "hello7"
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

        actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"  
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        return rfvScoring
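h2o_cmd.createTestTrain runs inside H2O and isn't reproduced in these examples. Purely as a local illustration of what its parameters (trainPercent=90, outputClass=4, outputCol=numCols-1, changeToBinomial) suggest, a hypothetical stand-in might look like the sketch below; split_and_binarize is made up for illustration only:

import random

def split_and_binarize(rows, output_col, output_class, train_percent=90, seed=42):
    # Shuffle, split train/test by percentage, and relabel the output column to
    # 1 for the chosen class and 0 otherwise (a one-vs-rest "binomial" output).
    rng = random.Random(seed)
    rows = list(rows)
    rng.shuffle(rows)
    cut = len(rows) * train_percent // 100
    def binarize(row):
        row = list(row)
        row[output_col] = 1 if row[output_col] == output_class else 0
        return row
    return [binarize(r) for r in rows[:cut]], [binarize(r) for r in rows[cut:]]

# toy usage, with the class label in the last column:
# train, test = split_and_binarize([[0.1, 4], [0.2, 2], [0.3, 4], [0.4, 1]],
#                                  output_col=-1, output_class=4)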
Example #8
    def rf_covtype_train_oobe(self,
                              csvFilename,
                              checkExpectedResults=True,
                              expectedAuc=0.5):
        # the expected results are only for the shuffled version,
        # since taking 10% samples etc. of this smallish dataset will vary between
        # shuffled and non-shuffled datasets
        importFolderPath = "standard"
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       timeoutSecs=180)
        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        print "\n" + csvPathname, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        pct10 = int(numRows * .1)
        rowsForPct = [i * pct10 for i in range(0, 11)]
        # this can be slightly less than 10%
        last10 = numRows - rowsForPct[9]
        rowsForPct[10] = numRows
        # use mod below for picking "rows-to-do" in case we do more than 9 trials
        # use entry 10 when the index is 0 (we copy entry 10 into entry 0 below)
        rowsForPct[0] = rowsForPct[10]

        # 0 isn't used
        expectTrainPctRightList = [
            0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
        ]
        expectScorePctRightList = [
            0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
        ]

        # keep the 0 entry empty
        actualTrainPctRightList = [0]
        actualScorePctRightList = [0]

        trial = 0
        for rowPct in [0.9]:
            trial += 1
            # Not using this now (did use it for slicing)
            rowsToUse = rowsForPct[trial % 10]
            resultKey = "r_" + csvFilename + "_" + str(trial)

            # just do random split for now
            dataKeyTrain = 'rTrain.hex'
            dataKeyTest = 'rTest.hex'

            response = "C55"
            h2o_cmd.createTestTrain(hex_key,
                                    dataKeyTrain,
                                    dataKeyTest,
                                    trainPercent=90,
                                    outputClass=4,
                                    outputCol=numCols - 1,
                                    changeToBinomial=not DO_MULTINOMIAL)
            sliceResult = {'destination_key': dataKeyTrain}

            # adjust timeoutSecs with the number of trees
            kwargs = paramDict.copy()
            kwargs['destination_key'] = "model_" + csvFilename + "_" + str(
                trial)
            timeoutSecs = 30 + kwargs['ntrees'] * 20
            start = time.time()
            # have to pass the validation= param, otherwise we get no error results (DRF2 doesn't like a 100% sample)
            rfv = h2o_cmd.runRF(parseResult=sliceResult,
                                timeoutSecs=timeoutSecs,
                                validation=dataKeyTest,
                                **kwargs)

            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
            # oobeTrainPctRight = 100 * (1.0 - error)
            oobeTrainPctRight = 100 - error
            if checkExpectedResults:
                self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                    msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
            actualTrainPctRightList.append(oobeTrainPctRight)

            print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
            print "Or sorted by output class, so that the last 10% is the last few classes"
            rf_model = rfv['drf_model']
            used_trees = rf_model['N']
            data_key = rf_model['_dataKey']
            model_key = rf_model['_key']

            rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest,
                                          modelKey=model_key,
                                          vactual=response,
                                          vpredict=1,
                                          expectedAuc=expectedAuc)
            print h2o.dump_json(rfvScoring)
            h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            print "hello7"
            (error, classErrorPctList,
             totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
            fullScorePctRight = 100 - error

            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key=dataKeyTest)

            if checkExpectedResults:
                self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                    msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                        ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
            actualScorePctRightList.append(fullScorePctRight)

            print "Trial #", trial, "completed", "using %6.2f" % (
                rowsToUse * 100.0 / numRows), "pct. of all rows"

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualTrainPctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        actualDelta = [
            abs(a - b)
            for a, b in zip(expectScorePctRightList, actualScorePctRightList)
        ]
        niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
        print "maybe should update with actual. Remove single quotes"
        print "actualScorePctRightList =", niceFp
        niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
        print "actualDelta =", niceFp

        return rfvScoring
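A small worked illustration of the rowsForPct indexing used in the two functions above: entry 10 is forced to exactly numRows and copied into entry 0, so trial % 10 == 0 selects all rows while other trials select multiples of roughly 10% (the numRows value here is made up):

numRows = 1000
pct10 = int(numRows * .1)                          # 100
rowsForPct = [i * pct10 for i in range(0, 11)]     # [0, 100, 200, ..., 1000]
rowsForPct[10] = numRows                           # last entry is exactly numRows
rowsForPct[0] = rowsForPct[10]                     # so trial % 10 == 0 means "all rows"
for trial in (1, 5, 10, 12):
    print((trial, rowsForPct[trial % 10]))         # (1, 100) (5, 500) (10, 1000) (12, 200)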
Example #9
    def test_rf_covtype_train_oobe_fvec(self):
        print "\nRun test iterations/compare with covtype.data"
        rfv1 = self.rf_covtype_train_oobe('covtype.data',
                                          checkExpectedResults=False,
                                          expectedAuc=0.95)
        (ce1, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
        # since we created a binomial output class, look at the error rate for class 1
        ce1pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.shuffled.data"
        rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data',
                                          checkExpectedResults=True,
                                          expectedAuc=0.95)
        (ce2, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
        ce2pct1 = classErrorPctList[1]

        print "\nRun test iterations/compare with covtype.sorted.data"
        rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data',
                                          checkExpectedResults=False,
                                          expectedAuc=0.95)
        (ce3, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
        ce3pct1 = classErrorPctList[1]

        print "rfv3, from covtype.sorted.data"
        print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
        print "rfv1:", h2o.dump_json(rfv1)
        print "rfv3:", h2o.dump_json(rfv3)
        # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
        df = h2o_util.JsonDiff(rfv1, rfv3)
        print "df.difference:", h2o.dump_json(df.difference)

        self.assertAlmostEqual(
            ce1,
            ce2,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce2))
        self.assertAlmostEqual(
            ce1,
            ce3,
            delta=0.5,
            msg="classification error %s isn't close to that when sorted %s" %
            (ce1, ce3))

        # we're doing separate test/train splits, so we're going to get some variance.
        # really we should skip the test/train split and use all the data if we're comparing sorted vs. non-sorted,
        # but then the splits themselves would need sorted and non-sorted versions. I think I have those files.
        self.assertAlmostEqual(
            ce1pct1,
            ce2pct1,
            delta=10.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce2pct1))
        self.assertAlmostEqual(
            ce1pct1,
            ce3pct1,
            delta=10.0,
            msg="classErrorPctList[1] %s isn't close to that when sorted %s" %
            (ce1pct1, ce3pct1))