def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = {"ntrees": 2, "max_depth": 300, "nbins": 200, "timeoutSecs": 600, "response": "C55"}

        paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}

        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #2
0
    def test_RF(self):
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #3
0
    def test_RF(self):
        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #4
0
    def test_RF(self):
	normalRF = False
	#normalRF = True

	print """
Normal RF : {0}
Train data: {1}
Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])

	print "Loading data...."
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
	
	print "Running normal RF: {0}".format(normalRF)
	if normalRF:
        	trainResult = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
	else:
        	trainResult = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)
	
	print """
Normal RF : {0}
Train data: {1}
Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])
        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
Пример #5
0
    def test_RF(self):
        h2o.beta_features = True

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            # 'sample_rate': 1.0,
            'sample_rate': 1.0, 
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Пример #6
0
    def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        self.assertEqual(4.29, classification_error1)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        self.assertEqual(4.29, classification_error2)
        self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Пример #7
0
    def test_c8_rf_airlines_hdfs(self):
        # Enables h2o.beta_features, trains RF on the frame returned by
        # loadTrainData and scores it on the frame from loadScoreData.
        # The responses are not inspected here.
        h2o.beta_features = True

        parsedTrain = self.loadTrainData()
        rfModel = h2o_rf.trainRF(parsedTrain, **paramsTrainRF.copy())

        parsedScore = self.loadScoreData()
        scoreResult = h2o_rf.scoreRF(parsedScore, rfModel, **paramsScoreRF.copy())
    def test_c8_rf_airlines_hdfs(self):
        # Trains RF on the training frame, scores it on the score frame, then
        # calls h2i.delete_keys_at_all_nodes. The responses are not inspected.
        parsedTrain = self.loadTrainData()
        trainArgs = paramsTrainRF.copy()
        rfModel = h2o_rf.trainRF(parsedTrain, **trainArgs)

        parsedScore = self.loadScoreData()
        scoreArgs = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(parsedScore, rfModel, **scoreArgs)

        # Key cleanup, with a 600 second timeout.
        h2i.delete_keys_at_all_nodes(timeoutSecs=600)
Пример #9
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainKey, **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
Пример #10
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainKey, **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
    def test_c8_rf_airlines_hdfs(self):
        trainParseResult = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
Пример #12
0
    def test_rf_iris(self):
        # Train RF
        trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='train_iris2.hex', schema='put')
        kwargs = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='score_iris2.hex', schema='put')
        kwargs = paramsTestRF.copy()
        scoreResult  = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
Пример #13
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        scoreKey = self.loadScoreData()
        #time.sleep(3600)
        executeNormalRF = True
        executeNormalRF = False
        if executeNormalRF:
            kwargs   = paramsTrainRF.copy()
            trainResultNormal = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
            #print h2o_rf.pp_rf_result(trainResultNormal)
            kwargs   = paramsScoreRF.copy()
            scoreResultNormal = h2o_rf.scoreRF(scoreKey, trainResultNormal, **kwargs)
            print "\nScoring normal forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultNormal))

        kwargs   = paramsTrainRF.copy()
        trainResultRefined = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)
        #print h2o_rf.pp_rf_result(trainResultRefined)
        kwargs   = paramsScoreRF.copy()
        scoreResultRefined = h2o_rf.scoreRF(scoreKey, trainResultRefined, **kwargs)
        print "\nScoring refined forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultRefined))

        time.sleep(3600)
Пример #14
0
    def test_rf_iris(self):
        # Train RF
        trainParseResult = h2i.import_parse(bucket='smalldata',
                                            path='iris/iris2.csv',
                                            hex_key='train_iris2.hex',
                                            schema='put')
        kwargs = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = h2i.import_parse(bucket='smalldata',
                                            path='iris/iris2.csv',
                                            hex_key='score_iris2.hex',
                                            schema='put')
        kwargs = paramsTestRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult))
Пример #15
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = { 
                'ntrees': 10, 
                'max_depth': 300,
                'nbins': 200,
                'timeoutSecs': 600,
                'response': 'C55',
            }

            paramsScoreRF = {
                'vactual': 'C55',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = { 
                'use_non_local_data' : 1,
                'ntree'      : 10, 
                'depth'      : 300,
                'bin_limit'  : 20000,
                'stat_type'  : 'ENTROPY',
                'out_of_bag_error_estimate': 1, 
                'exclusive_split_limit'    : 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used) 
                # in case a dataset doesn't use last col 
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0, 
            }

        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #16
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 10,
                'max_depth': 300,
                'nbins': 200,
                'timeoutSecs': 600,
                'response': 'C55',
            }

            paramsScoreRF = {
                'vactual': 'C55',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used)
                # in case a dataset doesn't use last col
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #17
0
    def test_RF(self):
        h2o.beta_features = True
        paramsTrainRF = { 
            'seed': '1234567890',
            # if I use 100, and just one tree, I should get same results for sorted/shuffled?
            # i.e. the bagging always sees everything. Means oobe will be messed up
            # so will specify validation = the 10pct holdout data (could reuse the training data?)
            'sample_rate': 1.0,
            'ntrees': 3, 
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C55',
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # 90% data
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        (classification_error1, classErrorPctList1, totalScores1) = h2o_rf.simpleCheckRFView(rfv=trainResult1)
        # self.assertEqual(4.29, classification_error1)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList1)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
        self.assertEqual(58101, totalScores1)

        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)

        # 10% data
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        (classification_error2, classErrorPctList2, totalScores2) = h2o_rf.simpleCheckRFView(rfv=trainResult2)
        # self.assertEqual(4.29, classification_error2)
        # self.assertEqual([4.17, 2.98, 4.09, 14.91, 21.12, 15.38, 5.22], classErrorPctList2)
        # with new RNG 9/26/14
        self.assertEqual(4.4, classification_error1)
        self.assertEqual([3.71, 3.56, 4.32, 18.55, 21.22, 13.51, 5.82], classErrorPctList1)
        self.assertEqual(58101, totalScores2)

        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)

      
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception ("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
Пример #18
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data' : 1,
                'ntree'      : 10,
                'depth'      : 300,
                'bin_limit'  : 20000,
                'stat_type'  : 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit'    : 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used) 
                # in case a dataset doesn't use last col 
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }


        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
Пример #19
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                'ntrees': 3,
                'max_depth': 10,
                'nbins': 50,
                'timeoutSecs': 600,
                'response': 'C54',
                'classification': 1,
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = {
                'use_non_local_data': 1,
                'ntree': 10,
                'depth': 300,
                'bin_limit': 20000,
                'stat_type': 'ENTROPY',
                'out_of_bag_error_estimate': 1,
                'exclusive_split_limit': 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used)
                # in case a dataset doesn't use last col
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1,
                                   trainResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1,
                                   scoreResult2,
                                   with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Пример #20
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = {
                "ntrees": 3,
                "max_depth": 10,
                "nbins": 50,
                "timeoutSecs": 600,
                "response": "C55",
                "classification": 1,
            }

            paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}

        else:
            paramsTrainRF = {
                "use_non_local_data": 1,
                "ntree": 10,
                "depth": 300,
                "bin_limit": 20000,
                "stat_type": "ENTROPY",
                "out_of_bag_error_estimate": 1,
                "exclusive_split_limit": 0,
                "timeoutSecs": 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used)
                # in case a dataset doesn't use last col
                "response_variable": None,
                "timeoutSecs": 60,
                "out_of_bag_error_estimate": 0,
            }

        # train1
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key="scoreDS1.hex", verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key="Predict.hex", verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key="scoreDS2.hex", verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key="Predict.hex", verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

        if 1 == 0:
            print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)

            print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
            df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
Пример #21
0
    def test_RF(self):

        paramsTrainRF = {
            'seed': '1234567890',
            'ntrees': 1,
            'max_depth': 10,
            # 'sample_rate': 1.0,
            'sample_rate': 1.0,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C55',
            'classification': 1,
        }

        paramsScoreRF = {
            'vactual': 'C55',
            'timeoutSecs': 600,
        }

        # train1
        trainKey1 = self.loadData(trainDS1)
        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain1\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult1,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore1\n=========+"
        print h2o.dump_json(scoreResult1)
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult1,
                                  noPrint=False,
                                  **kwargs)

        # train2
        trainKey2 = self.loadData(trainDS2)
        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
        kwargs = paramsScoreRF.copy()
        h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        h2o_cmd.runInspect(key='Predict.hex', verbose=True)
        print "\nTrain2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=trainResult2,
                                  noPrint=False,
                                  **kwargs)
        print "\nScore2\n=========="
        h2o_rf.simpleCheckRFScore(node=None,
                                  rfv=scoreResult2,
                                  noPrint=False,
                                  **kwargs)

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        # should only be two diffs
        if len(df.difference) > 2:
            raise Exception(
                "Too many diffs in JsonDiff sorted vs non-sorted %s" %
                len(df.difference))