예제 #1
0
    def test_RF(self):
        trainKey1 = self.loadData(trainDS1)
        kwargs = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(
            h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
예제 #2
0
    def test_RF(self):
        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
예제 #3
0
    def test_RF_1000trees(self):
        # NAs cause CM to zero..don't run for now
        ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
        s3bucket = self.s3_default_bucket()
        s3dataset = "covtype20x.data.gz"
        s3dataset = "covtype.data"
        s3dataset = "covtype200x.data.gz"
        s3dataset = "covtype50x.data"
        s3dataset = "covtype100x.data"
        s3dataset = "covtype.20k.data"

        s3dataset = "covtype.data"

        start = time.time()
        parseKey = h2o_cmd.parseS3File(bucket=s3bucket, filename=s3dataset, timeoutSecs=14800)
        print "Parsing took {0}".format(time.time() - start)

        start = time.time()
        rf_train = h2o_cmd.runRFOnly(
            parseKey=parseKey,
            ntree=100,
            timeoutSecs=14800,
            bin_limit=20000,
            out_of_bag_error_estimate=1,
            gini=0,
            depth=100,
            exclusive_split_limit=0,
        )
        print "Computation took {0} sec".format(time.time() - start)
        print h2o_rf.pp_rf_result(rf_train)
예제 #4
0
    def test_RF_1000trees(self):
        # NAs cause CM to zero..don't run for now
        ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
        s3bucket = self.s3_default_bucket()
        s3dataset = 'covtype20x.data.gz'
        s3dataset = 'covtype.data'
        s3dataset = 'covtype200x.data.gz'
        s3dataset = 'covtype50x.data'
        s3dataset = 'covtype100x.data'
        s3dataset = 'covtype.20k.data'

        s3dataset = 'covtype.data'

        start = time.time()
        parseResult = h2o_cmd.parseS3File(bucket=s3bucket,
                                          filename=s3dataset,
                                          timeoutSecs=14800)
        print "Parsing took {0}".format(time.time() - start)

        start = time.time()
        rf_train = h2o_cmd.runRF(parseResult=parseResult,
                                 ntree=100,
                                 timeoutSecs=14800,
                                 bin_limit=20000,
                                 out_of_bag_error_estimate=1,
                                 stat_type='ENTROPY',
                                 depth=100,
                                 exclusive_split_limit=0)
        print "Computation took {0} sec".format(time.time() - start)
        print h2o_rf.pp_rf_result(rf_train)
예제 #5
0
    def test_RF(self):
	normalRF = False
	#normalRF = True

	print """
Normal RF : {0}
Train data: {1}
Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])

	print "Loading data...."
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
	
	print "Running normal RF: {0}".format(normalRF)
	if normalRF:
        	trainResult = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
	else:
        	trainResult = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)
	
	print """
Normal RF : {0}
Train data: {1}
Test data : {2}""".format(normalRF, trainDS['filename'], scoreDS['filename'])
        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
예제 #6
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainKey, **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
    def test_c8_rf_airlines_hdfs(self):
        trainParseResult = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
예제 #8
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        kwargs   = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainKey, **kwargs)

        scoreKey = self.loadScoreData()
        kwargs   = paramsScoreRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreKey, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
예제 #9
0
    def test_rf_iris(self):
        # Train RF
        trainParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='train_iris2.hex', schema='put')
        kwargs = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv', hex_key='score_iris2.hex', schema='put')
        kwargs = paramsTestRF.copy()
        scoreResult  = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(h2o_rf.pp_rf_result(scoreResult))
예제 #10
0
    def test_rf_iris(self):
        # Train RF
        trainParseResult = h2i.import_parse(bucket='smalldata',
                                            path='iris/iris2.csv',
                                            hex_key='train_iris2.hex',
                                            schema='put')
        kwargs = paramsTrainRF.copy()
        trainResult = h2o_rf.trainRF(trainParseResult, **kwargs)

        scoreParseResult = h2i.import_parse(bucket='smalldata',
                                            path='iris/iris2.csv',
                                            hex_key='score_iris2.hex',
                                            schema='put')
        kwargs = paramsTestRF.copy()
        scoreResult = h2o_rf.scoreRF(scoreParseResult, trainResult, **kwargs)

        print "\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(trainResult))
        print "\nScoring\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResult))
예제 #11
0
    def test_RF(self):
        trainKey = self.loadTrainData()
        scoreKey = self.loadScoreData()
        #time.sleep(3600)
        executeNormalRF = True
        executeNormalRF = False
        if executeNormalRF:
            kwargs   = paramsTrainRF.copy()
            trainResultNormal = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
            #print h2o_rf.pp_rf_result(trainResultNormal)
            kwargs   = paramsScoreRF.copy()
            scoreResultNormal = h2o_rf.scoreRF(scoreKey, trainResultNormal, **kwargs)
            print "\nScoring normal forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultNormal))

        kwargs   = paramsTrainRF.copy()
        trainResultRefined = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)
        #print h2o_rf.pp_rf_result(trainResultRefined)
        kwargs   = paramsScoreRF.copy()
        scoreResultRefined = h2o_rf.scoreRF(scoreKey, trainResultRefined, **kwargs)
        print "\nScoring refined forest\n========={0}".format(h2o_rf.pp_rf_result(scoreResultRefined))

        time.sleep(3600)
예제 #12
0
    def test_RF(self):
        h2o.beta_features = True

        if h2o.beta_features:
            paramsTrainRF = { 
                'ntrees': 10, 
                'max_depth': 300,
                'nbins': 200,
                'timeoutSecs': 600,
                'response': 'C54',
            }

            paramsScoreRF = {
                'vactual': 'C54',
                'timeoutSecs': 600,
            }

        else:
            paramsTrainRF = { 
                'use_non_local_data' : 1,
                'ntree'      : 10, 
                'depth'      : 300,
                'parallel'   : 1, 
                'bin_limit'  : 20000,
                'stat_type'  : 'ENTROPY',
                'out_of_bag_error_estimate': 1, 
                'exclusive_split_limit'    : 0,
                'timeoutSecs': 60,
            }

            paramsScoreRF = {
                # scoring requires the response_variable. it defaults to last, so normally
                # we don't need to specify. But put this here and (above if used) 
                # in case a dataset doesn't use last col 
                'response_variable': None,
                'timeoutSecs': 60,
                'out_of_bag_error_estimate': 0, 
            }

        trainKey1 = self.loadData(trainDS1)
        kwargs   = paramsTrainRF.copy()
        trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

        scoreKey1 = self.loadData(scoreDS1)
        kwargs   = paramsScoreRF.copy()
        scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
        print "\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(trainResult1))
        print "\nScore1\n========={0}".format(h2o_rf.pp_rf_result(scoreResult1))

        trainKey2 = self.loadData(trainDS2)
        kwargs   = paramsTrainRF.copy()
        trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

        scoreKey2 = self.loadData(scoreDS2)
        kwargs   = paramsScoreRF.copy()
        scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
        print "\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(trainResult2))
        print "\nScore2\n========={0}".format(h2o_rf.pp_rf_result(scoreResult2))

        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)

        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)