def test_RF(self):
    """Train and score RF on two dataset pairs and print JSON diffs of the responses.

    Each pair (trainDS1/scoreDS1, then trainDS2/scoreDS2) is loaded with
    self.loadData, trained with h2o_rf.trainRF and scored with
    h2o_rf.scoreRF. The train and score responses of the two runs are then
    compared with h2o_util.JsonDiff. Nothing is asserted; every result is
    only written to stdout.
    """
    # Run 1: train, then score against the model that was just built.
    first_train_frame = self.loadData(trainDS1)
    first_train_json = h2o_rf.trainRF(first_train_frame, **paramsTrainRF.copy())
    first_score_frame = self.loadData(scoreDS1)
    first_score_json = h2o_rf.scoreRF(
        first_score_frame, first_train_json, **paramsScoreRF.copy())
    print("\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(first_train_json)))
    print("\nScore1\n========={0}".format(h2o_rf.pp_rf_result(first_score_json)))

    # Run 2: identical steps on the second dataset pair.
    second_train_frame = self.loadData(trainDS2)
    second_train_json = h2o_rf.trainRF(second_train_frame, **paramsTrainRF.copy())
    second_score_frame = self.loadData(scoreDS2)
    second_score_json = h2o_rf.scoreRF(
        second_score_frame, second_train_json, **paramsScoreRF.copy())
    print("\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(second_train_json)))
    print("\nScore2\n========={0}".format(h2o_rf.pp_rf_result(second_score_json)))

    # Compare the raw JSON responses of the two runs. The label and the dump
    # are joined with %s so the output matches a two-item print statement.
    print("\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)")
    train_diff = h2o_util.JsonDiff(first_train_json, second_train_json, with_values=True)
    print("%s %s" % ("df.difference:", h2o.dump_json(train_diff.difference)))

    print("\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)")
    score_diff = h2o_util.JsonDiff(first_score_json, second_score_json, with_values=True)
    print("%s %s" % ("df.difference:", h2o.dump_json(score_diff.difference)))
def test_RF(self):
    """Run RF train + score twice (dataset pair 1, then pair 2) and diff the JSON.

    The pretty-printed train/score results of both runs are printed, followed
    by the h2o_util.JsonDiff differences between run 1 and run 2 for the
    training responses and for the scoring responses. The test makes no
    assertions on the results.
    """
    # --- dataset pair 1 ---
    frame_a = self.loadData(trainDS1)
    train_opts = paramsTrainRF.copy()
    model_a = h2o_rf.trainRF(frame_a, **train_opts)

    holdout_a = self.loadData(scoreDS1)
    score_opts = paramsScoreRF.copy()
    scored_a = h2o_rf.scoreRF(holdout_a, model_a, **score_opts)

    report_train_a = h2o_rf.pp_rf_result(model_a)
    print("\nTrain1\n=========={0}".format(report_train_a))
    report_score_a = h2o_rf.pp_rf_result(scored_a)
    print("\nScore1\n========={0}".format(report_score_a))

    # --- dataset pair 2 ---
    frame_b = self.loadData(trainDS2)
    train_opts = paramsTrainRF.copy()
    model_b = h2o_rf.trainRF(frame_b, **train_opts)

    holdout_b = self.loadData(scoreDS2)
    score_opts = paramsScoreRF.copy()
    scored_b = h2o_rf.scoreRF(holdout_b, model_b, **score_opts)

    report_train_b = h2o_rf.pp_rf_result(model_b)
    print("\nTrain2\n=========={0}".format(report_train_b))
    report_score_b = h2o_rf.pp_rf_result(scored_b)
    print("\nScore2\n========={0}".format(report_score_b))

    # --- run 1 vs run 2 ---
    print("\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)")
    delta = h2o_util.JsonDiff(model_a, model_b, with_values=True)
    # "%s %s" reproduces the space-separated output of a two-item print.
    print("%s %s" % ("df.difference:", h2o.dump_json(delta.difference)))

    print("\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)")
    delta = h2o_util.JsonDiff(scored_a, scored_b, with_values=True)
    print("%s %s" % ("df.difference:", h2o.dump_json(delta.difference)))
def test_RF_1000trees(self):
    """Parse a covtype dataset from S3, train an RF on it, and print timings.

    The dataset is parsed with h2o_cmd.parseS3File from the bucket returned by
    self.s3_default_bucket(), then h2o_cmd.runRFOnly is run on the parsed key.
    Wall-clock time for each step and the pretty-printed RF result are written
    to stdout; nothing is asserted.
    """
    # NAs cause CM to zero..don't run for now
    ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
    s3bucket = self.s3_default_bucket()

    # The original code assigned s3dataset seven times in a row; only the last
    # value was ever used. The overwritten candidates were:
    #   covtype20x.data.gz, covtype.data, covtype200x.data.gz,
    #   covtype50x.data, covtype100x.data, covtype.20k.data
    s3dataset = "covtype.data"

    start = time.time()
    parseKey = h2o_cmd.parseS3File(bucket=s3bucket, filename=s3dataset, timeoutSecs=14800)
    print("Parsing took {0}".format(time.time() - start))

    start = time.time()
    # NOTE(review): the method name says 1000 trees but ntree is 100 -- confirm
    # which one is intended.
    rf_train = h2o_cmd.runRFOnly(
        parseKey=parseKey,
        ntree=100,
        timeoutSecs=14800,
        bin_limit=20000,
        out_of_bag_error_estimate=1,
        gini=0,
        depth=100,
        exclusive_split_limit=0,
    )
    print("Computation took {0} sec".format(time.time() - start))
    print(h2o_rf.pp_rf_result(rf_train))
def test_RF_1000trees(self):
    """Parse a covtype dataset from S3, run RF on the parse result, and print timings.

    Uses h2o_cmd.parseS3File on the bucket from self.s3_default_bucket(), then
    h2o_cmd.runRF with stat_type='ENTROPY'. The elapsed time of each step and
    the pretty-printed RF result go to stdout; the test asserts nothing.
    """
    # NAs cause CM to zero..don't run for now
    ### csvPathnamegz = h2o.find_file('smalldata/hhp_9_17_12.predict.100rows.data.gz')
    s3bucket = self.s3_default_bucket()

    # Previously s3dataset was reassigned several times before use, so only
    # the final value mattered. Overwritten candidates, kept for reference:
    #   'covtype20x.data.gz', 'covtype.data', 'covtype200x.data.gz',
    #   'covtype50x.data', 'covtype100x.data', 'covtype.20k.data'
    s3dataset = 'covtype.data'

    start = time.time()
    parseResult = h2o_cmd.parseS3File(bucket=s3bucket, filename=s3dataset, timeoutSecs=14800)
    print("Parsing took {0}".format(time.time() - start))

    start = time.time()
    # NOTE(review): the method name says 1000 trees but ntree is 100 -- confirm
    # which one is intended.
    rf_train = h2o_cmd.runRF(
        parseResult=parseResult,
        ntree=100,
        timeoutSecs=14800,
        bin_limit=20000,
        out_of_bag_error_estimate=1,
        stat_type='ENTROPY',
        depth=100,
        exclusive_split_limit=0,
    )
    print("Computation took {0} sec".format(time.time() - start))
    print(h2o_rf.pp_rf_result(rf_train))
def test_RF(self):
    """Train either a normal or a refined RF, score it, and print the results.

    The local switch below selects the variant: the normal run uses
    model_key="rfm_normal", the other run passes refine=1 with
    model_key="rfm_refined". A banner naming the mode and the train/score
    dataset filenames is printed before loading and again before the results.
    """
    use_normal_rf = False
    #use_normal_rf = True

    # Formatted afresh at each print, exactly as the original did.
    banner = """ Normal RF : {0} Train data: {1} Test data : {2}"""
    print(banner.format(use_normal_rf, trainDS['filename'], scoreDS['filename']))

    print("Loading data....")
    train_frame = self.loadTrainData()
    train_opts = paramsTrainRF.copy()

    print("Running normal RF: {0}".format(use_normal_rf))
    if not use_normal_rf:
        model_json = h2o_rf.trainRF(
            train_frame, refine=1, model_key="rfm_refined", **train_opts)
    else:
        model_json = h2o_rf.trainRF(
            train_frame, model_key="rfm_normal", **train_opts)

    score_frame = self.loadScoreData()
    score_json = h2o_rf.scoreRF(score_frame, model_json, **paramsScoreRF.copy())

    print(banner.format(use_normal_rf, trainDS['filename'], scoreDS['filename']))
    print("\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(model_json)))
    print("\nScoring\n========={0}".format(h2o_rf.pp_rf_result(score_json)))
def test_RF(self):
    """Train an RF on the training data, score it on the score data, print both results.

    Training parameters come from a copy of paramsTrainRF and scoring
    parameters from a copy of paramsScoreRF. The test asserts nothing; it only
    prints the pretty-printed responses.
    """
    # Build the model.
    train_frame = self.loadTrainData()
    model_json = h2o_rf.trainRF(train_frame, **paramsTrainRF.copy())

    # Score the freshly loaded score data against that model.
    score_frame = self.loadScoreData()
    score_json = h2o_rf.scoreRF(score_frame, model_json, **paramsScoreRF.copy())

    train_report = h2o_rf.pp_rf_result(model_json)
    print("\nTrain\n=========={0}".format(train_report))
    score_report = h2o_rf.pp_rf_result(score_json)
    print("\nScoring\n========={0}".format(score_report))
def test_c8_rf_airlines_hdfs(self):
    """Train an RF from self.loadTrainData(), score on self.loadScoreData(), print results.

    paramsTrainRF and paramsScoreRF are copied before being passed as keyword
    arguments to h2o_rf.trainRF and h2o_rf.scoreRF. No assertions are made on
    the responses.
    """
    # Training phase.
    train_parse = self.loadTrainData()
    train_opts = paramsTrainRF.copy()
    rf_model = h2o_rf.trainRF(train_parse, **train_opts)

    # Scoring phase, using the model response from training.
    score_parse = self.loadScoreData()
    score_opts = paramsScoreRF.copy()
    rf_score = h2o_rf.scoreRF(score_parse, rf_model, **score_opts)

    print("\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(rf_model)))
    print("\nScoring\n========={0}".format(h2o_rf.pp_rf_result(rf_score)))
def test_rf_iris(self):
    """Train and score an RF on smalldata/iris/iris2.csv and print both results.

    The same CSV is imported twice via h2i.import_parse (schema='put'): once
    under hex_key 'train_iris2.hex' for training and once under
    'score_iris2.hex' for scoring. Training uses a copy of paramsTrainRF and
    scoring a copy of paramsTestRF. Nothing is asserted.
    """
    # Train RF
    train_parse = h2i.import_parse(
        bucket='smalldata',
        path='iris/iris2.csv',
        hex_key='train_iris2.hex',
        schema='put',
    )
    rf_model = h2o_rf.trainRF(train_parse, **paramsTrainRF.copy())

    # Score RF on a second import of the same file
    score_parse = h2i.import_parse(
        bucket='smalldata',
        path='iris/iris2.csv',
        hex_key='score_iris2.hex',
        schema='put',
    )
    rf_score = h2o_rf.scoreRF(score_parse, rf_model, **paramsTestRF.copy())

    print("\nTrain\n=========={0}".format(h2o_rf.pp_rf_result(rf_model)))
    print("\nScoring\n========={0}".format(h2o_rf.pp_rf_result(rf_score)))
def test_rf_iris(self):
    """Import iris2.csv twice, train an RF on one copy, score on the other, print results.

    Both imports use bucket 'smalldata', path 'iris/iris2.csv' and
    schema='put'; they differ only in hex_key. paramsTrainRF drives training
    and paramsTestRF drives scoring (each copied first). The test only prints
    the pretty-printed responses.
    """
    # Train RF
    iris_train = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
                                  hex_key='train_iris2.hex', schema='put')
    fit_kwargs = paramsTrainRF.copy()
    fit_json = h2o_rf.trainRF(iris_train, **fit_kwargs)

    # Score against the trained model
    iris_score = h2i.import_parse(bucket='smalldata', path='iris/iris2.csv',
                                  hex_key='score_iris2.hex', schema='put')
    predict_kwargs = paramsTestRF.copy()
    predict_json = h2o_rf.scoreRF(iris_score, fit_json, **predict_kwargs)

    fit_report = h2o_rf.pp_rf_result(fit_json)
    print("\nTrain\n=========={0}".format(fit_report))
    predict_report = h2o_rf.pp_rf_result(predict_json)
    print("\nScoring\n========={0}".format(predict_report))
def test_RF(self):
    """Train and score a refined RF (and optionally a normal one), printing the scores.

    Train and score data are loaded once up front. The refined forest is
    always trained with refine=1 and model_key="rfm_refined"; the normal
    forest (model_key="rfm_normal") is only run when executeNormalRF is True.
    Only the pretty-printed scoring results are printed; nothing is asserted.
    """
    trainKey = self.loadTrainData()
    scoreKey = self.loadScoreData()

    # Set to True to also train and score the normal forest. The original
    # assigned True and immediately overwrote it with False, so the effective
    # value has always been False.
    executeNormalRF = False
    if executeNormalRF:
        kwargs = paramsTrainRF.copy()
        trainResultNormal = h2o_rf.trainRF(trainKey, model_key="rfm_normal", **kwargs)
        kwargs = paramsScoreRF.copy()
        scoreResultNormal = h2o_rf.scoreRF(scoreKey, trainResultNormal, **kwargs)
        print("\nScoring normal forest\n========={0}".format(
            h2o_rf.pp_rf_result(scoreResultNormal)))

    kwargs = paramsTrainRF.copy()
    trainResultRefined = h2o_rf.trainRF(trainKey, refine=1, model_key="rfm_refined", **kwargs)
    kwargs = paramsScoreRF.copy()
    scoreResultRefined = h2o_rf.scoreRF(scoreKey, trainResultRefined, **kwargs)
    print("\nScoring refined forest\n========={0}".format(
        h2o_rf.pp_rf_result(scoreResultRefined)))

    # NOTE(review): this blocks the test for an hour after scoring. It looks
    # like a debugging aid -- confirm whether it should stay before this test
    # is run unattended.
    time.sleep(3600)
def test_RF(self):
    """Switch on h2o.beta_features, then train/score RF on two dataset pairs and diff the JSON.

    h2o.beta_features is set to True at the start (a module-level side
    effect), and the train/score parameter dicts are chosen according to that
    flag. Both dataset pairs (trainDS1/scoreDS1 and trainDS2/scoreDS2) are
    then trained and scored, their results printed, and the responses of the
    two runs compared with h2o_util.JsonDiff. Nothing is asserted.
    """
    h2o.beta_features = True

    if not h2o.beta_features:
        # Legacy parameter set, used when beta features are off.
        train_params = {
            'use_non_local_data' : 1,
            'ntree' : 10,
            'depth' : 300,
            'parallel' : 1,
            'bin_limit' : 20000,
            'stat_type' : 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit' : 0,
            'timeoutSecs': 60,
        }
        score_params = {
            # Scoring requires the response_variable. It defaults to the last
            # column, so normally it does not need to be specified. It is
            # listed here (and above, if used) in case a dataset does not use
            # the last column.
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }
    else:
        # Parameter set used when beta features are on.
        train_params = {
            'ntrees': 10,
            'max_depth': 300,
            'nbins': 200,
            'timeoutSecs': 600,
            'response': 'C54',
        }
        score_params = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }

    # Run 1.
    first_train_frame = self.loadData(trainDS1)
    first_train_json = h2o_rf.trainRF(first_train_frame, **train_params.copy())
    first_score_frame = self.loadData(scoreDS1)
    first_score_json = h2o_rf.scoreRF(
        first_score_frame, first_train_json, **score_params.copy())
    print("\nTrain1\n=========={0}".format(h2o_rf.pp_rf_result(first_train_json)))
    print("\nScore1\n========={0}".format(h2o_rf.pp_rf_result(first_score_json)))

    # Run 2.
    second_train_frame = self.loadData(trainDS2)
    second_train_json = h2o_rf.trainRF(second_train_frame, **train_params.copy())
    second_score_frame = self.loadData(scoreDS2)
    second_score_json = h2o_rf.scoreRF(
        second_score_frame, second_train_json, **score_params.copy())
    print("\nTrain2\n=========={0}".format(h2o_rf.pp_rf_result(second_train_json)))
    print("\nScore2\n========={0}".format(h2o_rf.pp_rf_result(second_score_json)))

    # Diff run 1 against run 2. "%s %s" reproduces the space-separated output
    # of a two-item print statement.
    print("\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)")
    train_diff = h2o_util.JsonDiff(first_train_json, second_train_json, with_values=True)
    print("%s %s" % ("df.difference:", h2o.dump_json(train_diff.difference)))

    print("\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)")
    score_diff = h2o_util.JsonDiff(first_score_json, second_score_json, with_values=True)
    print("%s %s" % ("df.difference:", h2o.dump_json(score_diff.difference)))