def test_rf_covtype_train_oobe_fvec(self):
    """Train/score RF on covtype in three row orders (original, shuffled, sorted)
    and assert the classification errors stay close across orderings.

    Only the shuffled run is checked against hard-coded expected results;
    the other two runs just contribute their scores to the cross-comparison.
    """
    h2o.beta_features = True  # use the fvec (beta) API path

    print "\nRun test iterations/compare with covtype.data"
    rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
    # since we created a binomial output class..look at the error rate for class 1
    ce1pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.shuffled.data"
    rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95)
    (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
    ce2pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.sorted.data"
    rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
    ce3pct1 = classErrorPctList[1]

    print "rfv3, from covtype.sorted.data"
    print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
    print "rfv1:", h2o.dump_json(rfv1)
    print "rfv3:", h2o.dump_json(rfv3)
    # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
    df = h2o_util.JsonDiff(rfv1, rfv3)
    print "df.difference:", h2o.dump_json(df.difference)

    # overall classification error must agree across row orderings
    # NOTE(review): rfv2 came from the *shuffled* file, so the "when sorted"
    # wording in this msg is misleading for the ce1/ce2 comparison
    self.assertAlmostEqual(ce1, ce2, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
    self.assertAlmostEqual(ce1, ce3, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))

    # we're doing separate test/train splits..so we're going to get variance
    # really should not do test/train split and use all the data? if we're comparing sorted or not?
    # but need the splits to be sorted or not. I think I have those files
    self.assertAlmostEqual(ce1pct1, ce2pct1, delta=7.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
    self.assertAlmostEqual(ce1pct1, ce3pct1, delta=7.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))
def test_RF(self):
    """Train and score RF on two datasets (presumably sorted vs non-sorted —
    trainDS1/trainDS2 are defined elsewhere in the file), JsonDiff the JSON
    responses, and fail if the results differ in more than two fields.
    """
    h2o.beta_features = True  # use the fvec (beta) API path
    paramsTrainRF = {
        'seed': '1234567890',  # fixed seed so the two runs are comparable
        'ntrees': 1,
        'max_depth': 10,
        # 'sample_rate': 1.0,
        'sample_rate': 1.0,
        'nbins': 50,
        'timeoutSecs': 600,
        'response': 'C55',
        'classification': 1,
    }
    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # train1
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    # validation frame (scoreKey1) is passed at training time
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    # NOTE: only the *scoring* diff is checked; the training diff above is
    # overwritten in df before this test runs
    if len(df.difference) > 2:
        raise Exception("Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_RF(self):
    """Train and score RF on two datasets and print the JsonDiff of the JSON
    responses (no assertion on the diff size in this variant).

    Keeps both the beta (fvec) and legacy parameter sets, selected by
    h2o.beta_features; the flag is forced True just above, so only the
    beta branch runs.
    """
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 3,
            'max_depth': 10,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C54',
            'classification': 1,
        }
        paramsScoreRF = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }
    else:
        # legacy (pre-fvec) RF parameter names
        paramsTrainRF = {
            'use_non_local_data' : 1,
            'ntree' : 10,
            'depth' : 300,
            'bin_limit' : 20000,
            'stat_type' : 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit' : 0,
            'timeoutSecs': 60,
        }
        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    # informational only: no assertion on the number of differences here
    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)
def test_RF(self):
    """Train and score RF on two datasets, JsonDiff the JSON responses, and
    fail if the scoring results differ in more than two fields.

    Same as the seeded C55 variant above except h2o.beta_features is not
    set here — it relies on whatever was configured before this test runs.
    """
    paramsTrainRF = {
        'seed': '1234567890',  # fixed seed so the two runs are comparable
        'ntrees': 1,
        'max_depth': 10,
        # 'sample_rate': 1.0,
        'sample_rate': 1.0,
        'nbins': 50,
        'timeoutSecs': 600,
        'response': 'C55',
        'classification': 1,
    }
    paramsScoreRF = {
        'vactual': 'C55',
        'timeoutSecs': 600,
    }

    # train1
    trainKey1 = self.loadData(trainDS1)
    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsTrainRF.copy()
    # validation frame (scoreKey1) is passed at training time
    trainResult1 = h2o_rf.trainRF(trainKey1, scoreKey1, **kwargs)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, scoreKey2, **kwargs)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
    df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
    print "df.difference:", h2o.dump_json(df.difference)

    # should only be two diffs
    # NOTE: only the *scoring* diff is checked; the training diff above is
    # overwritten in df before this test runs
    if len(df.difference) > 2:
        raise Exception(
            "Too many diffs in JsonDiff sorted vs non-sorted %s" % len(df.difference))
def test_RF(self):
    """Train and score RF on two datasets and sanity-check the results.

    The JsonDiff comparison at the end is disabled (guarded by
    ``if 1 == 0``), so this variant performs no response-diff check.
    """
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            "ntrees": 3,
            "max_depth": 10,
            "nbins": 50,
            "timeoutSecs": 600,
            "response": "C55",
            "classification": 1,
        }
        paramsScoreRF = {"vactual": "C55", "timeoutSecs": 600}
    else:
        # legacy (pre-fvec) RF parameter names
        paramsTrainRF = {
            "use_non_local_data": 1,
            "ntree": 10,
            "depth": 300,
            "bin_limit": 20000,
            "stat_type": "ENTROPY",
            "out_of_bag_error_estimate": 1,
            "exclusive_split_limit": 0,
            "timeoutSecs": 60,
        }
        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            "response_variable": None,
            "timeoutSecs": 60,
            "out_of_bag_error_estimate": 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key="scoreDS1.hex", verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key="Predict.hex", verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key="scoreDS2.hex", verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key="Predict.hex", verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    # dead code: deliberately disabled JsonDiff comparison (never executes)
    if 1 == 0:
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
def test_RF(self):
    """Train and score RF on two datasets (response C54) and sanity-check
    the results.

    The JsonDiff comparison at the end is disabled (guarded by
    ``if 1 == 0``), so this variant performs no response-diff check.
    """
    h2o.beta_features = True
    if h2o.beta_features:
        paramsTrainRF = {
            'ntrees': 3,
            'max_depth': 10,
            'nbins': 50,
            'timeoutSecs': 600,
            'response': 'C54',
            'classification': 1,
        }
        paramsScoreRF = {
            'vactual': 'C54',
            'timeoutSecs': 600,
        }
    else:
        # legacy (pre-fvec) RF parameter names
        paramsTrainRF = {
            'use_non_local_data': 1,
            'ntree': 10,
            'depth': 300,
            'bin_limit': 20000,
            'stat_type': 'ENTROPY',
            'out_of_bag_error_estimate': 1,
            'exclusive_split_limit': 0,
            'timeoutSecs': 60,
        }
        paramsScoreRF = {
            # scoring requires the response_variable. it defaults to last, so normally
            # we don't need to specify. But put this here and (above if used)
            # in case a dataset doesn't use last col
            'response_variable': None,
            'timeoutSecs': 60,
            'out_of_bag_error_estimate': 0,
        }

    # train1
    trainKey1 = self.loadData(trainDS1)
    kwargs = paramsTrainRF.copy()
    trainResult1 = h2o_rf.trainRF(trainKey1, **kwargs)

    scoreKey1 = self.loadData(scoreDS1)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS1.hex', verbose=True)
    scoreResult1 = h2o_rf.scoreRF(scoreKey1, trainResult1, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain1\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult1, noPrint=False, **kwargs)
    print "\nScore1\n=========+"
    print h2o.dump_json(scoreResult1)
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult1, noPrint=False, **kwargs)

    # train2
    trainKey2 = self.loadData(trainDS2)
    kwargs = paramsTrainRF.copy()
    trainResult2 = h2o_rf.trainRF(trainKey2, **kwargs)

    scoreKey2 = self.loadData(scoreDS2)
    kwargs = paramsScoreRF.copy()
    h2o_cmd.runInspect(key='scoreDS2.hex', verbose=True)
    scoreResult2 = h2o_rf.scoreRF(scoreKey2, trainResult2, **kwargs)
    h2o_cmd.runInspect(key='Predict.hex', verbose=True)

    print "\nTrain2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=trainResult2, noPrint=False, **kwargs)
    print "\nScore2\n=========="
    h2o_rf.simpleCheckRFScore(node=None, rfv=scoreResult2, noPrint=False, **kwargs)

    # dead code: deliberately disabled JsonDiff comparison (never executes)
    if 1 == 0:
        print "\nTraining: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(trainResult1, trainResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
        print "\nScoring: JsonDiff sorted data results, to non-sorted results (json responses)"
        df = h2o_util.JsonDiff(scoreResult1, scoreResult2, with_values=True)
        print "df.difference:", h2o.dump_json(df.difference)
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
    """Parse a covtype csv, do a 90/10 train/test split, train RF on the 90%
    and score on the 10%; return the scoring result dict.

    csvFilename          -- file under the 'standard' import folder
    checkExpectedResults -- if True, assert pct-right against the hard-coded
                            expected lists (only valid for the shuffled file)
    expectedAuc          -- passed through to h2o_cmd.runScore

    Depends on module-level paramDict, DO_MULTINOMIAL and ALLOWED_DELTA.
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numCols = inspect['numCols']
    numRows = inspect['numRows']

    # rowsForPct[i] = row count for i*10 percent of the dataset
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # index 0 is a placeholder; trials are 1-based
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    # single trial at 90% training; loop kept for easy extension to more pcts
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        response = "C55"
        # 90/10 split; optionally collapse the multinomial response to binomial on class 4
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90,
            outputClass=4, outputCol=numCols-1, changeToBinomial=not DO_MULTINOMIAL)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20

        start = time.time()
        # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs,
            validation=dataKeyTest, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        # oobeTrainPctRight = 100 * (1.0 - error)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        # score the held-out 10% against the trained model
        rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest, modelKey=model_key,
            vactual=response, vpredict=1, expectedAuc=expectedAuc)
        print h2o.dump_json(rfvScoring)
        h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        print "hello7"
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

    # summary: print actual results formatted so they can be pasted back in
    # as new expected lists if they drift
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    return rfvScoring
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True, expectedAuc=0.5):
    """Parse a covtype csv, do a 90/10 train/test split, train RF on the 90%
    and score on the 10%; return the scoring result dict.

    csvFilename          -- file under the 'standard' import folder
    checkExpectedResults -- if True, assert pct-right against the hard-coded
                            expected lists (only valid for the shuffled file)
    expectedAuc          -- passed through to h2o_cmd.runScore

    Depends on module-level paramDict, DO_MULTINOMIAL and ALLOWED_DELTA.
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                   path=csvPathname,
                                   hex_key=hex_key,
                                   timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numCols = inspect['numCols']
    numRows = inspect['numRows']

    # rowsForPct[i] = row count for i*10 percent of the dataset
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # index 0 is a placeholder; trials are 1-based
    expectTrainPctRightList = [
        0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
    ]
    expectScorePctRightList = [
        0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
    ]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    # single trial at 90% training; loop kept for easy extension to more pcts
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        response = "C55"
        # 90/10 split; optionally collapse the multinomial response to binomial on class 4
        h2o_cmd.createTestTrain(hex_key,
                                dataKeyTrain,
                                dataKeyTest,
                                trainPercent=90,
                                outputClass=4,
                                outputCol=numCols - 1,
                                changeToBinomial=not DO_MULTINOMIAL)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(
            trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20

        start = time.time()
        # have to pass validation= param to avoid getting no error results (since 100% sample..DRF2 doesn't like that)
        rfv = h2o_cmd.runRF(parseResult=sliceResult,
                            timeoutSecs=timeoutSecs,
                            validation=dataKeyTest,
                            **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        # oobeTrainPctRight = 100 * (1.0 - error)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        # score the held-out 10% against the trained model
        rfvScoring = h2o_cmd.runScore(dataKey=dataKeyTest,
                                      modelKey=model_key,
                                      vactual=response,
                                      vpredict=1,
                                      expectedAuc=expectedAuc)
        print h2o.dump_json(rfvScoring)
        h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        print "hello7"
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (
            rowsToUse * 100.0 / numRows), "pct. of all rows"

    # summary: print actual results formatted so they can be pasted back in
    # as new expected lists if they drift
    actualDelta = [
        abs(a - b)
        for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [
        abs(a - b)
        for a, b in zip(expectScorePctRightList, actualScorePctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    return rfvScoring
def test_rf_covtype_train_oobe_fvec(self):
    """Train/score RF on covtype in three row orders (original, shuffled, sorted)
    and assert the classification errors stay close across orderings.

    Only the shuffled run is checked against hard-coded expected results.
    This variant allows a wider per-class delta (10.0) than the other
    variant of this test, and does not set h2o.beta_features itself.
    """
    print "\nRun test iterations/compare with covtype.data"
    rfv1 = self.rf_covtype_train_oobe('covtype.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce1, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv1)
    # since we created a binomial output class..look at the error rate for class 1
    ce1pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.shuffled.data"
    rfv2 = self.rf_covtype_train_oobe('covtype.shuffled.data', checkExpectedResults=True, expectedAuc=0.95)
    (ce2, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv2)
    ce2pct1 = classErrorPctList[1]

    print "\nRun test iterations/compare with covtype.sorted.data"
    rfv3 = self.rf_covtype_train_oobe('covtype.sorted.data', checkExpectedResults=False, expectedAuc=0.95)
    (ce3, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFScore(rfv=rfv3)
    ce3pct1 = classErrorPctList[1]

    print "rfv3, from covtype.sorted.data"
    print "\nJsonDiff covtype.data rfv, to covtype.sorted.data rfv"
    print "rfv1:", h2o.dump_json(rfv1)
    print "rfv3:", h2o.dump_json(rfv3)
    # df = h2o_util.JsonDiff(rfv1, rfv3, with_values=True)
    df = h2o_util.JsonDiff(rfv1, rfv3)
    print "df.difference:", h2o.dump_json(df.difference)

    # overall classification error must agree across row orderings
    # NOTE(review): rfv2 came from the *shuffled* file, so the "when sorted"
    # wording in this msg is misleading for the ce1/ce2 comparison
    self.assertAlmostEqual(
        ce1, ce2, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce2))
    self.assertAlmostEqual(
        ce1, ce3, delta=0.5,
        msg="classification error %s isn't close to that when sorted %s" % (ce1, ce3))

    # we're doing separate test/train splits..so we're going to get variance
    # really should not do test/train split and use all the data? if we're comparing sorted or not?
    # but need the splits to be sorted or not. I think I have those files
    self.assertAlmostEqual(
        ce1pct1, ce2pct1, delta=10.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce2pct1))
    self.assertAlmostEqual(
        ce1pct1, ce3pct1, delta=10.0,
        msg="classErrorPctList[1] %s isn't close to that when sorted %s" % (ce1pct1, ce3pct1))