def test_rf_covtype_train_full_fvec(self):
    """Train fvec RF on the full covtype dataset and score it on the same data.

    Parses covtype.data, kicks off RF with noPoll=True, waits via the jobs
    API, then runs RFView on the training data and asserts the classification
    error is below 3 (training data is reused for validation, so the error
    is expected to be low).
    """
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=csvFilename + ".hex", timeoutSecs=180)
    for trial in range(1):
        # params is mutable. This is default.
        # NOTE(review): kwargs aliases the module-level paramDict (no .copy()),
        # so any mutation here would persist across trials/tests — confirm intended.
        kwargs = paramDict
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = kwargs['ntrees'] * 60
        start = time.time()
        print "Note train.csv is used for both train and validation"
        # dispatch RF without polling; completion is tracked through the jobs API below
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        # job_key is captured but not used afterward
        job_key = rfv['job_key']
        model_key = rfv['destination_key']
        # score on the training data itself (full scoring, not OOBE)
        rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=1, print_params=True)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        self.assertLess(classification_error, 3,
            "train.csv should have full classification error: %s < 3" % classification_error)
        print "Trial #", trial, "completed"
def test_rf_covtype_train_full_fvec(self):
    """Train fvec RF on full covtype data and score on the same data.

    Variant of the same test with a looser error bound (4.8) and no
    print_params on the RFView call.
    """
    h2o.beta_features = True
    csvFilename = 'covtype.data'
    csvPathname = 'standard/' + csvFilename
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put',
        hex_key=csvFilename + ".hex", timeoutSecs=180)
    for trial in range(1):
        # params is mutable. This is default.
        # NOTE(review): kwargs aliases paramDict directly (no copy); mutations would persist.
        kwargs = paramDict
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = kwargs['ntrees'] * 60
        start = time.time()
        print "Note train.csv is used for both train and validation"
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        # job_key is captured but not used afterward
        job_key = rfv['job_key']
        model_key = rfv['destination_key']
        rfv = h2o_cmd.runRFView(data_key=parseResult['destination_key'], model_key=model_key,
            timeoutSecs=timeoutSecs, retryDelaySecs=1)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        # hmm..just using defaults above in RF?
        self.assertLess(classification_error, 4.8,
            "train.csv should have full classification error: %s < 4.8" % classification_error)
        print "Trial #", trial, "completed"
def test_rf_big1_nopoll_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=30, schema='put') rfViewInitial = [] # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) kwargs['ntrees'] = 1 if OVERWRITE_RF_MODEL: print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" kwargs['ntrees'] += 1 kwargs['seed'] = random.randint(0, sys.maxint) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs) print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected first = None print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] ntree = rfView['ntree'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False) if first is None: # we'll use this to compare the others first = rfViewResult.copy() firstModelKey = model_key print "first", h2o.dump_json(first) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference)
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm) print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time() - start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def test_rf_covtype_fvec(self):
    """Dispatch a no-poll RF job on covtype via the import-folder path, then
    verify it completed by running RFView on the recorded model key."""
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
        header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)
        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, noPoll=True,
            rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print h2o.dump_json(rfResult)
        # FIX! are these already in there?
        # record what we dispatched so the completion loop below can view each model
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
    # NOTE(review): model keys are "model_<n>" but the wait pattern is 'RF_model' —
    # confirm pollWaitJobs actually matches these jobs.
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
def scoreRF(scoreParseKey, trainResult, **kwargs):
    """Score a previously trained RF model against scoreParseKey's dataset.

    Extra keyword args are forwarded to the RFView request; elapsed wall-clock
    time is stored under 'python_call_timer' in the returned response.
    """
    modelKey = trainResult['model_key']
    treeCount = trainResult['ntree']
    began = time.time()
    viewResult = h2o_cmd.runRFView(
        modelKey=modelKey,
        parseKey=scoreParseKey,
        ntree=treeCount,
        **kwargs)
    elapsed = time.time() - began
    h2o.verboseprint("RF score results: ", viewResult)
    h2o.verboseprint("RF computation took {0} sec".format(elapsed))
    viewResult['python_call_timer'] = elapsed
    return viewResult
def scoreRF(scoreParseResult, trainResult, vactual=None, timeoutSecs=120, **kwargs): # Run validation on dataset parseKey = scoreParseResult['destination_key'] if h2o.beta_features: # this is how we're supposed to do scorin? rfModelKey = trainResult['drf_model']['_key'] predictKey = 'Predict.hex' start = time.time() predictResult = h2o_cmd.runPredict( data_key=parseKey, model_key=rfModelKey, destination_key=predictKey, timeoutSecs=timeoutSecs, **kwargs) h2o_cmd.runInspect(key='Predict.hex', verbose=True) predictCMResult = h2o.nodes[0].predict_confusion_matrix( actual=parseKey, vactual=vactual, predict=predictKey, vpredict='predict', timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start cm = predictCMResult['cm'] # These will move into the h2o_gbm.py pctWrong = h2o_gbm.pp_cm_summary(cm); print "\nTest\n==========\n" print h2o_gbm.pp_cm(cm) scoreResult = predictCMResult else: ntree = trainResult['ntree'] rfModelKey = trainResult['model_key'] start = time.time() # NOTE: response_variable is required, and passed from kwargs here # out_of_bag_error_estimate=0 is required for scoring. H2O will assert if 1 and different data set # compared to training kwargs['out_of_bag_error_estimate'] = 0 scoreResult = h2o_cmd.runRFView(None, parseKey, rfModelKey, ntree=ntree, timeoutSecs=timeoutSecs, **kwargs) rftime = time.time()-start h2o.verboseprint("RF score results: ", scoreResult) h2o.verboseprint("RF computation took {0} sec".format(rftime)) scoreResult['python_call_timer'] = rftime return scoreResult
def scoreRF(scoreParseKey, trainResult, **kwargs):
    """Validation pass: RFView of the trained model over the scoring dataset.

    Records the elapsed wall-clock time under 'python_call_timer'.
    """
    modelKey = trainResult["model_key"]
    nTrees = trainResult["ntree"]
    t0 = time.time()
    dataKey = scoreParseKey["destination_key"]
    outcome = h2o_cmd.runRFView(None, dataKey, modelKey, nTrees, **kwargs)
    spent = time.time() - t0
    h2o.verboseprint("RF score results: ", outcome)
    h2o.verboseprint("RF computation took {0} sec".format(spent))
    outcome["python_call_timer"] = spent
    return outcome
def scoreRF(scoreParseKey, trainResult, **kwargs):
    """Score a trained RF model on the given parsed dataset via RFView.

    NOTE: response_variable is required and must arrive through kwargs.
    The call duration is attached to the response as 'python_call_timer'.
    """
    theModel = trainResult['model_key']
    theNtree = trainResult['ntree']
    clockStart = time.time()
    theData = scoreParseKey['destination_key']
    response = h2o_cmd.runRFView(None, theData, theModel, theNtree, **kwargs)
    clockElapsed = time.time() - clockStart
    h2o.verboseprint("RF score results: ", response)
    h2o.verboseprint("RF computation took {0} sec".format(clockElapsed))
    response['python_call_timer'] = clockElapsed
    return response
def scoreRF(scoreParseKey, trainResult, **kwargs):
    """Run RF validation (RFView) for trainResult's model on scoreParseKey.

    Stores the wall-clock time of the call as 'python_call_timer'.
    """
    rfModel = trainResult['model_key']
    trees = trainResult['ntree']
    begun = time.time()
    scoringData = scoreParseKey['destination_key']
    view = h2o_cmd.runRFView(None, scoringData, rfModel, trees, **kwargs)
    took = time.time() - begun
    h2o.verboseprint("RF score results: ", view)
    h2o.verboseprint("RF computation took {0} sec".format(took))
    view['python_call_timer'] = took
    return view
def test_rf_covtype_train_full_fvec(self):
    """Train fvec RF on full covtype data and score on the same data.

    Black-formatted variant of the 4.8-threshold test: parse, no-poll RF,
    wait via jobs API, RFView on the training data, assert error < 4.8.
    """
    h2o.beta_features = True
    csvFilename = "covtype.data"
    csvPathname = "standard/" + csvFilename
    parseResult = h2i.import_parse(
        bucket="home-0xdiag-datasets", path=csvPathname, schema="put", hex_key=csvFilename + ".hex", timeoutSecs=180
    )
    for trial in range(1):
        # params is mutable. This is default.
        # NOTE(review): kwargs aliases paramDict directly (no copy); mutations would persist.
        kwargs = paramDict
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = kwargs["ntrees"] * 60
        start = time.time()
        print "Note train.csv is used for both train and validation"
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
        h2o_jobs.pollStatsWhileBusy(timeoutSecs=timeoutSecs, retryDelaySecs=5)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
            (elapsed / timeoutSecs) * 100
        )
        # job_key is captured but not used afterward
        job_key = rfv["job_key"]
        model_key = rfv["destination_key"]
        rfv = h2o_cmd.runRFView(
            data_key=parseResult["destination_key"], model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1
        )
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        # hmm..just using defaults above in RF?
        self.assertLess(
            classification_error,
            4.8,
            "train.csv should have full classification error: %s < 4.8" % classification_error,
        )
        print "Trial #", trial, "completed"
def test_rf_big1_nopoll(self):
    """Dispatch several classic no-poll RF jobs back to back, then verify each.

    Records (data key, model key, ntree) per dispatch; after pollWaitJobs,
    runs RFView per recorded model and diffs every result against the first
    with JsonDiff.
    """
    csvFilename = 'hhp_107_01.data.gz'
    csvPathname = h2o.find_file("smalldata/" + csvFilename)
    key2 = csvFilename + ".hex"
    print "\n" + csvPathname
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15)
    rfViewInitial = []
    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete nopoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)
        kwargs['ntree'] = 7
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            kwargs['ntree'] += 1
        kwargs['seed'] = random.randint(0, sys.maxint)
        # FIX! what model keys do these get?
        # pick a random node to dispatch from
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key, timeoutSecs=300,
            noPoll=False if OVERWRITE_RF_MODEL else True, **kwargs)
        # FIX! are these already in there?
        # save what we dispatched so the verification loop can view each model
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = model_key
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)
        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)
    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True)
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False)
        if first is None:
            # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_rf_covtype20x_fvec(self):
    """Train RF on covtype20x (or covtype when DO_SMALL), then score, predict,
    and build a confusion matrix on separately-imported copies of the data.

    Imports the data twice (train/test) plus a third exec-copied key so the
    generate_predictions call is uncached. Supports both the beta (v2/fvec)
    and classic (v1) RF parameter sets.
    """
    h2o.beta_features = True
    importFolderPath = 'standard'
    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain
    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest
    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    # exec API differs between beta (str) and classic (expression)
    if h2o.beta_features:
        kwargs = {'str': execExpr, 'timeoutSecs': 15}
    else:
        kwargs = {'expression': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)
    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works
    # params is mutable. This is default.
    if h2o.beta_features:
        paramDict = drf2ParamDict
        params = {
            'ntrees': 20,
            'destination_key': 'RF_model'
        }
    else:
        paramDict = drf1ParamDict
        params = {
            'ntree': 20,
            'out_of_bag_error_estimate': 1,
            'model_key': 'RF_model'
        }
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    if h2o.beta_features:
        timeoutSecs = 30 + kwargs['ntrees'] * 60
    else:
        timeoutSecs = 30 + kwargs['ntree'] * 60
    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    print "\nRFView start after job completion"
    if h2o.beta_features:
        model_key = kwargs['destination_key']
        ntree = kwargs['ntrees']
    else:
        model_key = kwargs['model_key']
        ntree = kwargs['ntree']
    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree, timeoutSecs=timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # delta=50 around 50 accepts any error in [0, 100]; this is a smoke check, not a quality bound
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(
            data_key=parseKey,
            model_key=rfModelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C54',
            predict=predictKey,
            vpredict='predict',
            )
        cm = predictCMResult['cm']
        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_rf_covtype_train_oobe3(self):
    """RF on covtype with randomFilter-sampled train/test splits.

    Builds a 10% scoring key and a 90% training key via randomFilter, then
    for trials 1..9 slices increasing fractions of the training key, trains
    RF with OOBE on, asserts the OOBE accuracy against expected values, then
    scores the held-out 10% and asserts full-scoring accuracy too.
    """
    print "\nUse randomFilter to sample the dataset randomly. then slice it"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, key2=key2,
        header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])
    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    # NOTE(review): last10 is computed but unused; entry 10 is set to num_rows here
    # (a sibling variant sets it to last10) — confirm which is intended.
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]
    # expected accuracy (pct. right) per trial; index 0 unused
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]
    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # FIX! too many digits (10) in the 2nd param seems to cause stack trace
    execExpr = dataKeyTest + "=randomFilter(" + key2 + "," + str(pct10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
    execExpr = dataKeyTrain + "=randomFilter(" + key2 + "," + str(rowsForPct[9]) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]
    for trial in range(1,10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the freshly-sliced training key
        parseKey['destination_key'] = resultKey
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)
        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
    # summarize actual vs expected accuracy lists
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_RF_mnist_both(self): importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** # print "This is the 'ignore=' we'll use" # no longer use. depend on h2o to get it right. ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 25 params = { 'response_variable': 0, # 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'use_non_local_data': 1, # 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s %s leaves: %s expected: %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s %s depth: %s expected: %s pct. different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_covtype_train_oobe(self):
    """Train RF on growing slices (10%..90%) of shuffled covtype, checking the
    out-of-bag error estimate per slice, then score each model on the held-out
    last 10% and compare against hard-coded expected accuracy lists.

    NOTE(review): expected-value lists assume the shuffled dataset and the
    exact paramDict seed/tree settings -- confirm before tightening deltas.
    """
    print "\nMichal will hate me for another file needed: covtype.shuffled.data"
    importFolderPath = "/home/0xdiag/datasets"
    csvFilename = 'covtype.shuffled.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    # rowsForPct[i] = row count for i*10 percent of the dataset
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # this was with 10 trees
    # expectTrainPctRightList = [0, 85.27, 88.45, 89.99, 91.11, 91.96, 92.51, 93.03, 93.45, 93.78]
    # expectScorePctRightList = [0, 89.10, 91,90, 93.26, 94.25, 94.74, 95.10, 95.42, 95.72, 95.92]
    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1, 10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the slice so runRFOnly trains on it
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        # no_confusion_matrix=1&
        # clear_confusion_matrix=1
        ### dataKeyTest = data_key
        kwargs['clear_confusion_matrix'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # dump actuals/deltas so the expected lists can be refreshed by hand
    actualDelta = [abs(a-b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_RF_mnist_reals_fvec(self):
    """Parse MNIST-reals train/test pairs, train a 10-tree fvec RF on the
    training set, score it on the test set via RFView, and generate predictions.

    Expects ~9% classification error (asserted with delta=1.0).
    """
    h2o.beta_features = True
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0 # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + trainCsvFilename,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        # forRF=True returns the column list shaped for RF's ignored-cols param
        ignore_x = h2o_glm.goodXFromColumnInfo(y,
            key=parseResult['destination_key'], timeoutSecs=300, forRF=True)
        ntrees = 10
        params = {
            'response': 'C1',
            'ignored_cols_by_name': ignore_x,
            'ntrees': ntrees,
            'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc?
            'max_depth': 15,
            'sample_rate': 0.67,
            'destination_key': 'RF_model',
            'nbins': 1024,
            'seed': 784834182943470027,
            'importance': 0,
            'balance_classes': 0,
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_rf.simpleCheckRFView(None, rfv, **params)
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params)
        self.assertAlmostEqual(classification_error, 9, delta=1.0,
            msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key,
            data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_log(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 10, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult['destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult['destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntree'] * 20 start = time.time() # do oobe kwargs['out_of_bag_error_estimate'] = 1 kwargs['response_variable'] = colCount rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error']) expectTrainPctRight = 98 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=1) # RF score****************************************************** print "Now score with the 2nd random dataset" # pop the stuff from kwargs that were passing as params model_key = rfv['model_key'] kwargs.pop('model_key',None) data_key = rfv['data_key'] kwargs.pop('data_key',None) ntree = rfv['ntree'] kwargs.pop('ntree',None) kwargs['iterative_cm'] = 1 # do full scoring kwargs['out_of_bag_error_estimate'] = 0 rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=ntree) self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error']) expectScorePctRight = 99 self.assertAlmostEqual(fullScorePctRight,expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f"% (fullScorePctRight, expectScorePctRight), delta=1)
def test_rfview_score(self): csvPathnameTrain = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=15) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = { 'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0 } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # new web page for predict? throw it in here for now start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' 
kwargs['iterative_cm'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_rf_big1_nopoll(self): csvFilename = 'hhp_107_01.data.gz' csvPathname = h2o.find_file("smalldata/" + csvFilename) key2 = csvFilename + ".hex" print "\n" + csvPathname parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=15) firstRfView = None # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete nopoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" if OVERWRITE_RF_MODEL: kwargs['ntree'] = 7 + jobDispatch else: kwargs['ntree'] = 7 # don't change the seed if we're overwriting the model. It should get # different results just from changing the tree count kwargs['seed'] = random.randint(0, sys.maxint) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)] h2o_cmd.runRFOnly(node=randomNode, parseKey=parseKey, model_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) # FIX! are these already in there? 
rfView = {} rfView['data_key'] = key2 rfView['model_key'] = model_key rfView['ntree'] = kwargs['ntree'] print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch # we're going to compare rf results to previous as we go along (so we save rf view results h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # In this test we're waiting after each one, so we can save the RFView results for comparison to future print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntree = rfView['ntree'] # a = h2o.nodes[0].random_forest_view(data_key, model_key, noPoll=True) print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree, timeoutSecs=60, noPoll=False) if firstRfView is None: # we'll use this to compare the others firstRfView = rfViewResult.copy() firstModelKey = model_key print "firstRfView", h2o.dump_json(firstRfView) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertGreater(len(df.difference), 29, msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \ (len(df.difference), h2o.dump_json(df.difference)))
def test_rf_covtype_train_oobe3(self):
    """Like test_rf_covtype_train_oobe, but builds train/test sets with
    randomFilter (same seed 12345 for both) and then slices growing subsets of
    the train set; checks OOBE per slice, then full scoring on the test key.

    NOTE(review): rTest/rTrain come from two randomFilter calls with the same
    seed and different counts -- presumably overlapping samples; confirm
    whether that overlap is intended before trusting the score numbers.
    """
    print "\nUse randomFilter to sample the dataset randomly. then slice it"
    importFolderPath = "standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    print "\nUsing header=0 on the normal covtype.data"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # FIX! too many digits (10) in the 2nd param seems to cause stack trace
    execExpr = dataKeyTest + "=randomFilter(" + hex_key + "," + str(pct10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
    execExpr = dataKeyTrain + "=randomFilter(" + hex_key + "," + str(rowsForPct[9]) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1, 10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the slice so runRF trains on it
        parseResult['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)

        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=ntree)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # dump actuals/deltas so the expected lists can be refreshed by hand
    actualDelta = [abs(a-b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def test_rf_covtype20x(self):
    """Train RF (nopoll, no_confusion_matrix=1) on covtype20x parsed under one
    key, then time the first RFView separately from the job, and run three
    score/predict trials against independently parsed copies of the same data
    (separate B and C keys) so the scoring/predict paths are uncached.
    """
    importFolderPath = '/home/0xdiag/datasets/standard'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    csvFilenameTrain = 'covtype20x.data'
    key2 = 'covtype20x.data.A.hex'
    parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath,
        key2=key2, timeoutSecs=500)
    print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
    inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
    dataKeyTrain = parseKeyTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    csvFilenameTest = 'covtype20x.data'
    key2 = 'covtype20x.data.B.hex'
    parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath,
        key2=key2, timeoutSecs=500)
    print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
    print "Parse result['destination_key']:", parseKeyTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
    dataKeyTest = parseKeyTest['destination_key']
    dataKeyTest2 = 'covtype20x.data.C.hex'
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    resultExec = h2o_cmd.runExecOnly(expression=execExpr, timeoutSecs=15)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works
    # params is mutable. This is default.
    print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
    params = {'ntree': 6, 'parallel': 1, 'out_of_bag_error_estimate': 0,
        'no_confusion_matrix': 1, 'model_key': 'RF_model'}
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)
    start = time.time()
    rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=500, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['model_key']
    ntree = kwargs['ntree']
    start = time.time()
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        start = time.time()
        # predict against the 3rd (uncached) copy of the test data
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Shared driver: parse a covtype file, make a 90/10 train/test split via
    createTestTrain, train RF on the train split, check OOBE, score on the
    test split, and optionally assert against hard-coded expected accuracies.

    csvFilename -- covtype file name under the 'standard' bucket folder.
    checkExpectedResults -- skip the accuracy asserts when False (expected
        values only hold for the shuffled dataset; see comment below).
    Returns the scoring RFView response (rfvScoring).
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    numCols = inspect['numCols']
    numRows = inspect['numRows']
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [
        0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
    ]
    expectScorePctRightList = [
        0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
    ]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90,
            outputClass=4, outputCol=numCols - 1, changeToBinomial=not DO_MULTINOMIAL)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        # oobeTrainPctRight = 100 * (1.0 - error)
        # error here is already a percentage, so just subtract from 100
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees,
            timeoutSecs, retryDelaySecs=1, **kwargs)
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (
            rowsToUse * 100.0 / numRows), "pct. of all rows"

    # dump actuals/deltas so the expected lists can be refreshed by hand
    actualDelta = [
        abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [
        abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)
    ]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    return rfvScoring
def test_rfview_score(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.20k.hex", timeoutSecs=10) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset( 'UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=10) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = { 'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0 } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? 
# dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& # no_confusion_matrix=1& # clear_confusion_matrix=1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) print "Trial #", trial, "completed"
def test_rf_change_data_key_fvec(self):
    """Train RF on covtype.data, then repeatedly score the same model against a
    different dataset (covtype20x.data) to verify the model can be reused with
    a changed data key.
    """
    importFolderPath = 'standard'
    csvFilenameTrain = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # we could train on covtype, and then use covtype20x for test? or vice versa
    # parseResult = parseResult
    # dataKeyTest = dataKeyTrain
    csvFilenameTest = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    params = {'ntrees': 2, 'destination_key': 'RF_model'}
    # pickRandRfParams mutates params in place with randomized RF parameters
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 100
    start = time.time()
    # noPoll=True: dispatch the RF job asynchronously; completion is awaited below
    rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    # wait for the dispatched job, matching on the model's destination_key
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntrees = kwargs['ntrees']
    start = time.time()
    # first view on the TRAINING data (may be slower; later views may hit caching)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring: same model, different (20x) dataset
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntrees, timeoutSecs,
            out_of_bag_error_estimate=1, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
        # FIX! should update this expected classification error
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_rf_covtype_train_oobe(self):
    """Train RF on growing slices (10%..90%) of shuffled covtype, checking the
    out-of-bag error each time, then score each model on the held-out last 10%
    and compare both accuracies against hard-coded expected values.
    """
    print "\nMichal will hate me for another file needed: covtype.shuffled.data"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.shuffled.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    # rowsForPct[i] = row count for i*10 percent of the dataset
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1,10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the sliced training key for this trial
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # OOB accuracy (pct) from the classification error in the confusion matrix
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        # does this still exist?
        # no_confusion_matrix=1
        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].random_forest_predict(model_key=model_key, key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # dump actuals and deltas so the expected lists can be refreshed by hand
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Fvec variant of the train/score helper: random 90/10 split of covtype,
    train RF on the 90%, score on the 10%, optionally asserting both accuracies
    against expected values. Returns the last training RFView response.

    checkExpectedResults: skip the assertAlmostEqual checks when False (e.g. for
    the non-shuffled dataset, where sample composition differs).
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "standard"
    csvPathname = importFolderPath + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
    print "\n" + csvPathname, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = numRows - rowsForPct[9]
    # NOTE(review): entry 10 is set to ALL rows here, unlike the older variant
    # which used last10 (the final ~10%); last10 is then unused — confirm intended.
    rowsForPct[10] = numRows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    trial = 0
    for rowPct in [0.9]:
        trial += 1
        # Not using this now (did use it for slicing)
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r_" + csvFilename + "_" + str(trial)

        # just do random split for now
        dataKeyTrain = 'rTrain.hex'
        dataKeyTest = 'rTest.hex'
        # createTestTrain builds the 90/10 train/test keys in H2O
        createTestTrain(hex_key, dataKeyTrain, dataKeyTest, percent=0.90, outputClass=4, numCols=numCols)
        sliceResult = {'destination_key': dataKeyTrain}

        # adjust timeoutSecs with the number of trees
        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        timeoutSecs = 30 + kwargs['ntrees'] * 20
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv)
        # oobeTrainPctRight = 100 * (1.0 - error)
        # error is already a percentage here, so accuracy = 100 - error
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pull model identity out of the training response for scoring
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_selfKey']

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfvScoring)
        fullScorePctRight = 100 - error
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

    # dump actuals and deltas so the expected lists can be refreshed by hand
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    # return the last rfv done during training
    return rfv
def test_mnist8m_RF_bench(self):
    """Benchmark RF on mnist8m: parse train file, train+view, parse test file,
    score, and append wall-clock timings plus error rate to rfbench.csv.

    Relies on module-level `files` (train/test names), `configs` (RF params),
    and `csv_header` (CSV column names).
    """
    overallWallStart = time.time()
    importFolderPath = '/home/0xdiag/datasets/mnist/mnist8m'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    # append to rfbench.csv if it exists; otherwise create it with a header row
    output = None
    if not os.path.exists('rfbench.csv'):
        output = open('rfbench.csv', 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open('rfbench.csv', 'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
        dialect='excel', extrasaction='ignore', delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB

        #Train File Parsing#
        trainParseWallStart = time.time()
        print "Training file is: ", files['train']
        csvPathname = files['train']
        destKey = files['train'] + '.hex'
        parseKey = h2i.parseImportFolderFile(None, csvPathname, importFolderPath,
            key2=destKey, timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=120)
        trainParseWallTime = time.time() - trainParseWallStart
        #End Train File Parse#

        inspect = h2o.nodes[0].inspect(parseKey['destination_key'])
        row = {
            'java_heap_GB': java_heap_GB,
            'dataset': 'mnist8m',
            'nTrainRows': inspect['num_rows'],
            'nCols': inspect['num_cols'],
            #'nIgnoredCols':nIgnoredCols,'ignoredCols':ignoredCols,
            'trainParseWallTime': trainParseWallTime
        }

        #RF+RFView (train)#
        kwargs = configs.copy()
        trainRFStart = time.time()
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True, timeoutSecs=3600,
            pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        trainViewTime = time.time() - trainRFStart
        #End RF+RFView (train)#
        row.update({'trainViewTime': trainViewTime})
        h2o_rf.simpleCheckRFView(None, rfView, **kwargs)
        modelKey = rfView['model_key']

        #Test File Parsing#
        testParseWallStart = time.time()
        print "Testing file is: ", files['test']
        csvPathname = files['test']
        destKey = files['test'] + '.hex'
        parseKey = h2i.parseImportFolderFile(None, csvPathname, importFolderPath,
            key2=destKey, timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=120)
        testParseWallTime = time.time() - testParseWallStart
        #End Test File Parse#

        inspect = h2o.nodes[0].inspect(parseKey['destination_key'])
        row.update({'nTestRows': inspect['num_rows']})
        row.update({'testParseWallTime': testParseWallTime})
        modelKey = rfView['model_key']

        #RFView (score on test)#
        kwargs = configs.copy()
        testRFStart = time.time()
        kwargs.update({
            'model_key': modelKey,
            'ntree': 10,
            'out_of_bag_error_estimate': 1
        })
        rfView = h2o_cmd.runRFView(data_key=destKey, timeoutSecs=3600, doSimpleCheck=False, **kwargs)
        testViewTime = time.time() - testRFStart
        #End RFView (score on test)#

        pprint(rfView)
        errRate = rfView['confusion_matrix']['classification_error']
        row.update({'testViewTime': testViewTime})
        overallWallTime = time.time() - overallWallStart
        row.update({'overallWallTime': overallWallTime})
        row.update({'errRate': errRate})
        print row
        csvWrt.writerow(row)
        #h2o.nodes[0].remove_key(k)
    finally:
        output.close()
def test_rf_covtype_fvec(self):
    """Dispatch RF jobs on covtype without polling, wait for completion via the
    jobs API, then fetch each RFView afterwards using the saved key triples.
    """
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

    rfViewInitial = []
    for jobDispatch in range(1):
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(jobDispatch)
        # don't poll for fvec
        rfResult = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs,
            noPoll=True, rfView=False, **kwargs)
        elapsed = time.time() - start
        print "RF dispatch end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print h2o.dump_json(rfResult)

        # FIX! are these already in there?
        # save the keys needed to fetch the view once the job completes
        rfView = {}
        rfView['data_key'] = key2
        rfView['model_key'] = kwargs['model_key']
        rfView['ntree'] = kwargs['ntree']
        rfViewInitial.append(rfView)

        print "rf job dispatch end on ", csvPathname, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

    # NOTE(review): pattern 'RF_model' doesn't obviously match the
    # "model_<n>" keys dispatched above — confirm what the jobs list reports.
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        data_key = rfView['data_key']
        model_key = rfView['model_key']
        ntree = rfView['ntree']
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, ntree=ntree,
            timeoutSecs=60, noPoll=False)
def test_RF_mnist_both(self): csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ] # IMPORT********************************************** trial = 0 allDelta = [] importFolderPath = "mnist" for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + testCsvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + parsePattern parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=parsePattern, schema='local', timeoutSecs=300) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" y = 0 # first column is pixel value ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) ntree = 100 params = { 'response': y, 'ignored_cols': ignore_x, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 500, 'destination_key': 'RF_model', 'nbins': 1024, } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() kwargs = {'response': y} rfView = h2o_cmd.runRFView(data_key=testKey, model_key=modelKey, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual(classification_error, 0.028, delta=0.01, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['drf_model']['treesStats'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), expected = {'minLeaves': 4996, 'meanLeaves': 5064.1, 'maxLeaves': 5148} expected += {'minDepth': 21, 'meanDepth': 23.8, 'maxDepth': 25} for key in expected: delta = ((expected[key]- actual[key])/expected[key]) * 100 d = "seed: %s %s %s %s %s pct. different %s" % (params['seed'], key, actual[key], expected[key], delta) print d allDelta.append(d) # FIX! should change this to an assert? # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def run_rf(files,configs): overallWallStart = time.time() output = None #if not os.path.exists('rfbench.csv'): # output = open('rfbench.csv','w') # output.write(','.join(csv_header)+'\n') #else: # output = open('rfbench.csv','a') #csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, # dialect='excel', extrasaction='ignore',delimiter=',') #csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB #Train File Parsing# trainParseWallStart = time.time() print "Training file is: ", files['train'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['train'] hex_key = files['train'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) trainParseWallTime = time.time() - trainParseWallStart #End Train File Parse# inspect = h2o.nodes[0].inspect(parseResult['destination_key']) row = {'java_heap_GB':java_heap_GB,'dataset':'mnist8m', 'nTrainRows': inspect['num_rows'],'nCols':inspect['num_cols'], #'nIgnoredCols':nIgnoredCols,'ignoredCols':ignoredCols, 'trainParseWallTime':trainParseWallTime} #RF+RFView (train)# kwargs = configs.copy() trainRFStart = time.time() rfView = h2o_cmd.runRFOnly(parseResult=parseResult,rfView=True, timeoutSecs= 3600,pollTimeoutSecs= 60,retryDelaySecs = 2, **kwargs) trainViewTime = time.time() - trainRFStart #End RF+RFView (train)# row.update({'trainViewTime':trainViewTime}) h2o_rf.simpleCheckRFView(None, rfView, **kwargs) modelKey = rfView['model_key'] #Test File Parsing# testParseWallStart = time.time() print "Testing file is: ", files['test'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['test'] hex_key = files['test'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) testParseWallTime = time.time() - testParseWallStart #End Test File Parse# inspect = 
h2o.nodes[0].inspect(parseResult['destination_key']) row.update({'nTestRows':inspect['num_rows']}) row.update({'testParseWallTime':testParseWallTime}) modelKey = rfView['model_key'] #RFView (score on test)# kwargs = configs.copy() testRFStart = time.time() kwargs.update({'model_key':modelKey,'ntree':10}) rfView = h2o_cmd.runRFView(data_key=hex_key,timeoutSecs=180, doSimpleCheck=False,**kwargs) testViewTime = time.time() - testRFStart #End RFView (score on test)# pprint(rfView) errRate = rfView['confusion_matrix']['classification_error'] row.update({'testViewTime':testViewTime}) overallWallTime = time.time() - overallWallStart row.update({'overallWallTime':overallWallTime}) row.update({'errRate':errRate}) print row #csvWrt.writerow(row) #h2o.nodes[0].remove_key(k) finally: output.close()
def test_RF_mnist(self): importFolderPath = "/home/0xdiag/datasets/mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 10 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } kwargs = params.copy() timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? 
kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_change_data_key_fvec(self): h2o.beta_features = True importFolderPath = 'standard' csvFilenameTrain = 'covtype.data' csvPathname = importFolderPath + "/" + csvFilenameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key']) dataKeyTrain = parseResultTrain['destination_key'] print "Parse end", dataKeyTrain # we could train on covtype, and then use covtype20x for test? or vice versa # parseResult = parseResult # dataKeyTest = dataKeyTrain csvFilenameTest = 'covtype20x.data' csvPathname = importFolderPath + "/" + csvFilenameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500) print "Parse result['destination_key']:", parseResultTest['destination_key'] inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key']) dataKeyTest = parseResultTest['destination_key'] print "Parse end", dataKeyTest # train # this does RFView to understand when RF completes, so the time reported for RFView here, should be # considered the "first RFView" times..subsequent have some caching?. # unless the no_confusion_matrix works # params is mutable. This is default. 
params = { 'ntrees': 6, 'destination_key': 'RF_model' } colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntrees'] * 60 start = time.time() rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs) print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' ### print "rf response:", h2o.dump_json(rfv) start = time.time() h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5) print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' print "\nRFView start after job completion" model_key = kwargs['destination_key'] ntrees = kwargs['ntrees'] start = time.time() h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntrees, timeoutSecs) print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds' for trial in range(3): # scoring start = time.time() rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntrees, timeoutSecs, out_of_bag_error_estimate=1, retryDelaySecs=1) print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) # FIX! should update this expected classification error # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.' print "Trial #", trial, "completed"
def test_rf_mnist_both_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), # to see results a 2nd time ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list (importFolderResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=importFolderPath + "/*") ### print "importHDFSResult:", h2o.dump_json(importFolderResult) if 'files' in importFolderResult: succeededList = importFolderResult['files'] else: succeededList = importFolderResult['succeeded'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath+"/"+parsePattern, hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "Not using ignore from this..have to adjust cols?" h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) ntree = 2 params = { 'response': 'C1', # 'ignored_cols_by_name': ignore_x, 'ntrees': ntree, 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 20, 'sample_rate': 0.67, 'destination_key': 'RF_model', 'nbins': 100, 'importance': 0, 'balance_classes': 0, } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # print 'rfView:', h2o.dump_json(rfView) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['drf_model']['_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 
1 on oobe causes stack trace? kwargs = {'response': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) # training and test data are unique, so error won't be low? # self.assertAlmostEqual(classification_error, 0.0003, delta=0.0003, msg="Classification error %s differs too much" % classification_error) leaves = { 'min': rfView['drf_model']['treeStats']['minLeaves'], 'mean': rfView['drf_model']['treeStats']['meanLeaves'], 'max': rfView['drf_model']['treeStats']['maxLeaves'], } # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 537, 'mean': 1118.05, 'max': 1701} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l])/leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % (params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = { 'min': rfView['drf_model']['treeStats']['minDepth'], 'mean': rfView['drf_model']['treeStats']['meanDepth'], 'max': rfView['drf_model']['treeStats']['maxDepth'], } depthExpected = {'min': 20, 'mean': 20, 'max': 20} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l])/leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. 
different %s" % (params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult['destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult['destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount+1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) self.assertTrue(classification_error<=5.0, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 self.assertTrue(fullScorePctRight >= expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f"% (fullScorePctRight, expectScorePctRight), delta=5)
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Train RF on increasing slices of covtype with OOBE, then score the last 10%.

    csvFilename: covtype file to parse from the standard datasets folder
    checkExpectedResults: when True, assert pct-right against the hard-coded
        expected lists (valid only for the shuffled dataset — see below)
    Returns the last rfv (train RFView result) from the trial loop.
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on", csvFilename
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9] + 1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
    for trial in range(8, 9):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack so the RF will use the sliced result
        # FIX! don't use the sliced bit..use the whole data for rf training below
        ### parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
        # kwargs['model_key'] = "model"

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, "going into RF")
        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # OOBE pct-right from the training confusion matrix
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=dataKeyTest)
        h2o_cmd.infoFromInspect(inspect, "dataKeyTest")
        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                    ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / num_rows), "pct. of all rows"

    # dump actual-vs-expected so the expected lists can be refreshed by hand
    actualDelta = [abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    # return the last rfv done during training
    return rfv
def test_RF_mnist_both(self): importFolderPath = "/home/0xdiag/datasets/mnist_repl" csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing_0.csv.gz", 600, None, '*mnist_training*gz'), ] # IMPORT********************************************** # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list importFolderResult = h2i.setupImportFolder(None, importFolderPath) ### print "importHDFSResult:", h2o.dump_json(importFolderResult) succeededList = importFolderResult['files'] ### print "succeededList:", h2o.dump_json(succeededList) self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?") # why does this hang? can't look at storeview after import? print "\nTrying StoreView after the import folder" h2o_cmd.runStoreView(timeoutSecs=30) trial = 0 allDelta = [] for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath, key2=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] print "We won't use this pruning of x on test data. 
See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseKey = h2i.parseImportFolderFile(None, parsePattern, importFolderPath, key2=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseKey['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True) ntree = 100 params = { 'response_variable': 0, 'ignore': ignore_x, 'ntree': ntree, 'iterative_cm': 1, 'out_of_bag_error_estimate': 1, # 'data_key='mnist_training.csv.hex' 'features': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 
'exclusive_split_limit': None, 'depth': 2147483647, 'stat_type': 'ENTROPY', 'sampling_strategy': 'RANDOM', 'sample': 67, # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77', 'model_key': 'RF_model', 'bin_limit': 1024, # 'seed': 784834182943470027, 'parallel': 1, 'use_non_local_data': 0, 'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0', } if rfSeed is None: params['seed'] = random.randint(0, sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() # FIX! 1 on oobe causes stack trace? kwargs = {'response_variable': y} rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree, out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual( classification_error, 0.028, delta=0.01, msg="Classification error %s differs too much" % classification_error) leaves = rfView['trees']['leaves'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), leavesExpected = {'min': 4996, 'mean': 5064.1, 'max': 5148} for l in leaves: # self.assertAlmostEqual(leaves[l], leavesExpected[l], delta=10, msg="leaves %s %s %s differs too much" % (l, leaves[l], leavesExpected[l])) delta = ((leaves[l] - leavesExpected[l]) / leaves[l]) * 100 d = "seed: %s leaves %s %s %s pct. different %s" % ( params['seed'], l, leaves[l], leavesExpected[l], delta) print d allDelta.append(d) depth = rfView['trees']['depth'] depthExpected = {'min': 21, 'mean': 23.8, 'max': 25} for l in depth: # self.assertAlmostEqual(depth[l], depthExpected[l], delta=1, msg="depth %s %s %s differs too much" % (l, depth[l], depthExpected[l])) delta = ((depth[l] - depthExpected[l]) / leaves[l]) * 100 d = "seed: %s depth %s %s %s pct. different %s" % ( params['seed'], l, depth[l], depthExpected[l], delta) print d allDelta.append(d) # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions( model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_log_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (10000, 100, 'cA', 300), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) # CREATE test dataset****************************************************** csvFilename = 'syn_test_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) testParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Test Parse result['destination_key']:", testParseResult[ 'destination_key'] dataKeyTest = testParseResult['destination_key'] # CREATE train dataset****************************************************** csvFilename = 'syn_train_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) trainParseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=10) print "Train Parse result['destination_key']:", trainParseResult[ 'destination_key'] dataKeyTrain = trainParseResult['destination_key'] # RF train****************************************************** # adjust timeoutSecs with the number of trees # seems ec2 can be really slow kwargs = paramDict.copy() timeoutSecs = 30 + kwargs['ntrees'] * 20 start = time.time() # do oobe kwargs['response'] = "C" + str(colCount + 1) rfv = h2o_cmd.runRF(parseResult=trainParseResult, timeoutSecs=timeoutSecs, **kwargs) elapsed = time.time() - start print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \ "%d pct. 
of timeout" % ((elapsed/timeoutSecs) * 100) rf_model = rfv['drf_model'] used_trees = rf_model['N'] data_key = rf_model['_dataKey'] model_key = rf_model['_key'] (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) oobeTrainPctRight = 100.0 - classification_error expectTrainPctRight = 94 self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRight,\ msg="OOBE: pct. right for training not close enough %6.2f %6.2f"% (oobeTrainPctRight, expectTrainPctRight), delta=5) # RF score****************************************************** print "Now score with the 2nd random dataset" rfv = h2o_cmd.runRFView(data_key=dataKeyTest, model_key=model_key, timeoutSecs=timeoutSecs, retryDelaySecs=1) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfv, ntree=used_trees) self.assertTrue(classification_error <= 5.0, msg="Classification error %s too big" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) fullScorePctRight = 100.0 - classification_error expectScorePctRight = 94 self.assertTrue( fullScorePctRight >= expectScorePctRight, msg="Full: pct. right for scoring not close enough %6.2f %6.2f" % (fullScorePctRight, expectScorePctRight), delta=5)
def test_rf_covtype_train_oobe(self):
    """Split covtype randomly into ~90% train / ~10% test via randomBitVector+filter,
    then train RF with OOBE on growing slices of the train split and score the test split.
    """
    print "\nUse randomBitVector and filter to separate the dataset randomly"
    importFolderPath = "/home/0xdiag/datasets"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"

    print "\nUsing header=0 on the normal covtype.data"
    # don't import it, just so we don't have all the key names cluttering the view all
    # in the browser
    parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0,11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # this was with 10 trees
    # expectTrainPctRightList = [0, 85.27, 88.45, 89.99, 91.11, 91.96, 92.51, 93.03, 93.45, 93.78]
    # expectScorePctRightList = [0, 89.10, 91,90, 93.26, 94.25, 94.74, 95.10, 95.42, 95.72, 95.92]
    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # start at 90% rows + 1
    # randomBitVector(size,selected)
    # randomFilter(srcFrame,rows,seed)
    # filter(srcFrame,bitVect)
    ### h2b.browseTheCloud()
    # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?)
    # rbv: bit vector with last10 randomly-selected rows set (seed 12345)
    execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str(last10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10)

    # complement the bit vector
    execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)"
    h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10)

    # test split = rows selected by rbv; train split = the complement
    execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    ### time.sleep(3600)
    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1,10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial%10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack: point the parseKey at the sliced result so RF trains on it
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        # OOBE pct-right from the training confusion matrix
        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        # no_confusion_matrix=1&
        # clear_confusion_matrix=1
        ### dataKeyTest = data_key
        kwargs['clear_confusion_matrix'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight,expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)
        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # dump actual-vs-expected so the expected lists can be refreshed by hand
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print"expectTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "expectScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def run_rf(files,configs): overallWallStart = time.time() output = None #if not os.path.exists('rfbench.csv'): # output = open('rfbench.csv','w') # output.write(','.join(csv_header)+'\n') #else: # output = open('rfbench.csv','a') #csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, # dialect='excel', extrasaction='ignore',delimiter=',') #csvWrt.writeheader() try: java_heap_GB = h2o.nodes[0].java_heap_GB #Train File Parsing# trainParseWallStart = time.time() print "Training file is: ", files['train'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['train'] hex_key = files['train'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) trainParseWallTime = time.time() - trainParseWallStart #End Train File Parse# inspect = h2o.nodes[0].inspect(parseResult['destination_key']) row = {'java_heap_GB':java_heap_GB,'dataset':'mnist8m', 'nTrainRows': inspect['numRows'],'nCols':inspect['numCols'], #'nIgnoredCols':nIgnoredCols,'ignoredCols':ignoredCols, 'trainParseWallTime':trainParseWallTime} #RF+RFView (train)# kwargs = configs.copy() trainRFStart = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult,rfView=True, timeoutSecs= 3600,pollTimeoutSecs= 60,retryDelaySecs = 2, **kwargs) trainViewTime = time.time() - trainRFStart #End RF+RFView (train)# row.update({'trainViewTime':trainViewTime}) h2o_rf.simpleCheckRFView(None, rfView, **kwargs) modelKey = rfView['model_key'] #Test File Parsing# testParseWallStart = time.time() print "Testing file is: ", files['test'] importFolderPath = "mnist/mnist8m" csvPathname = importFolderPath + "/" + files['test'] hex_key = files['test'] + '.hex' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=120) testParseWallTime = time.time() - testParseWallStart #End 
Test File Parse# inspect = h2o.nodes[0].inspect(parseResult['destination_key']) row.update({'nTestRows':inspect['numRows']}) row.update({'testParseWallTime':testParseWallTime}) modelKey = rfView['model_key'] #RFView (score on test)# kwargs = configs.copy() testRFStart = time.time() kwargs.update({'model_key':modelKey,'ntree':10}) rfView = h2o_cmd.runRFView(data_key=hex_key,timeoutSecs=180, doSimpleCheck=False,**kwargs) testViewTime = time.time() - testRFStart #End RFView (score on test)# pprint(rfView) errRate = rfView['confusion_matrix']['classification_error'] row.update({'testViewTime':testViewTime}) overallWallTime = time.time() - overallWallStart row.update({'overallWallTime':overallWallTime}) row.update({'errRate':errRate}) print row #csvWrt.writerow(row) #h2o.nodes[0].remove_key(k) finally: output.close()
def test_rf_change_data_key(self):
    """Train RF on covtype, then repeatedly RFView/predict against a
    different (covtype20x) dataset, to check that scoring works when the
    data key differs from the training data key.
    """
    importFolderPath = '/home/0xdiag/datasets/standard'
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)

    # Parse the training dataset.
    csvFilenameTrain = 'covtype.data'
    parseKeyTrain = h2i.parseImportFolderFile(None, csvFilenameTrain, importFolderPath, timeoutSecs=500)
    print csvFilenameTrain, 'parse time:', parseKeyTrain['response']['time']
    inspect = h2o_cmd.runInspect(key=parseKeyTrain['destination_key'])
    dataKeyTrain = parseKeyTrain['destination_key']
    print "Parse end", dataKeyTrain

    # we could train on covtype, and then use covtype20x for test? or vice versa
    # parseKey = parseKey
    # dataKeyTest = dataKeyTrain
    # Parse the (larger) scoring dataset.
    csvFilenameTest = 'covtype20x.data'
    parseKeyTest = h2i.parseImportFolderFile(None, csvFilenameTest, importFolderPath, timeoutSecs=500)
    print csvFilenameTest, 'parse time:', parseKeyTest['response']['time']
    print "Parse result['destination_key']:", parseKeyTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseKeyTest['destination_key'])
    dataKeyTest = parseKeyTest['destination_key']
    print "Parse end", dataKeyTest

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    print "RF with no_confusion_matrix=1, so we can 'time' the RFView separately after job completion?"
    params = {
        'ntree': 6,
        'parallel': 1,
        'out_of_bag_error_estimate': 0,
        'no_confusion_matrix': 1,
        'model_key': 'RF_model'
    }
    # Randomize additional RF params from the module-level paramDict.
    colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow; serial RF (parallel falsy) gets 5x budget.
    timeoutSecs = 30 + kwargs['ntree'] * 60 * (kwargs['parallel'] and 1 or 5)

    # Dispatch RF without polling, then wait for the job separately so the
    # RFView time can be measured on its own.
    start = time.time()
    rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=180, pollTimeoutSecs=120, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['model_key']
    ntree = kwargs['ntree']
    start = time.time()
    # First view after training; may be the slow (uncached) one.
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring: view the model against the *test* data key each trial.
        start = time.time()
        h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            out_of_bag_error_estimate=1, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_rf_covtype_train_oobe2(self):
    """Split covtype randomly (randomBitVector + filter) into ~90% train /
    ~10% test, train RF with OOBE on growing slices of the train set, then
    fully score the held-out 10%, comparing against expected accuracy lists.
    """
    print "\nUse randomBitVector and filter to separate the dataset randomly"
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvFilename = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on the normal covtype.data"
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=100)
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = last10
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # Index i holds the expected pct-right after training on i*10% of rows;
    # index 0 is a placeholder (unused).
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    dataKeyTrain = "rTrain"
    # start at 90% rows + 1
    # randomBitVector(size,selected)
    # randomFilter(srcFrame,rows,seed)
    # filter(srcFrame,bitVect)
    # odd. output is byte, all other exec outputs are 8 byte? (at least the ones below?)
    execExpr = "rbv=randomBitVector(" + str(num_rows) + "," + str(last10) + ",12345)"
    h2o_exec.exec_expr(None, execExpr, resultKey="rbv", timeoutSecs=10)

    # complement the bit vector
    execExpr = "not_rbv=colSwap(rbv,0,rbv[0]==0?1:0)"
    h2o_exec.exec_expr(None, execExpr, resultKey="not_rbv", timeoutSecs=10)

    # rbv selects the random ~10% test rows; its complement selects train rows.
    execExpr = dataKeyTest + "=filter(" + key2 + ",rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    execExpr = dataKeyTrain + "=filter(" + key2 + ",not_rbv)"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)

    ### time.sleep(3600)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    for trial in range(1, 10):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r" + str(trial)
        execExpr = resultKey + "=slice(" + dataKeyTrain + ",1," + str(rowsToUse) + ")"
        # execExpr = resultKey + "=slice(" + dataKeyTrain + ",1)"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # point the parse result at the sliced frame so RF trains on it
        parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20
        start = time.time()
        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + str(trial)
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
            msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=0.2)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        # http://192.168.1.28:54321/GeneratePredictionsPage.html?model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&key=iris
        # http://192.168.1.28:54321/RFView.html?data_key=iris.hex&model_key=__RFModel_0e2531bc-2552-4f65-8a4a-843031b0cb99&ntree=50&response_variable=4&class_weights=Iris-setosa%3D1.0%2CIris-versicolor%3D1.0%2CIris-virginica%3D1.0&out_of_bag_error_estimate=true&iterative_cm=true
        kwargs['iterative_cm'] = 1
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0
        rfv = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
            msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
            ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=0.2)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse * 100.0 / num_rows), "pct. of all rows"

    # Dump actual-vs-expected summaries so the expected lists can be refreshed.
    actualDelta = [abs(a - b) for a, b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a - b) for a, b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Shared driver: train RF with OOBE on a slice of csvFilename, score the
    last 10% of rows, optionally assert against expected accuracies, and
    return the last training RFView result.

    csvFilename: covtype variant to use (shuffled or not).
    checkExpectedResults: only the shuffled dataset matches the hard-coded
        expected lists; pass False for other variants.
    """
    # the expected results are only for the shuffled version
    # since getting 10% samples etc of the smallish dataset will vary between
    # shuffled and non-shuffled datasets
    importFolderPath = "/home/0xdiag/datasets/standard"
    csvPathname = importFolderPath + "/" + csvFilename
    key2 = csvFilename + ".hex"
    h2i.setupImportFolder(None, importFolderPath)
    print "\nUsing header=0 on", csvFilename
    parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
        key2=key2, header=0, timeoutSecs=180)

    inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # how many rows for each pct?
    num_rows = inspect['num_rows']
    pct10 = int(num_rows * .1)
    rowsForPct = [i * pct10 for i in range(0, 11)]
    # this can be slightly less than 10%
    last10 = num_rows - rowsForPct[9]
    rowsForPct[10] = num_rows
    # use mod below for picking "rows-to-do" in case we do more than 9 trials
    # use 10 if 0 just to see (we copied 10 to 0 above)
    rowsForPct[0] = rowsForPct[10]

    # 0 isn't used
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    print "Creating the key of the last 10% data, for scoring"
    dataKeyTest = "rTest"
    # start at 90% rows + 1
    execExpr = dataKeyTest + " = slice(" + key2 + "," + str(rowsForPct[9]+1) + ")"
    h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)

    # keep the 0 entry empty
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    # don't use the smaller samples..bad error rates, plus for sorted covtype, you can get just one class!
    for trial in range(8, 9):
        # always slice from the beginning
        rowsToUse = rowsForPct[trial % 10]
        resultKey = "r_" + csvFilename + "_" + str(trial)
        execExpr = resultKey + " = slice(" + key2 + ",1," + str(rowsToUse) + ")"
        h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
        # hack so the RF will use the sliced result
        # FIX! don't use the sliced bit..use the whole data for rf training below
        ### parseKey['destination_key'] = resultKey

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        kwargs = paramDict.copy()
        timeoutSecs = 30 + kwargs['ntree'] * 20

        # do oobe
        kwargs['out_of_bag_error_estimate'] = 1
        kwargs['model_key'] = "model_" + csvFilename + "_" + str(trial)
        # kwargs['model_key'] = "model"

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, "going into RF")

        start = time.time()
        rfv = h2o_cmd.runRFOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
            "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        oobeTrainPctRight = 100 * (1.0 - rfv['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), oobeTrainPctRight, expectTrainPctRightList[trial]), delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print "Now score on the last 10%. Note this is silly if we trained on 100% of the data"
        print "Or sorted by output class, so that the last 10% is the last few classes"
        # pop the stuff from kwargs that were passing as params
        model_key = rfv['model_key']
        kwargs.pop('model_key', None)
        data_key = rfv['data_key']
        kwargs.pop('data_key', None)
        ntree = rfv['ntree']
        kwargs.pop('ntree', None)

        kwargs['iterative_cm'] = 1
        kwargs['no_confusion_matrix'] = 0
        # do full scoring
        kwargs['out_of_bag_error_estimate'] = 0

        # double check the rows/cols
        inspect = h2o_cmd.runInspect(key=dataKeyTest)
        h2o_cmd.infoFromInspect(inspect, "dataKeyTest")

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        fullScorePctRight = 100 * (1.0 - rfvScoring['confusion_matrix']['classification_error'])
        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f"% \
                ((trial*10), fullScorePctRight, expectScorePctRightList[trial]), delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"

    # Dump actual-vs-expected summaries so the expected lists can be refreshed.
    actualDelta = [abs(a-b) for a,b in zip(expectTrainPctRightList, actualTrainPctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualTrainPctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualTrainPctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    actualDelta = [abs(a-b) for a,b in zip(expectScorePctRightList, actualScorePctRightList)]
    niceFp = ["{0:0.2f}".format(i) for i in actualScorePctRightList]
    print "maybe should update with actual. Remove single quotes"
    print "actualScorePctRightList =", niceFp
    niceFp = ["{0:0.2f}".format(i) for i in actualDelta]
    print "actualDelta =", niceFp

    # return the last rfv done during training
    return rfv
def test_rf_covtype_fvec(self):
    """Sweep one RF parameter (selected by module-level TRY: max_depth,
    ntrees, or nbins) on covtype 90/10 train/test splits. For each value,
    dispatch the training job several times, cancelling all but the last
    (exercising job cancellation), then view and score the surviving model.
    Optionally plot error/time vs the swept parameter when DO_PLOT is set.
    """
    h2o.beta_features = True  # fvec

    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=hex_key, timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    rfViewInitial = []
    # xList/eList/fList accumulate plot points (param value, error, time).
    xList = []
    eList = []
    fList = []
    trial = 0

    depthList = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # TRY (module-level) picks which parameter is swept.
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']

        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 10
        for i in range(TRIES):
            # only the last dispatched job is left to run to completion
            lastOne = i == (TRIES-1)

            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            # kwargs['destination_key'] = 'RFModel_' + str(trial)
            # let h2o name it
            kwargs['destination_key'] = None

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                noPoll=False, rfView=False, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        # Inspect the model from the last (uncancelled) training run.
        model_key = rfResult['drf_model']['_selfKey']
        data_key = rfResult['drf_model']['_dataKey']
        ntrees = kwargs['ntrees']

        rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60,
            retryDelaySecs=5, doSimpleCheck=False)
        ## print "rfView:", h2o.dump_json(rfView)

        rf_model = rfView['drf_model']
        cm = rf_model['cm']
        ### print "cm:", h2o.dump_json(cm)
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']

        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)

        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        # class-4 pct-wrong and training time are the plotted metrics
        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_covtype20x_fvec(self):
    """Train RF on covtype20x (or plain covtype when DO_SMALL), score it via
    RFView on a re-imported copy, then run Predict + confusion matrix on a
    third copy of the data so the prediction path isn't served from cache.
    """
    h2o.beta_features = True
    importFolderPath = 'standard'

    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    kwargs = {'str': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    paramDict = drf2ParamDict
    params = {'ntrees': 20, 'destination_key': 'RF_model'}
    # Randomize additional RF params from drf2ParamDict.
    colX = h2o_rf.pickRandRfParams(paramDict, params)

    kwargs = params.copy()
    timeoutSecs = 30 + kwargs['ntrees'] * 60

    start = time.time()
    rf = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntree = kwargs['ntrees']

    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runRFView(None, dataKeyTrain, model_key, ntree=ntree, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree=ntree,
            timeoutSecs=timeoutSecs, out_of_bag_error_estimate=0, retryDelaySecs=1)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # delta=50 makes this a smoke check only (any error in [0,100] passes)
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)

        # C55 is the covtype response column
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey, vactual='C55', predict=predictKey, vpredict='predict',)
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_RF_mnist_both(self): csvFilelist = [ # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, None, '*mnist_training*gz'), ] # IMPORT********************************************** trial = 0 allDelta = [] importFolderPath = "mnist" for (trainCsvFilename, testCsvFilename, timeoutSecs, rfSeed, parsePattern) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + testCsvFilename parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** print "Use multi-file parse to grab both the mnist_testing.csv.gz and mnist_training.csv.gz for training" trainKey = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() csvPathname = importFolderPath + "/" + parsePattern parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=parsePattern, schema='local', timeoutSecs=300) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, returnIgnoreX=True) ntree = 100 params = { 'response': 0, 'ignored_cols': ignore_x, 'ntrees': ntree, # 'data_key='mnist_training.csv.hex' 'mtries': 28, # fix because we ignore some cols, which will change the srt(cols) calc? 'max_depth': 500, 'destination_key': 'RF_model', 'nbins': 1024, } if rfSeed is None: params['seed'] = random.randint(0,sys.maxint) else: params['seed'] = rfSeed print "RF seed:", params['seed'] kwargs = params.copy() print "Trying rf" timeoutSecs = 1800 start = time.time() rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=False, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs) elapsed = time.time() - start print "RF completed in", elapsed, "seconds.", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) h2o_rf.simpleCheckRFView(None, rfView, **params) modelKey = rfView['model_key'] # RFView (score on test)**************************************** start = time.time() kwargs = {'response': y} rfView = h2o_cmd.runRFView(data_key=testKey, model_key=modelKey, timeoutSecs=60, pollTimeoutSecs=60, noSimpleCheck=False, **kwargs) elapsed = time.time() - start print "RFView in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params) print "classification error is expected to be low because we included the test data in with the training!" 
self.assertAlmostEqual(classification_error, 0.028, delta=0.01, msg="Classification error %s differs too much" % classification_error) treeStats = rfView['drf_model']['treesStats'] # Expected values are from this case: # ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600, 784834182943470027), expected = {'minLeaves': 4996, 'meanLeaves': 5064.1, 'maxLeaves': 5148} expected += {'minDepth': 21, 'meanDepth': 23.8, 'maxDepth': 25} for key in expected: delta = ((expected[key]- actual[key])/expected[key]) * 100 d = "seed: %s %s %s %s %s pct. different %s" % (params['seed'], key, actual[key], expected[key], delta) print d allDelta.append(d) # FIX! should change this to an assert? # Predict (on test)**************************************** start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "generate_predictions in", elapsed, "secs", \ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) # Done ******************************************************* print "\nShowing the results again from all the trials, to see variance" for d in allDelta: print d
def test_rf_big1_overwrite_model_fvec(self): h2o.beta_features = True csvFilename = 'hhp_107_01.data.gz' hex_key = csvFilename + ".hex" print "\n" + csvFilename parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key, timeoutSecs=15, schema='put') firstRfView = None # dispatch multiple jobs back to back for jobDispatch in range(3): start = time.time() kwargs = {} if OVERWRITE_RF_MODEL: print "Since we're overwriting here, we have to wait for each to complete noPoll=False" model_key = 'RF_model' else: model_key = 'RF_model' + str(jobDispatch) print "Change the number of trees, while keeping the rf model key name the same" print "Checks that we correctly overwrite previous rf model" if OVERWRITE_RF_MODEL: kwargs['ntrees'] = 1 + jobDispatch else: kwargs['ntrees'] = 1 # don't change the seed if we're overwriting the model. It should get # different results just from changing the tree count kwargs['seed'] = random.randint(0, sys.maxint) # FIX! what model keys do these get? randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)] h2o_cmd.runRF(node=randomNode, parseResult=parseResult, destination_key=model_key, timeoutSecs=300, noPoll=True, **kwargs) # FIX! are these already in there? 
rfView = {} rfView['_dataKey'] = hex_key rfView['_key'] = model_key print "rf job dispatch end on ", csvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # we're going to compare rf results to previous as we go along (so we save rf view results h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5) # In this test we're waiting after each one, so we can save the RFView results for comparison to future print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['_dataKey'] model_key = rfView['_key'] print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)" # allow it to poll to complete rfViewResult = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=False) if firstRfView is None: # we'll use this to compare the others firstRfView = rfViewResult.copy() firstModelKey = model_key print "firstRfView", h2o.dump_json(firstRfView) else: print "Comparing", model_key, "to", firstModelKey df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True) print "df.difference:", h2o.dump_json(df.difference) self.assertGreater(len(df.difference), 29, msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \ (len(df.difference), h2o.dump_json(df.difference)))
def test_rfview_score(self): csvPathnameTrain = 'standard/covtype.data' print "Train with:", csvPathnameTrain parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTrain, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTrain = parseResultTrain['destination_key'] csvPathnameTest = 'standard/covtype.data' print "Test with:", csvPathnameTest parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathnameTest, schema='put', hex_key="covtype.hex", timeoutSecs=15) dataKeyTest = parseResultTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + kwargs['ntree'] * 10 rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key', None) data_key = rfv['data_key'] kwargs.pop('data_key', None) ntree = rfv['ntree'] kwargs.pop('ntree', None) # scoring # RFView.html? # dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, **kwargs) # new web page for predict? 
throw it in here for now (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if 'sampling_strategy' in kwargs and kwargs[ 'sampling_strategy'] != 'STRATIFIED_LOCAL': check_err = True else: check_err = False if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 0 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! 
should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' kwargs['iterative_cm'] = 1 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) # FIX! should update this expected classification error (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree) # don't check error if stratified if check_err: self.assertAlmostEqual( classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) start = time.time() predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest) elapsed = time.time() - start print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.' print "Trial #", trial, "completed"
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo( y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 10, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time( ) - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual( classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] ntrees = rf_model['N'] errs = rf_model['errs'] N = rf_model['N'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_RF_mnist_fvec(self): h2o.beta_features = True importFolderPath = "mnist" csvFilelist = [ # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), # ("a.csv", "b.csv", 60), # ("mnist_testing.csv.gz", "mnist_testing.csv.gz", 600), ("mnist_training.csv.gz", "mnist_testing.csv.gz", 600), ] trial = 0 for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist: trialStart = time.time() # PARSE test**************************************** testKey2 = testCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + testCsvFilename, hex_key=testKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] print "We won't use this pruning of x on test data. See if it prunes the same as the training" y = 0 # first column is pixel value print "y:" # x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300) # PARSE train**************************************** trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex" start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=importFolderPath + "/" + trainCsvFilename, schema='local', hex_key=trainKey2, timeoutSecs=timeoutSecs) elapsed = time.time() - start print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\ "%d pct. 
of timeout" % ((elapsed*100)/timeoutSecs) print "parse result:", parseResult['destination_key'] # RF+RFView (train)**************************************** print "This is the 'ignore=' we'll use" ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300, forRF=True) params = { 'response': 'C' + str(y+1), 'cols': None, 'ignored_cols_by_name': ignore_x, 'classification': 1, 'validation': None, 'ntrees': 2, 'max_depth': 20, 'min_rows': None, 'nbins': 1000, 'mtries': None, 'sample_rate': 0.66, 'seed': None, } rfViewInitial = [] for jobDispatch in range(1): # adjust timeoutSecs with the number of trees # seems ec2 can be really slow params['destination_key'] = 'RFModel_' + str('jobDispatch') kwargs = params.copy() timeoutSecs = 1200 start = time.time() rfResult = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, noPoll=not DO_POLL, rfView=DO_POLL, **kwargs) elapsed = time.time() - start # print h2o.dump_json(rfResult) print "rf job dispatch end on ", trainCsvFilename, 'took', time.time() - start, 'seconds' print "\njobDispatch #", jobDispatch # FIX! are these already in there? rfView = {} rfView['data_key'] = trainKey2 rfView['model_key'] = kwargs['destination_key'] rfView['ntrees'] = kwargs['ntrees'] rfViewInitial.append(rfView) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=1200, pollTimeoutSecs=120, retryDelaySecs=5) # FIX! need to add the rfview and predict stuff # we saved the initial response? 
# if we do another poll they should be done now, and better to get it that # way rather than the inspect (to match what simpleCheckGLM is expected print "rfViewInitial", rfViewInitial for rfView in rfViewInitial: print "Checking completed job:", rfView print "rfView", h2o.dump_json(rfView) data_key = rfView['data_key'] model_key = rfView['model_key'] ntrees = rfView['ntrees'] rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60, noPoll=not DO_POLL, doSimpleCheck=False) (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView) self.assertAlmostEqual(classification_error, 10, delta=2, msg="Classification error %s differs too much" % classification_error) if not DO_POLL: h2o_jobs.pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=120, retryDelaySecs=5) # rfView = h2o_cmd.runRFView(None, data_key, model_key, timeoutSecs=60, noPoll=True, doSimpleCheck=False) # print "rfView:", h2o.dump_json(rfView) # "N":1, # "errs":[0.25,0.1682814508676529], # "testKey":"syn_binary_10000x10.hex", # "cm":[[3621,1399],[1515,3465]]}} rf_model = rfView['drf_model'] cms = rf_model['cms'] errs = rf_model['errs'] # FIX! should update this expected classification error ## (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees) ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error) predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
def test_rfview_score(self):
    """Train RF on covtype with randomized params, then exercise RFView
    scoring repeatedly: default, iterative_cm=0, iterative_cm=1, and
    iterative_cm=1 with per-class weights, generating predictions each time.

    NOTE(review): train and test deliberately parse the SAME file into the
    same 'covtype.hex' key, so scoring is on the training data.
    """
    csvPathnameTrain = 'UCI/UCI-large/covtype/covtype.data'
    print "Train with:", csvPathnameTrain
    parseResultTrain = h2i.import_parse(bucket='datasets', path=csvPathnameTrain,
        schema='put', hex_key="covtype.hex", timeoutSecs=15)
    dataKeyTrain = parseResultTrain['destination_key']

    csvPathnameTest = 'UCI/UCI-large/covtype/covtype.data'
    print "Test with:", csvPathnameTest
    parseResultTest = h2i.import_parse(bucket='datasets', path=csvPathnameTest,
        schema='put', hex_key="covtype.hex", timeoutSecs=15)
    dataKeyTest = parseResultTest['destination_key']

    for trial in range(5):
        # params is mutable. This is default.
        params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0}
        # randomize some of the RF params from the module-level paramDict
        colX = h2o_rf.pickRandRfParams(paramDict, params)
        kwargs = params.copy()
        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        timeoutSecs = 30 + kwargs['ntree'] * 10 * (kwargs['parallel'] and 1 or 5)
        rfv = h2o_cmd.runRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
            retryDelaySecs=1, **kwargs)
        ### print "rf response:", h2o.dump_json(rfv)

        model_key = rfv['model_key']
        # pop the stuff from kwargs that were passing as params
        # (they're passed positionally to runRFView below)
        kwargs.pop('model_key',None)
        data_key = rfv['data_key']
        kwargs.pop('data_key',None)
        ntree = rfv['ntree']
        kwargs.pop('ntree',None)
        # scoring
        # RFView.html?
        # dataKeyTest=a5m.hex&
        # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628&
        # response_variable=1&
        # ntree=50&
        # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0&
        # out_of_bag_error_estimate=1&
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        # new web page for predict? throw it in here for now
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # don't check error if stratified
        if 'sampling_strategy' in kwargs and kwargs['sampling_strategy'] != 'STRATIFIED_LOCAL':
            check_err = True
        else:
            check_err = False

        if check_err:
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
                msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        elapsed = time.time() - start
        print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

        # re-score with iterative confusion matrix disabled
        kwargs['iterative_cm'] = 0
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # don't check error if stratified
        if check_err:
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
                msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        elapsed = time.time() - start
        print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

        # re-score with iterative confusion matrix enabled
        kwargs['iterative_cm'] = 1
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # don't check error if stratified
        if check_err:
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
                msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        elapsed = time.time() - start
        print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

        # re-score with iterative CM plus explicit per-class weights
        kwargs['iterative_cm'] = 1
        kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7'
        rfView = h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs,
            retryDelaySecs=1, print_params=True, **kwargs)
        # FIX! should update this expected classification error
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # don't check error if stratified
        if check_err:
            self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
                msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        elapsed = time.time() - start
        print "predict end on ", dataKeyTest, 'took', elapsed, 'seconds.'

        print "Trial #", trial, "completed"
def test_rf_params_rand2(self): # for determinism, I guess we should spit out the seed? # random.seed(SEED) SEED = random.randint(0, sys.maxint) # if you have to force to redo a test # SEED = random.seed(SEED) print "\nUsing random seed:", SEED csvPathnameTrain = h2o.find_file('smalldata/covtype/covtype.20k.data') print "Train with:", csvPathnameTrain parseKeyTrain = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.20k.hex", timeoutSecs=10) dataKeyTrain = parseKeyTrain['destination_key'] csvPathnameTest = h2o.find_dataset('UCI/UCI-large/covtype/covtype.data') print "Test with:", csvPathnameTest parseKeyTest = h2o_cmd.parseFile(csvPathname=csvPathnameTrain, key2="covtype.hex", timeoutSecs=10) dataKeyTest = parseKeyTest['destination_key'] for trial in range(5): # params is mutable. This is default. params = {'ntree': 13, 'parallel': 1, 'out_of_bag_error_estimate': 0} colX = h2o_rf.pickRandRfParams(paramDict, params) kwargs = params.copy() # adjust timeoutSecs with the number of trees # seems ec2 can be really slow timeoutSecs = 30 + 15 * (kwargs['parallel'] and 5 or 10) rfv = h2o_cmd.runRFOnly(parseKey=parseKeyTrain, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) ### print "rf response:", h2o.dump_json(rfv) model_key = rfv['model_key'] # pop the stuff from kwargs that were passing as params kwargs.pop('model_key',None) data_key = rfv['data_key'] kwargs.pop('data_key',None) ntree = rfv['ntree'] kwargs.pop('ntree',None) # scoring # RFView.html? 
# dataKeyTest=a5m.hex& # model_key=__RFModel_81c5063c-e724-4ebe-bfc1-3ac6838bc628& # response_variable=1& # ntree=50& # class_weights=-1%3D1.0%2C0%3D1.0%2C1%3D1.0& # out_of_bag_error_estimate=1& # no_confusion_matrix=1& # clear_confusion_matrix=1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 0 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 1 kwargs['clear_confusion_matrix'] = 1 h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) kwargs['no_confusion_matrix'] = 0 kwargs['clear_confusion_matrix'] = 0 kwargs['class_weights'] = '1=1,2=2,3=3,4=4,5=5,6=6,7=7' h2o_cmd.runRFView(None, dataKeyTest, model_key, ntree, timeoutSecs, retryDelaySecs=1, print_params=True, **kwargs) print "Trial #", trial, "completed"
def test_RF_mnist_reals(self):
    """Import an MNIST-reals folder, parse test+train, train a 100-tree RF
    with a fixed seed, RFView-score on the test key, and run predictions.
    Uses the older import-folder/parseKey API (setupImportFolder, runRFOnly).
    """
    importFolderPath = "/home/0xdiag/datasets/mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    # IMPORT**********************************************
    # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
    importFolderResult = h2i.setupImportFolder(None, importFolderPath)
    ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
    succeededList = importFolderResult['files']
    ### print "succeededList:", h2o.dump_json(succeededList)
    self.assertGreater(len(succeededList), 1, "Should see more than 1 files in the import?")

    # why does this hang? can't look at storeview after import?
    print "\nTrying StoreView after the import folder"
    h2o_cmd.runStoreView(timeoutSecs=30)

    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
            key2=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
            key2=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseKey['destination_key']

        # RF+RFView (train)****************************************
        print "This is the 'ignore=' we'll use"
        ignore_x = h2o_glm.goodXFromColumnInfo(
            y, key=parseKey['destination_key'], timeoutSecs=300, forRF=True)
        ntree = 100
        params = {
            'response_variable': 0,
            'ignore': ignore_x,
            'ntree': ntree,
            'iterative_cm': 1,
            'out_of_bag_error_estimate': 1,
            # 'data_key='mnist_reals_training.csv.hex'
            'features': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'exclusive_split_limit': None,
            'depth': 2147483647,
            'stat_type': 'ENTROPY',
            'sampling_strategy': 'RANDOM',
            'sample': 67,
            # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
            'model_key': 'RF_model',
            'bin_limit': 1024,
            'seed': 784834182943470027,
            'parallel': 1,
            'use_non_local_data': 0,
            'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfView = h2o_cmd.runRFOnly(parseKey=parseKey, rfView=False,
            timeoutSecs=timeoutSecs, pollTimeoutsecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['model_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(
            model_key=modelKey, data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_RF_mnist_reals(self):
    """Parse MNIST-reals test+train via import_parse, train a 10-tree RF
    with a fixed seed, RFView-score on the test key, and run predictions.
    Uses the bucket/import_parse API (newer than the import-folder variant).
    """
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + trainCsvFilename,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        print "This is the 'ignore=' we'll use"
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'],
            timeoutSecs=300, forRF=True)
        ntree = 10
        params = {
            'response_variable': 0,
            'ignore': ignore_x,
            'ntree': ntree,
            'iterative_cm': 1,
            'out_of_bag_error_estimate': 1,
            # 'data_key='mnist_reals_training.csv.hex'
            'features': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'exclusive_split_limit': None,
            'depth': 2147483647,
            'stat_type': 'ENTROPY',
            'sampling_strategy': 'RANDOM',
            'sample': 67,
            # 'model_key': '__RFModel_7055e6cf-a0de-44db-b165-f5994730ac77',
            'model_key': 'RF_model',
            'bin_limit': 1024,
            'seed': 784834182943470027,
            'parallel': 1,
            'use_non_local_data': 0,
            'class_weights': '0=1.0,1=1.0,2=1.0,3=1.0,4=1.0,5=1.0,6=1.0,7=1.0,8=1.0,9=1.0',
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_rf.simpleCheckRFView(None, rfView, **params)
        modelKey = rfView['model_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfView = h2o_cmd.runRFView(data_key=testKey2, model_key=modelKey, ntree=ntree,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfView, **params)
        self.assertAlmostEqual(classification_error, 0.03, delta=0.5,
            msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=modelKey,
            data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_rf_covtype_fvec(self):
    """Sweep one RF hyperparameter (max_depth / ntrees / nbins, chosen by
    the module-level TRY) over covtype 90/10 train/test splits. For each
    sweep value it starts the RF job several times, cancelling all but the
    last (exercising job-id reuse), then RFViews the surviving model,
    checks class error counts, predicts, and optionally plots error vs.
    training time.
    """
    h2o.beta_features = True  # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=hex_key, timeoutSecs=180, doSummary=False)
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    rfViewInitial = []
    # xList/eList/fList accumulate plot points across the sweep
    xList = []
    eList = []
    fList = []
    trial = 0

    depthList = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # pick the sweep list that matches the module-level TRY selector
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        # poke the swept value into the shared paramDict
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']

        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            lastOne = i == (TRIES - 1)

            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
                noPoll=True, rfView=False, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'

            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        print "We have a result from the RF above, completed but didn't do RFView yet"
        # could the RF indicate 'done' too soon?
        # if rfResult['state']=='RUNNING':
        #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
        # if 'drf_model' not in rfResult:
        #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        # view the last (uncancelled) model from the TRIES loop
        rfView = h2o_cmd.runRFView(None, model_key=model_key, timeoutSecs=60,
            retryDelaySecs=5, doSimpleCheck=False)
        print "rfView:", h2o.dump_json(rfView)

        rf_model = rfView['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        print "classErrorPctList:", classErrorPctList
        self.assertEqual(len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_RF_mnist_reals_fvec(self):
    """Parse MNIST-reals test+train, train a 10-tree fvec RF (new-style
    params: ntrees/mtries/sample_rate), RFView-score on the test key and
    assert the classification error is near 9%, then run predictions.
    """
    importFolderPath = "mnist"
    csvFilelist = [
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        # ("a.csv", "b.csv", 60),
        # ("mnist_reals_testing.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
    ]
    trial = 0
    for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
        trialStart = time.time()

        # PARSE test****************************************
        testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + testCsvFilename,
            hex_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        print "We won't use this pruning of x on test data. See if it prunes the same as the training"
        y = 0  # first column is pixel value
        print "y:"
        x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'], timeoutSecs=300)

        # PARSE train****************************************
        trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
            path=importFolderPath + "/" + trainCsvFilename,
            hex_key=trainKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        print "parse result:", parseResult['destination_key']

        # RF+RFView (train)****************************************
        ignore_x = h2o_glm.goodXFromColumnInfo(y, key=parseResult['destination_key'],
            timeoutSecs=300, returnIgnoreX=True)
        ntrees = 10
        params = {
            'response': 'C1',
            'ignored_cols_by_name': ignore_x,
            'ntrees': ntrees,
            'mtries': 28,  # fix because we ignore some cols, which will change the srt(cols) calc?
            'max_depth': 15,
            'sample_rate': 0.67,
            'destination_key': 'RF_model',
            'nbins': 1024,
            'seed': 784834182943470027,
            'importance': 0,
            'balance_classes': 0,
        }
        kwargs = params.copy()
        print "Trying rf"
        timeoutSecs = 1800
        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResult, rfView=True,
            timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2, **kwargs)
        elapsed = time.time() - start
        print "RF completed in", elapsed, "seconds.", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

        h2o_rf.simpleCheckRFView(None, rfv, **params)
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        data_key = rf_model['_dataKey']
        model_key = rf_model['_key']

        # RFView (score on test)****************************************
        start = time.time()
        # FIX! 1 on oobe causes stack trace?
        kwargs = {'response_variable': y}
        rfv = h2o_cmd.runRFView(data_key=testKey2, model_key=model_key, ntrees=ntrees,
            out_of_bag_error_estimate=0, timeoutSecs=60, pollTimeoutSecs=60,
            noSimpleCheck=False, **kwargs)
        elapsed = time.time() - start
        print "RFView in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(None, rfv, **params)
        self.assertAlmostEqual(classification_error, 9, delta=1.0,
            msg="Classification error %s differs too much" % classification_error)

        # Predict (on test)****************************************
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key,
            data_key=testKey2, timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "generate_predictions in", elapsed, "secs", \
            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)