def test_rf_covtype_fvec(self):
    """Sweep one SpeeDRF parameter (chosen by the module-level TRY global:
    'max_depth', 'ntrees', or 'nbins') over covtype train/test splits.

    For each swept value it dispatches the same RF job TRIES times with
    noPoll=True, cancelling all but the last dispatch, then polls to
    completion, fetches the model view, sanity-checks it, and records
    class-4 error pct and train elapsed time for optional plotting.
    Reads module globals: TRY, paramDict, DO_OOBE, DO_PLOT.
    """
    h2o.beta_features = True  # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=hex_key, timeoutSecs=180, doSummary=False)
    # NOTE(review): 'inspect' result is unused; the call presumably exists
    # for its side effect of touching the parsed key — confirm.
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    # NOTE(review): rfViewInitial is never used in this method.
    rfViewInitial = []
    xList = []  # swept-parameter values (x axis for plot)
    eList = []  # class-4 pct-wrong per swept value
    fList = []  # train elapsed seconds per swept value
    trial = 0
    depthList = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # Pick which list to sweep based on the TRY global.
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        # Overwrite the swept key in the shared (mutable) paramDict.
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            # No validation frame: out-of-bag error estimation.
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']
        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            lastOne = i == (TRIES - 1)
            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']
            start = time.time()
            # noPoll=True: returns as soon as the job is dispatched.
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'
            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        print "We have a result from the RF above, completed but didn't do RFView yet"
        # could the RF indicate 'done' too soon?
        # if rfResult['state']=='RUNNING':
        #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
        # if 'drf_model' not in rfResult:
        #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))

        # Wait for the surviving (last) job, then fetch its model view.
        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        print "rfView:", h2o.dump_json(rfView)

        # Rename the SpeeDRF response key so downstream DRF-style checks work.
        rfView["drf_model"] = rfView.pop("speedrf_model")
        rf_model = rfView['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        # NOTE(review): ntrees and N are both read from rf_model['N'];
        # cms, ntrees, N, varimp are unused below.
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        print "classErrorPctList:", classErrorPctList
        self.assertEqual(len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_change_data_key_fvec(self):
    """Train SpeeDRF on covtype.data, then repeatedly view the model and
    score (generate_predictions) against covtype20x.data.

    Dispatches the RF job with noPoll=True, waits via pollWaitJobs on the
    'RF_model' pattern, then runs 3 view/predict trials, timing each step.
    """
    importFolderPath = 'standard'
    csvFilenameTrain = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # we could train on covtype, and then use covtype20x for test? or vice versa
    # parseResult = parseResult
    # dataKeyTest = dataKeyTrain
    csvFilenameTest = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    # NOTE(review): 'inspect' result is unused.
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    params = {'ntrees': 2, 'destination_key': 'RF_model'}
    # colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    kwargs["response"] = "C55"  # covtype class column

    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 100
    start = time.time()
    # noPoll=True: just dispatch, completion is awaited below.
    h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntrees = kwargs['ntrees']
    start = time.time()
    h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        # Rename the SpeeDRF response key so the DRF-style checker works.
        rfView["drf_model"] = rfView.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
        # FIX! should update this expected classification error
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_rf_change_data_key_fvec(self):
    """Train SpeeDRF on covtype.data, then repeatedly view the model and
    score (generate_predictions) against covtype20x.data.

    NOTE(review): this is a second definition with the same name as an
    earlier method in this file; in Python the later definition silently
    shadows the earlier one within the same class — confirm intent.
    """
    importFolderPath = 'standard'
    csvFilenameTrain = 'covtype.data'
    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # we could train on covtype, and then use covtype20x for test? or vice versa
    # parseResult = parseResult
    # dataKeyTest = dataKeyTrain
    csvFilenameTest = 'covtype20x.data'
    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    # NOTE(review): 'inspect' result is unused.
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    params = {'ntrees': 2, 'destination_key': 'RF_model'}
    # colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    kwargs["response"] = "C55"  # covtype class column

    # adjust timeoutSecs with the number of trees
    # seems ec2 can be really slow
    timeoutSecs = 100
    start = time.time()
    # noPoll=True: just dispatch, completion is awaited below.
    h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, noPoll=True, **kwargs)
    print "rf job dispatch end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'
    ### print "rf response:", h2o.dump_json(rfv)

    start = time.time()
    h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=360, pollTimeoutSecs=120, retryDelaySecs=5)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntrees = kwargs['ntrees']
    start = time.time()
    h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(3):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        # Rename the SpeeDRF response key so the DRF-style checker works.
        rfView["drf_model"] = rfView.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntrees)
        # FIX! should update this expected classification error
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        start = time.time()
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        print "Trial #", trial, "completed"
def test_rf_big1_nopoll_fvec(self):
    """Dispatch three SpeeDRF jobs back to back on hhp_107_01.data.gz,
    save each initial RFView response, then re-view each model and
    JsonDiff it against the first one.

    OVERWRITE_RF_MODEL (module global) selects whether every dispatch
    reuses one model key ('SRF_model') or gets a per-dispatch key.
    """
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    rfViewInitial = []  # one RFView response per dispatched job

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        model_key = ""
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'SRF_model'
        else:
            model_key = 'SRF_model' + str(jobDispatch)

        kwargs['ntrees'] = 1
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            kwargs['ntrees'] += 1
        kwargs['seed'] = random.randint(0, sys.maxint)
        kwargs['response'] = "C107"

        # FIX! what model keys do these get?
        # Spread dispatches across a randomly chosen cluster node.
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        # NOTE(review): despite the method name, noPoll=False here, so each
        # dispatch blocks until complete — confirm intent.
        h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=False, **kwargs)
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
        print "\n MODEL KEY: ", model_key
        rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

    # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        model_key = rfView["speedrf_model"]['_key']
        # NOTE(review): 'ntree' is extracted but never used below.
        ntree = rfView["speedrf_model"]["parameters"]['ntrees']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        if first is None:
            # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            # NOTE(review): the diff is only printed here, never asserted on.
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_rf_covtype_fvec(self):
    """Sweep one SpeeDRF parameter (chosen by the module-level TRY global:
    'max_depth', 'ntrees', or 'nbins') over covtype train/test splits.

    NOTE(review): this is a second definition with the same name as an
    earlier method in this file; the later definition silently shadows
    the earlier one — confirm intent.
    """
    h2o.beta_features = True  # fvec
    importFolderPath = "standard"

    # Parse Train ******************************************************
    csvTrainFilename = 'covtype.shuffled.90pct.data'
    csvTrainPathname = importFolderPath + "/" + csvTrainFilename
    hex_key = csvTrainFilename + ".hex"
    parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname,
        hex_key=hex_key, timeoutSecs=180, doSummary=False)
    # NOTE(review): 'inspect' result is unused.
    inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

    # Parse Test ******************************************************
    csvTestFilename = 'covtype.shuffled.10pct.data'
    csvTestPathname = importFolderPath + "/" + csvTestFilename
    hex_key = csvTestFilename + ".hex"
    parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname,
        hex_key=hex_key, timeoutSecs=180)
    inspect = h2o_cmd.runInspect(None, parseTestResult['destination_key'])

    # NOTE(review): rfViewInitial is never used in this method.
    rfViewInitial = []
    xList = []  # swept-parameter values (x axis for plot)
    eList = []  # class-4 pct-wrong per swept value
    fList = []  # train elapsed seconds per swept value
    trial = 0
    depthList = [10, 20, 30, 40]
    ntreesList = [5, 10, 20, 30]
    # ntreesList = [2]
    nbinsList = [10, 100, 1000]

    # Pick which list to sweep based on the TRY global.
    if TRY == 'max_depth':
        tryList = depthList
    elif TRY == 'ntrees':
        tryList = ntreesList
    elif TRY == 'nbins':
        tryList = nbinsList
    else:
        raise Exception("huh? %s" % TRY)

    for d in tryList:
        # Overwrite the swept key in the shared (mutable) paramDict.
        if TRY == 'max_depth':
            paramDict['max_depth'] = d
        elif TRY == 'ntrees':
            paramDict['ntrees'] = d
        elif TRY == 'nbins':
            paramDict['nbins'] = d
        else:
            raise Exception("huh? %s" % TRY)

        # adjust timeoutSecs with the number of trees
        # seems ec2 can be really slow
        if DO_OOBE:
            # No validation frame: out-of-bag error estimation.
            paramDict['validation'] = None
        else:
            paramDict['validation'] = parseTestResult['destination_key']
        timeoutSecs = 30 + paramDict['ntrees'] * 200

        # do ten starts, to see the bad id problem?
        TRIES = 5
        for i in range(TRIES):
            lastOne = i==(TRIES-1)
            # have unique model names
            trial += 1
            kwargs = paramDict.copy()
            model_key = 'RFModel_' + str(trial)
            kwargs['destination_key'] = model_key
            data_key = parseTrainResult['destination_key']
            start = time.time()
            # noPoll=True: returns as soon as the job is dispatched.
            rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult,
                timeoutSecs=timeoutSecs, noPoll=True, **kwargs)
            trainElapsed = time.time() - start
            print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'
            # don't cancel the last one
            if not lastOne:
                time.sleep(1)
                h2o_jobs.cancelAllJobs(timeoutSecs=2)

        ### print "rfView", h2o.dump_json(rfView)
        print "We have a result from the RF above, completed but didn't do RFView yet"
        # could the RF indicate 'done' too soon?
        # if rfResult['state']=='RUNNING':
        #     raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
        # if 'drf_model' not in rfResult:
        #     raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))

        # Wait for the surviving (last) job, then fetch its model view.
        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        print "rfView:", h2o.dump_json(rfView)

        # Rename the SpeeDRF response key so downstream DRF-style checks work.
        rfView["drf_model"] = rfView.pop("speedrf_model")
        rf_model = rfView['drf_model']
        cms = rf_model['cms']
        ### print "cm:", h2o.dump_json(cm)
        # NOTE(review): ntrees and N are both read from rf_model['N'];
        # cms, ntrees, N, varimp are unused below.
        ntrees = rf_model['N']
        errs = rf_model['errs']
        N = rf_model['N']
        varimp = rf_model['varimp']
        treeStats = rf_model['treeStats']
        print "maxDepth:", treeStats['maxDepth']
        print "maxLeaves:", treeStats['maxLeaves']
        print "minDepth:", treeStats['minDepth']
        print "minLeaves:", treeStats['minLeaves']
        print "meanLeaves:", treeStats['meanLeaves']
        print "meanDepth:", treeStats['meanDepth']
        print "errs[0]:", errs[0]
        print "errs[-1]:", errs[-1]
        print "errs:", errs

        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
        # we iterate over params, so can't really do this check
        # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
        print "classErrorPctList:", classErrorPctList
        self.assertEqual(len(classErrorPctList), 7,
            "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
        # FIX! should update this expected classification error
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)

        eList.append(classErrorPctList[4])
        fList.append(trainElapsed)
        if DO_PLOT:
            if TRY == 'max_depth':
                xLabel = 'max_depth'
            elif TRY == 'ntrees':
                xLabel = 'ntrees'
            elif TRY == 'nbins':
                xLabel = 'nbins'
            else:
                raise Exception("huh? %s" % TRY)
            xList.append(paramDict[xLabel])

    if DO_PLOT:
        eLabel = 'class 4 pctWrong'
        fLabel = 'trainElapsed'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_rf_big1_overwrite_model_fvec(self):
    """Dispatch three SpeeDRF jobs on hhp_107_01.data.gz, waiting for and
    viewing each, and assert that successive RFView responses differ from
    the first (>= 30 JsonDiff differences).

    With OVERWRITE_RF_MODEL set, every dispatch reuses the 'RF_model' key
    with a different tree count, checking that a model is correctly
    overwritten in place; otherwise each dispatch gets its own key.
    """
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    firstRfView = None  # baseline RFView to diff the later ones against

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)

        print "Change the number of trees, while keeping the rf model key name the same"
        print "Checks that we correctly overwrite previous rf model"
        # NOTE(review): this method uses the 'num_trees' parameter name;
        # other tests in this file pass 'ntrees' — confirm which the
        # SpeeDRF endpoint expects.
        if OVERWRITE_RF_MODEL:
            kwargs['num_trees'] = 1 + jobDispatch
        else:
            kwargs['num_trees'] = 1
        # don't change the seed if we're overwriting the model. It should get
        # different results just from changing the tree count
        kwargs['seed'] = random.randint(0, sys.maxint)
        kwargs["response"] = "C107"

        # FIX! what model keys do these get?
        # Spread dispatches across a randomly chosen cluster node.
        randomNode = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
        h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=True, **kwargs)

        # FIX! are these already in there?
        # Build a stand-in view record for logging/key bookkeeping.
        rfView = {}
        rfView['_dataKey'] = hex_key
        rfView['_key'] = model_key
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

        # we're going to compare rf results to previous as we go along (so we save rf view results
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # In this test we're waiting after each one, so we can save the RFView results for comparison to future
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        # NOTE(review): 'data_key' is extracted but never used below.
        data_key = rfView['_dataKey']
        model_key = rfView['_key']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        if firstRfView is None:
            # we'll use this to compare the others
            firstRfView = rfViewResult.copy()
            firstModelKey = model_key
            print "firstRfView", h2o.dump_json(firstRfView)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertGreater(len(df.difference), 29,
                msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                (len(df.difference), h2o.dump_json(df.difference)))
def test_rf_covtype20x_fvec(self):
    """Train SpeeDRF (20 trees) on covtype20x (or covtype when DO_SMALL),
    view the model, score via generate_predictions on an uncached copy of
    the test key, then run Predict + predict_confusion_matrix and print
    the confusion-matrix summary.
    """
    importFolderPath = 'standard'
    # DO_SMALL (module global) swaps in the 1x dataset for a quicker run.
    if DO_SMALL:
        csvFilenameTrain = 'covtype.data'
        hex_key = 'covtype1x.data.A.hex'
    else:
        csvFilenameTrain = 'covtype20x.data'
        hex_key = 'covtype20x.data.A.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTrain
    parseResultTrain = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    # NOTE(review): 'inspect' result is unused (both here and below).
    inspect = h2o_cmd.runInspect(key=parseResultTrain['destination_key'])
    dataKeyTrain = parseResultTrain['destination_key']
    print "Parse end", dataKeyTrain

    # have to re import since source key is gone
    # we could just copy the key, but sometimes we change the test/train data to covtype.data
    if DO_SMALL:
        csvFilenameTest = 'covtype.data'
        hex_key = 'covtype1x.data.B.hex'
        dataKeyTest2 = 'covtype1x.data.C.hex'
    else:
        csvFilenameTest = 'covtype20x.data'
        hex_key = 'covtype20x.data.B.hex'
        dataKeyTest2 = 'covtype20x.data.C.hex'

    csvPathname = importFolderPath + "/" + csvFilenameTest
    parseResultTest = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=500)
    print "Parse result['destination_key']:", parseResultTest['destination_key']
    inspect = h2o_cmd.runInspect(key=parseResultTest['destination_key'])
    dataKeyTest = parseResultTest['destination_key']
    print "Parse end", dataKeyTest

    # make a 3rd key so the predict is uncached too!
    execExpr = dataKeyTest2 + "=" + dataKeyTest
    kwargs = {'str': execExpr, 'timeoutSecs': 15}
    resultExec = h2o_cmd.runExec(**kwargs)

    # train
    # this does RFView to understand when RF completes, so the time reported for RFView here, should be
    # considered the "first RFView" times..subsequent have some caching?.
    # unless the no_confusion_matrix works

    # params is mutable. This is default.
    # NOTE(review): paramDict is assigned (shadowing the module global)
    # but unused; the pickRandRfParams line that would consume it is
    # commented out below.
    paramDict = drf2ParamDict
    params = {
        'ntrees': 20,
        'destination_key': 'RF_model',
        'response': "C55",
    }
    # colX = h2o_rf.pickRandRfParams(paramDict, params)
    kwargs = params.copy()
    timeoutSecs = 30 + kwargs['ntrees'] * 60

    start = time.time()
    # No noPoll here: this call blocks until the RF job completes.
    h2o_cmd.runSpeeDRF(parseResult=parseResultTrain, timeoutSecs=timeoutSecs,
        retryDelaySecs=1, **kwargs)
    print "rf job end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    print "\nRFView start after job completion"
    model_key = kwargs['destination_key']
    ntree = kwargs['ntrees']
    start = time.time()
    # this does the RFModel view for v2. but only model_key is used. Data doesn't matter? (nor ntree)
    h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
    print "First rfview end on ", dataKeyTrain, 'took', time.time() - start, 'seconds'

    for trial in range(1):
        # scoring
        start = time.time()
        rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=timeoutSecs)
        print "rfview", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'
        # Rename the SpeeDRF response key so the DRF-style checker works.
        rfView["drf_model"] = rfView.pop("speedrf_model")
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=ntree)
        # delta=50 around 50 makes this effectively a smoke check (0..100 passes).
        self.assertAlmostEqual(classification_error, 50, delta=50,
            msg="Classification error %s differs too much" % classification_error)

        start = time.time()
        # Score against the exec-copied key so the predict is uncached.
        predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest2)
        print "predict", trial, "end on ", dataKeyTest, 'took', time.time() - start, 'seconds.'

        parseKey = parseResultTrain['destination_key']
        rfModelKey = rfView['drf_model']['_key']
        predictKey = 'Predict.hex'
        start = time.time()
        # NOTE(review): predictResult and pctWrong are unused; the Predict/CM
        # pass appears to be for the printed summary only.
        predictResult = h2o_cmd.runPredict(data_key=parseKey, model_key=rfModelKey,
            destination_key=predictKey, timeoutSecs=timeoutSecs)
        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=parseKey,
            vactual='C55',
            predict=predictKey,
            vpredict='predict',
        )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm);
        print "\nTest\n==========\n"
        print h2o_gbm.pp_cm(cm)
        print "Trial #", trial, "completed"
def test_rf_big1_nopoll_fvec(self):
    """Dispatch three SpeeDRF jobs back to back on hhp_107_01.data.gz,
    save each initial RFView response, then re-view each model and
    JsonDiff it against the first one.

    NOTE(review): this is a second definition with the same name as an
    earlier method in this file; the later definition silently shadows
    the earlier one — confirm intent.
    """
    h2o.beta_features = True
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    rfViewInitial = []  # one RFView response per dispatched job

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        model_key = ""
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'SRF_model'
        else:
            model_key = 'SRF_model' + str(jobDispatch)

        kwargs['ntrees'] = 1
        if OVERWRITE_RF_MODEL:
            print "Change the number of trees, while keeping the rf model key name the same"
            print "Checks that we correctly overwrite previous rf model"
            kwargs['ntrees'] += 1
        kwargs['seed'] = random.randint(0, sys.maxint)
        kwargs['response'] = "C107"

        # FIX! what model keys do these get?
        # Spread dispatches across a randomly chosen cluster node.
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        # NOTE(review): despite the method name, noPoll=False here, so each
        # dispatch blocks until complete — confirm intent.
        h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=False, **kwargs)
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch
        print "\n MODEL KEY: ", model_key
        rfViewInitial.append(h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60))

    # h2o_jobs.pollWaitJobs(pattern='SRF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

    # we saved the initial response?
    # if we do another poll they should be done now, and better to get it that
    # way rather than the inspect (to match what simpleCheckGLM is expected
    first = None
    print "rfViewInitial", rfViewInitial
    for rfView in rfViewInitial:
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        model_key = rfView["speedrf_model"]['_key']
        # NOTE(review): 'ntree' is extracted but never used below.
        ntree = rfView["speedrf_model"]["parameters"]['ntrees']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        if first is None:
            # we'll use this to compare the others
            first = rfViewResult.copy()
            firstModelKey = model_key
            print "first", h2o.dump_json(first)
        else:
            # NOTE(review): the diff is only printed here, never asserted on.
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, first, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
def test_rf_big1_overwrite_model_fvec(self):
    """Dispatch three SpeeDRF jobs on hhp_107_01.data.gz, waiting for and
    viewing each, and assert that successive RFView responses differ from
    the first (>= 30 JsonDiff differences).

    NOTE(review): this is a second definition with the same name as an
    earlier method in this file (which sets h2o.beta_features and passes
    'num_trees'; this one passes 'ntrees'); the later definition silently
    shadows the earlier one — confirm intent.
    """
    csvFilename = 'hhp_107_01.data.gz'
    hex_key = csvFilename + ".hex"
    print "\n" + csvFilename
    parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename, hex_key=hex_key,
        timeoutSecs=15, schema='put')
    firstRfView = None  # baseline RFView to diff the later ones against

    # dispatch multiple jobs back to back
    for jobDispatch in range(3):
        start = time.time()
        kwargs = {}
        if OVERWRITE_RF_MODEL:
            print "Since we're overwriting here, we have to wait for each to complete noPoll=False"
            model_key = 'RF_model'
        else:
            model_key = 'RF_model' + str(jobDispatch)

        print "Change the number of trees, while keeping the rf model key name the same"
        print "Checks that we correctly overwrite previous rf model"
        if OVERWRITE_RF_MODEL:
            kwargs['ntrees'] = 1 + jobDispatch
        else:
            kwargs['ntrees'] = 1
        # don't change the seed if we're overwriting the model. It should get
        # different results just from changing the tree count
        kwargs['seed'] = random.randint(0, sys.maxint)
        kwargs["response"] = "C107"

        # FIX! what model keys do these get?
        # Spread dispatches across a randomly chosen cluster node.
        randomNode = h2o.nodes[random.randint(0, len(h2o.nodes) - 1)]
        h2o_cmd.runSpeeDRF(node=randomNode, parseResult=parseResult, destination_key=model_key,
            timeoutSecs=300, noPoll=True, **kwargs)

        # FIX! are these already in there?
        # Build a stand-in view record for logging/key bookkeeping.
        rfView = {}
        rfView['_dataKey'] = hex_key
        rfView['_key'] = model_key
        print "rf job dispatch end on ", csvFilename, 'took', time.time() - start, 'seconds'
        print "\njobDispatch #", jobDispatch

        # we're going to compare rf results to previous as we go along (so we save rf view results
        h2o_jobs.pollWaitJobs(pattern='RF_model', timeoutSecs=300, pollTimeoutSecs=10, retryDelaySecs=5)

        # In this test we're waiting after each one, so we can save the RFView results for comparison to future
        print "Checking completed job:", rfView
        print "rfView", h2o.dump_json(rfView)
        # NOTE(review): 'data_key' is extracted but never used below.
        data_key = rfView['_dataKey']
        model_key = rfView['_key']
        print "Temporary hack: need to do two rf views minimum, to complete a RF (confusion matrix creation)"
        # allow it to poll to complete
        rfViewResult = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
        if firstRfView is None:
            # we'll use this to compare the others
            firstRfView = rfViewResult.copy()
            firstModelKey = model_key
            print "firstRfView", h2o.dump_json(firstRfView)
        else:
            print "Comparing", model_key, "to", firstModelKey
            df = h2o_util.JsonDiff(rfViewResult, firstRfView, vice_versa=True, with_values=True)
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertGreater(len(df.difference), 29,
                msg="Want >=30 , not %d differences between the two rfView json responses. %s" % \
                (len(df.difference), h2o.dump_json(df.difference)))