def test_GLM2_covtype_train(self):
    """Fit a binomial GLM2 on repeated 90/10 splits of covtype.shuffled.data.

    The file is parsed, copied to the key A.hex, and the output column
    (1-based index y+1) is rewritten in place to "class == 4". Each of the
    10 trials then creates a fresh train/test split, builds a GLM on the
    training key, predicts on the test key and asserts that the error
    summarized from the confusion matrix stays under 8 percent.
    """
    h2o.beta_features = True
    csvFilename = 'covtype.shuffled.data'
    csvPathname = "standard" + "/" + csvFilename
    hex_key = csvFilename + ".hex"

    # Parse and Exec ************************************************
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='put', hex_key=hex_key, timeoutSecs=180)
    h2e.exec_expr(execExpr="A.hex=%s" % parseResult['destination_key'], timeoutSecs=30)

    # use exec to change the output col to binary, case_mode/case_val doesn't
    # work if we use predict. will have to live with random extract, which
    # will create variance.
    # class 4 = 1, everything else 0
    y = 54
    h2e.exec_expr(execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 4), timeoutSecs=30)

    inspect = h2o_cmd.runInspect(key="A.hex")
    numRows = inspect['numRows']
    print("\n%s  numRows: %s  numCols: %s" % (
        csvPathname, "{0:,}".format(numRows), "{0:,}".format(inspect['numCols'])))

    # Split Test/Train ************************************************
    # Row counts per 10 pct. step. This table only feeds the per-trial log
    # line below; the split itself is always done by createTestTrain at 90%.
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(11)]
    # the last 10% can be slightly less than pct10 rows
    rowsForPct[10] = numRows - rowsForPct[9]
    # entry 0 would be zero rows; reuse entry 10 so trial 0 reports something
    rowsForPct[0] = rowsForPct[10]

    print("Creating the key of the last 10% data, for scoring")
    trainDataKey = "rTrain"
    testDataKey = "rTest"

    # GLM, predict, CM ************************************************
    responseCol = 'C' + str(y + 1)
    kwargs = {
        'response': responseCol,
        'max_iter': 20,
        'n_folds': 0,
        'alpha': 0.1,
        'lambda': 1e-5,
        'family': 'binomial',
    }
    timeoutSecs = 180
    # single source of truth for the threshold and its failure message
    maxPctWrong = 8

    for trial in range(10):
        rowsToUse = rowsForPct[trial % 10]

        # test/train split ******************************************
        h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey,
            testDstKey=testDataKey, trainPercent=90)
        # runGLM only needs the destination_key of a parse result
        aHack = {'destination_key': trainDataKey}

        # GLM *******************************************************
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180, **kwargs)
        print("glm end on  %s took %s seconds" % (
            parseResult['destination_key'], time.time() - start))
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        modelKey = glm['glm_model']['_key']

        # Score *****************************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runPredict(
            data_key=testDataKey,
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=testDataKey,
            vactual=responseCol,
            predict=predictKey,
            vpredict='predict',
            )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        self.assertLess(pctWrong, maxPctWrong,
            "Should see less than %d%% error (class = 4)" % maxPctWrong)

        print("\nTest\n==========\n")
        print(h2o_gbm.pp_cm(cm))
        print("Trial # %s completed using %6.2f pct. of all rows" % (
            trial, rowsToUse * 100.0 / numRows))
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Train RF on a random 90% split of a covtype file and score the other 10%.

    Reads the module-level paramDict (RF parameters, must contain 'ntrees'),
    DO_MULTINOMIAL and ALLOWED_DELTA.

    Args:
        csvFilename: file name under "standard" in the home-0xdiag-datasets bucket.
        checkExpectedResults: when true, assert that the out-of-bag and the
            scoring pct. right are within ALLOWED_DELTA of the expected tables.
            The expected results are only for the shuffled version, since
            10% samples of the smallish dataset vary between shuffled and
            non-shuffled datasets.

    Returns:
        The RFView result obtained when scoring the test split.
    """
    def printSummary(listName, expectedList, actualList):
        # zip() stops at the shorter list, so only completed trials are compared
        deltas = [abs(e - a) for e, a in zip(expectedList, actualList)]
        print("maybe should update with actual. Remove single quotes")
        print("%s = %s" % (listName, ["{0:0.2f}".format(v) for v in actualList]))
        print("actualDelta = %s" % (["{0:0.2f}".format(v) for v in deltas],))

    csvPathname = "standard" + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    # import_parse was asked to write to hex_key, so inspect that key directly
    inspect = h2o_cmd.runInspect(key=hex_key)
    numCols = inspect['numCols']
    numRows = inspect['numRows']
    print("\n%s  numRows: %s  numCols: %s" % (
        csvPathname, "{0:,}".format(numRows), "{0:,}".format(numCols)))

    # Row counts per 10 pct. step; only used for the per-trial log line now
    # (it used to drive slicing).
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(11)]
    rowsForPct[10] = numRows
    # entry 0 would be zero rows; reuse entry 10
    rowsForPct[0] = rowsForPct[10]

    # index 0 isn't used
    # NOTE(review): both tables are indexed by trial, and the failure messages
    # label entry `trial` as "trial*10 pct. training", yet the split below is
    # fixed at 90%. Confirm the expectations still apply before relying on
    # checkExpectedResults=True.
    expectTrainPctRightList = [0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79]
    expectScorePctRightList = [0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78]

    # keep the 0 entry empty so positions line up with the expected tables
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    dataKeyTrain = 'rTrain.hex'
    dataKeyTest = 'rTest.hex'
    rfvScoring = None

    for trial, _rowPct in enumerate([0.9], 1):
        rowsToUse = rowsForPct[trial % 10]

        # just do random split for now
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90,
            outputClass=4, outputCol=numCols - 1, changeToBinomial=not DO_MULTINOMIAL)
        # runRF only needs the destination_key of a parse result
        sliceResult = {'destination_key': dataKeyTrain}

        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + kwargs['ntrees'] * 20

        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print("RF end on  %s took %s seconds. %d pct. of timeout" % (
            csvPathname, elapsed, (elapsed / timeoutSecs) * 100))

        # error is subtracted from 100 below, i.e. treated as a percentage
        error, _, _ = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % (
                    trial * 10, oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print("Now score on the last 10%. Note this is silly if we trained on 100% of the data")
        print("Or sorted by output class, so that the last 10% is the last few classes")
        rf_model = rfv['drf_model']
        used_trees = rf_model['N']
        model_key = rf_model['_key']

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees,
            timeoutSecs, retryDelaySecs=1, **kwargs)
        error, _, _ = h2o_rf.simpleCheckRFView(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % (
                    trial * 10, fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print("Trial # %s completed using %6.2f pct. of all rows" % (
            trial, rowsToUse * 100.0 / numRows))

    printSummary("actualTrainPctRightList", expectTrainPctRightList, actualTrainPctRightList)
    printSummary("actualScorePctRightList", expectScorePctRightList, actualScorePctRightList)

    return rfvScoring
def test_GLM2_covtype20x_train(self):
    """Fit a binomial GLM2 on repeated 90/10 splits of covtype20x.data.

    The file is parsed, copied to the key A.hex, and the output column
    (1-based index y+1) is rewritten in place to "class == 4". Each of the
    100 trials then creates a fresh train/test split, builds a GLM on the
    training key, predicts on the test key and asserts that the error
    summarized from the confusion matrix stays under 8 percent.
    """
    h2o.beta_features = True
    csvFilename = 'covtype20x.data'
    csvPathname = "standard" + "/" + csvFilename
    hex_key = csvFilename + ".hex"

    # Parse and Exec ************************************************
    parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        schema='put', hex_key=hex_key, timeoutSecs=180)
    h2e.exec_expr(execExpr="A.hex=%s" % parseResult['destination_key'], timeoutSecs=30)

    # use exec to change the output col to binary, case_mode/case_val doesn't
    # work if we use predict. will have to live with random extract, which
    # will create variance.
    # class 4 = 1, everything else 0
    y = 54
    h2e.exec_expr(execExpr="A.hex[,%s]=(A.hex[,%s]==%s)" % (y + 1, y + 1, 4), timeoutSecs=30)

    inspect = h2o_cmd.runInspect(key="A.hex")
    numRows = inspect['numRows']
    print("\n%s  numRows: %s  numCols: %s" % (
        csvPathname, "{0:,}".format(numRows), "{0:,}".format(inspect['numCols'])))

    # Split Test/Train ************************************************
    # Row counts per 10 pct. step. This table only feeds the per-trial log
    # line below; the split itself is always done by createTestTrain at 90%.
    pct10 = int(numRows * .1)
    rowsForPct = [i * pct10 for i in range(11)]
    # the last 10% can be slightly less than pct10 rows
    rowsForPct[10] = numRows - rowsForPct[9]
    # entry 0 would be zero rows; reuse entry 10 so trial 0 reports something
    rowsForPct[0] = rowsForPct[10]

    print("Creating the key of the last 10% data, for scoring")
    trainDataKey = "rTrain"
    testDataKey = "rTest"

    # GLM, predict, CM ************************************************
    # The response has to be the column binarized by the exec above, which is
    # addressed there by its 1-based index y+1. This previously used
    # 'C' + str(y), one column earlier.
    # NOTE(review): assumes headerless columns are named C1..Cn (1-based) in
    # the H2O build under test -- confirm.
    responseCol = 'C' + str(y + 1)
    kwargs = {
        'response': responseCol,
        'max_iter': 20,
        'n_folds': 0,
        'alpha': 0.1,
        'lambda': 1e-5,
        'family': 'binomial',
        'classification': 1,
    }
    timeoutSecs = 60
    # single source of truth for the threshold and its failure message
    maxPctWrong = 8

    for trial in range(100):
        rowsToUse = rowsForPct[trial % 10]

        # test/train split ******************************************
        h2o_cmd.createTestTrain(srcKey='A.hex', trainDstKey=trainDataKey,
            testDstKey=testDataKey, trainPercent=90)
        # runGLM only needs the destination_key of a parse result
        aHack = {'destination_key': trainDataKey}

        # GLM *******************************************************
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=aHack, timeoutSecs=timeoutSecs,
            pollTimeoutSecs=180, **kwargs)
        print("glm end on  %s took %s seconds" % (
            parseResult['destination_key'], time.time() - start))
        h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
        modelKey = glm['glm_model']['_key']

        # Score *****************************************************
        predictKey = 'Predict.hex'
        h2o_cmd.runPredict(
            data_key=testDataKey,
            model_key=modelKey,
            destination_key=predictKey,
            timeoutSecs=timeoutSecs)

        predictCMResult = h2o.nodes[0].predict_confusion_matrix(
            actual=testDataKey,
            vactual=responseCol,
            predict=predictKey,
            vpredict='predict',
            )
        cm = predictCMResult['cm']

        # These will move into the h2o_gbm.py
        pctWrong = h2o_gbm.pp_cm_summary(cm)
        self.assertLess(pctWrong, maxPctWrong,
            "Should see less than %d%% error (class = 4)" % maxPctWrong)

        print("\nTest\n==========\n")
        print(h2o_gbm.pp_cm(cm))
        print("Trial # %s completed using %6.2f pct. of all rows" % (
            trial, rowsToUse * 100.0 / numRows))
def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
    """Build an RF model on 90% of a covtype file, then score the held-out 10%.

    Uses the module-level paramDict (RF parameters, 'ntrees' is required),
    DO_MULTINOMIAL and ALLOWED_DELTA.

    Args:
        csvFilename: name of the file inside the "standard" folder of the
            home-0xdiag-datasets bucket.
        checkExpectedResults: if true, the out-of-bag pct. right and the
            scoring pct. right must be within ALLOWED_DELTA of the expected
            tables. Those tables only hold for the shuffled dataset, because
            10% samples differ between shuffled and non-shuffled files.

    Returns:
        The RFView result produced by scoring the test split.
    """
    def fmt2(values):
        # two-decimal strings, so the printed list can be pasted back as a table
        return ["{0:0.2f}".format(v) for v in values]

    def report(label, expected, actual):
        # zip() truncates to the shorter list: only trials that ran are compared
        delta = [abs(e - a) for e, a in zip(expected, actual)]
        print("maybe should update with actual. Remove single quotes")
        print("%s = %s" % (label, fmt2(actual)))
        print("actualDelta = %s" % (fmt2(delta),))

    csvPathname = "standard" + "/" + csvFilename
    hex_key = csvFilename + ".hex"
    h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
        hex_key=hex_key, timeoutSecs=180)
    # import_parse was asked to write to hex_key, so inspect that key directly
    inspect = h2o_cmd.runInspect(key=hex_key)
    numRows = inspect['numRows']
    numCols = inspect['numCols']
    print("\n%s  numRows: %s  numCols: %s" % (
        csvPathname, "{0:,}".format(numRows), "{0:,}".format(numCols)))

    # Rows per 10 pct. step. No longer used for slicing; it only feeds the
    # "Trial #" log line.
    pct10 = int(numRows * .1)
    rowsForPct = [step * pct10 for step in range(11)]
    rowsForPct[10] = numRows
    # slot 0 would be zero rows; reuse slot 10
    rowsForPct[0] = rowsForPct[10]

    # slot 0 isn't used
    # NOTE(review): the tables are looked up by trial, and the failure
    # messages describe entry `trial` as "trial*10 pct. training", but the
    # split below always trains on 90%. Verify the expectations are still
    # meaningful when checkExpectedResults=True.
    expectTrainPctRightList = [
        0, 85.16, 88.45, 90.24, 91.27, 92.03, 92.64, 93.11, 93.48, 93.79
    ]
    expectScorePctRightList = [
        0, 88.81, 91.72, 93.06, 94.02, 94.52, 95.09, 95.41, 95.77, 95.78
    ]

    # leading 0 keeps positions aligned with the expected tables
    actualTrainPctRightList = [0]
    actualScorePctRightList = [0]

    dataKeyTrain = 'rTrain.hex'
    dataKeyTest = 'rTest.hex'
    rfvScoring = None

    for trial, _rowPct in enumerate([0.9], 1):
        rowsToUse = rowsForPct[trial % 10]

        # just do random split for now
        h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest,
            trainPercent=90, outputClass=4, outputCol=numCols - 1,
            changeToBinomial=not DO_MULTINOMIAL)
        # runRF only reads destination_key from the parse result it is given
        sliceResult = {'destination_key': dataKeyTrain}

        kwargs = paramDict.copy()
        kwargs['destination_key'] = "model_" + csvFilename + "_" + str(trial)
        # adjust timeoutSecs with the number of trees
        timeoutSecs = 30 + kwargs['ntrees'] * 20

        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=sliceResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print("RF end on  %s took %s seconds. %d pct. of timeout" % (
            csvPathname, elapsed, (elapsed / timeoutSecs) * 100))

        # error is subtracted from 100 below, i.e. handled as a percentage
        error, _, _ = h2o_rf.simpleCheckRFView(rfv=rfv, **kwargs)
        oobeTrainPctRight = 100 - error
        if checkExpectedResults:
            self.assertAlmostEqual(oobeTrainPctRight, expectTrainPctRightList[trial],
                msg="OOBE: pct. right for %s pct. training not close enough %6.2f %6.2f" % (
                    trial * 10, oobeTrainPctRight, expectTrainPctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualTrainPctRightList.append(oobeTrainPctRight)

        print("Now score on the last 10%. Note this is silly if we trained on 100% of the data")
        print("Or sorted by output class, so that the last 10% is the last few classes")
        rf_model = rfv['drf_model']
        model_key = rf_model['_key']
        used_trees = rf_model['N']

        rfvScoring = h2o_cmd.runRFView(None, dataKeyTest, model_key, used_trees,
            timeoutSecs, retryDelaySecs=1, **kwargs)
        error, _, _ = h2o_rf.simpleCheckRFView(rfv=rfvScoring, **kwargs)
        fullScorePctRight = 100 - error

        h2o.nodes[0].generate_predictions(model_key=model_key, data_key=dataKeyTest)

        if checkExpectedResults:
            self.assertAlmostEqual(fullScorePctRight, expectScorePctRightList[trial],
                msg="Full: pct. right for scoring after %s pct. training not close enough %6.2f %6.2f" % (
                    trial * 10, fullScorePctRight, expectScorePctRightList[trial]),
                delta=ALLOWED_DELTA)
        actualScorePctRightList.append(fullScorePctRight)

        print("Trial # %s completed using %6.2f pct. of all rows" % (
            trial, rowsToUse * 100.0 / numRows))

    report("actualTrainPctRightList", expectTrainPctRightList, actualTrainPctRightList)
    report("actualScorePctRightList", expectScorePctRightList, actualScorePctRightList)

    return rfvScoring