def test4(self):
    """ include thresholding

    Screens the 'ferro_quant' table with ``threshold = 0.80`` and with both
    ``doHoldout`` and ``doTraining`` switched off, then compares the values
    returned by ``ScreenComposite.ScreenFromDetails`` with stored references.
    """
    self.details.tableName = 'ferro_quant'
    self.details.threshold = 0.80
    self.details.doHoldout = 0
    self.details.doTraining = 0
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 7
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 87, str(nGood)
    assert misCount == 1
    assert nSkipped == 7, nSkipped
    assert feq(avgGood, 1.0), avgGood
    assert feq(avgBad, 1.000), avgBad
    assert feq(avgSkip, .7571), avgSkip
    assert tbl[0, 0] == 50
    assert tbl[1, 1] == 37
    assert tbl[0, 1] == 1
    assert tbl[1, 0] == 0
def test11_filtering_segmentation(self):
    # filtering with segmentation: doHoldout = 1, filterVal = 1, filterFrac = .33
    self.details.tableName = 'ferro_noquant'
    # The pickle is read in text mode so that CRLF line endings can be
    # normalised to LF before the bytes are handed to pickle.  The ``with``
    # statement closes the file; no explicit close() is needed.
    with open(os.path.join(self.baseDir, 'ferromag_filt_10_3.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    self.assertEqual(len(compos), tgt)
    self.details.doHoldout = 1
    self.details.filterVal = 1
    self.details.filterFrac = .33
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 37)
    self.assertEqual(misCount, 6)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .95946, 4)
    self.assertAlmostEqual(avgBad, .85, 4)
    self.assertAlmostEqual(avgSkip, 0, 4)
    self.assertEqual(tbl[0, 0], 14)
    self.assertEqual(tbl[1, 1], 23)
    self.assertEqual(tbl[0, 1], 1)
    self.assertEqual(tbl[1, 0], 5)
def test6_multiple_models(self):
    # multiple models: the same composite is passed twice in a list, and the
    # per-model results are checked by index
    self.details.tableName = 'ferro_noquant'
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_auto_10_3.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    self.assertEqual(len(compos), tgt)
    composites = [compos, compos]
    tpl = ScreenComposite.ScreenFromDetails(composites, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = tpl
    self.assertEqual(nGood[0], 95)
    self.assertEqual(misCount[0], 8)
    self.assertEqual(nSkipped[0], 0)
    self.assertAlmostEqual(avgGood[0], .9684, 4)
    self.assertAlmostEqual(avgBad[0], .8375, 4)
    self.assertAlmostEqual(avgSkip[0], 0.0, 4)
    self.assertEqual(nGood[1], 0)
    self.assertEqual(misCount[1], 0)
    self.assertEqual(nSkipped[1], 0)
    self.assertEqual(avgGood[1], 0)
    self.assertEqual(avgBad[1], 0)
    self.assertEqual(avgSkip[1], 0)
    self.assertEqual(tbl[0, 0], 50)
    self.assertEqual(tbl[1, 1], 45)
    self.assertEqual(tbl[0, 1], 5)
    self.assertEqual(tbl[1, 0], 3)
def test2_include_holdout(self):
    # include holdout data only: doHoldout = 1, doTraining = 0
    self.details.tableName = 'ferro_quant'
    self.details.doHoldout = 1
    self.details.doTraining = 0
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 5
    self.assertEqual(len(compos), tgt)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 28)
    self.assertEqual(misCount, 1)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9964, 4)
    self.assertAlmostEqual(avgBad, 1.000, 4)
    self.assertAlmostEqual(avgSkip, 0, 4)
    self.assertEqual(tbl[0, 0], 16)
    self.assertEqual(tbl[1, 1], 12)
    self.assertEqual(tbl[0, 1], 1)
    self.assertEqual(tbl[1, 0], 0)
def test6(self):
    """ multiple models

    Passes a list holding the same composite twice to
    ``ScreenComposite.ScreenFromDetails`` and checks the per-model results
    (indices 0 and 1) and the returned table against stored references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_auto_10_3.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    composites = [compos, compos]
    tpl = ScreenComposite.ScreenFromDetails(composites, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = tpl
    assert feq(nGood[0], 93), nGood
    assert feq(misCount[0], 10)
    assert feq(nSkipped[0], 0)
    assert feq(avgGood[0], .9699), avgGood
    assert feq(avgBad[0], .8100), avgBad
    assert feq(nGood[1], 0)
    assert feq(misCount[1], 0)
    assert feq(nSkipped[1], 0)
    assert feq(avgGood[1], 0)
    assert feq(avgBad[1], 0)
    assert feq(tbl[0, 0], 48), tbl
    assert feq(tbl[1, 1], 45)
    assert feq(tbl[0, 1], 7)
    assert feq(tbl[1, 0], 3)
def setUp(self):
    """Set up the test-data directory, the test database name and a default
    details object whose database settings come from ``RDConfig``."""
    self.baseDir = os.path.join(RDConfig.RDCodeDir, 'ML', 'test_data')
    self.dbName = RDConfig.RDTestDatabase

    # start from the ScreenComposite defaults, then point them at the test DB
    runDetails = ScreenComposite.SetDefaults()
    runDetails.dbName = self.dbName
    runDetails.dbUser = RDConfig.defaultDBUser
    runDetails.dbPassword = RDConfig.defaultDBPassword
    self.details = runDetails
def test9_shuffle_segmentation2(self):
    # shuffle with segmentation2: shuffleActivities = 1, doTraining = 1
    self.details.tableName = 'ferro_noquant'
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    self.assertEqual(len(compos), tgt)
    self.details.shuffleActivities = 1
    self.details.doTraining = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 31)
    self.assertEqual(misCount, 41)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .7161, 4)
    self.assertAlmostEqual(avgBad, .7707, 4)
    self.assertAlmostEqual(avgSkip, 0.0, 4)
    self.assertEqual(tbl[0, 0], 18)
    self.assertEqual(tbl[1, 1], 13)
    self.assertEqual(tbl[0, 1], 19)
    self.assertEqual(tbl[1, 0], 22)
def test3_include_training(self):
    # include training data only: doHoldout = 0, doTraining = 1
    self.details.tableName = 'ferro_quant'
    self.details.doHoldout = 0
    self.details.doTraining = 1
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 5
    self.assertEqual(len(compos), tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt))
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 65)
    self.assertEqual(misCount, 1)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .98307, 4)
    self.assertAlmostEqual(avgBad, 0.600, 4)
    self.assertAlmostEqual(avgSkip, 0, 4)
    self.assertEqual(tbl[0, 0], 38, tbl)
    self.assertEqual(tbl[1, 1], 27)
    self.assertEqual(tbl[0, 1], 1)
    self.assertEqual(tbl[1, 0], 0)
def test7(self):
    """Shuffle: screen the shuffle composite with ``shuffleActivities`` set."""
    self.details.tableName = 'ferro_noquant'
    pklPath = os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 10)

    self.details.shuffleActivities = 1
    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 50)
    self.assertEqual(misCount, 53)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .7380, places=4)
    self.assertAlmostEqual(avgBad, .7660, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 30), ((1, 1), 20), ((0, 1), 25), ((1, 0), 28)):
        self.assertEqual(table[cell], expected)
def test1(self):
    """Basics: screen the quantized composite with the default details."""
    self.details.tableName = 'ferro_quant'
    pklPath = os.path.join(self.baseDir, 'ferromag_quant_10.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 5)

    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 93)
    self.assertEqual(misCount, 2)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9871, places=4)
    self.assertAlmostEqual(avgBad, .8000, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 54), ((1, 1), 39), ((0, 1), 2), ((1, 0), 0)):
        self.assertEqual(table[cell], expected)
def test12(self):
    """Naive Bayes composite: screen it with ``doHoldout`` set."""
    self.details.tableName = 'ferro_noquant'
    pklPath = os.path.join(self.baseDir, 'ferromag_NaiveBayes.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 10)

    self.details.doHoldout = 1
    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 25)
    self.assertEqual(misCount, 6)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, 0.9800, places=4)
    self.assertAlmostEqual(avgBad, 0.86667, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 9), ((0, 1), 6), ((1, 0), 0), ((1, 1), 16)):
        self.assertEqual(table[cell], expected)
def test12(self):
    """ test the naive bayes composite

    Loads 'ferromag_NaiveBayes.pkl', screens it with ``doHoldout = 1`` and
    compares the returned values with stored references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_NaiveBayes.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    self.details.doHoldout = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 27, nGood
    assert misCount == 4, misCount
    assert nSkipped == 0, nSkipped
    assert feq(avgGood, 0.9407), avgGood
    assert feq(avgBad, 0.875), avgBad
    assert tbl[0, 0] == 11, tbl
    assert tbl[0, 1] == 4
    assert tbl[1, 0] == 0
    assert tbl[1, 1] == 16
def test9(self):
    """Shuffle with segmentation2: ``shuffleActivities`` and ``doTraining`` set."""
    self.details.tableName = 'ferro_noquant'
    pklPath = os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 10)

    self.details.shuffleActivities = 1
    self.details.doTraining = 1
    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 31)
    self.assertEqual(misCount, 41)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .7161, places=4)
    self.assertAlmostEqual(avgBad, .7707, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 18), ((1, 1), 13), ((0, 1), 19), ((1, 0), 22)):
        self.assertEqual(table[cell], expected)
def test1(self):
    """ basics

    Loads 'ferromag_quant_10.pkl', screens it against 'ferro_quant' with the
    default details and compares the returned values with stored references.
    """
    self.details.tableName = 'ferro_quant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 7
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 93
    assert misCount == 2
    assert nSkipped == 0
    assert feq(avgGood, .9849), avgGood
    assert feq(avgBad, .8500), avgBad
    assert tbl[0, 0] == 54, tbl
    assert tbl[1, 1] == 39
    assert tbl[0, 1] == 2
    assert tbl[1, 0] == 0
def test7(self):
    """ shuffle

    Loads 'ferromag_shuffle_10_3.pkl', screens it with
    ``shuffleActivities = 1`` and compares the returned values with stored
    references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    self.details.shuffleActivities = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 50, nGood
    assert misCount == 53
    assert nSkipped == 0
    assert feq(avgGood, .7380), avgGood
    assert feq(avgBad, .7660), avgBad
    assert tbl[0, 0] == 30, tbl
    assert tbl[1, 1] == 20
    assert tbl[0, 1] == 25
    assert tbl[1, 0] == 28
def test3(self):
    """Training data only: ``doHoldout`` off, ``doTraining`` on."""
    self.details.tableName = 'ferro_quant'
    self.details.doHoldout = 0
    self.details.doTraining = 1
    pklPath = os.path.join(self.baseDir, 'ferromag_quant_10.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    expectedLen = 5
    sizeMsg = 'bad composite loaded: %d != %d' % (len(composite), expectedLen)
    self.assertEqual(len(composite), expectedLen, sizeMsg)

    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 65)
    self.assertEqual(misCount, 1)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .98307, places=4)
    self.assertAlmostEqual(avgBad, 0.600, places=4)
    # the first cell check reports the whole table on failure
    self.assertEqual(table[0, 0], 38, table)
    for cell, expected in (((1, 1), 27), ((0, 1), 1), ((1, 0), 0)):
        self.assertEqual(table[cell], expected)
def test2(self):
    """Hold-out data only: ``doHoldout`` on, ``doTraining`` off."""
    self.details.tableName = 'ferro_quant'
    self.details.doHoldout = 1
    self.details.doTraining = 0
    pklPath = os.path.join(self.baseDir, 'ferromag_quant_10.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 5)

    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 28)
    self.assertEqual(misCount, 1)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9964, places=4)
    self.assertAlmostEqual(avgBad, 1.000, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 16), ((1, 1), 12), ((0, 1), 1), ((1, 0), 0)):
        self.assertEqual(table[cell], expected)
def test10(self):
    """Filtering: screen with ``filterVal = 1`` and ``filterFrac = .33``."""
    self.details.tableName = 'ferro_noquant'
    pklPath = os.path.join(self.baseDir, 'ferromag_filt_10_3.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 10)

    self.details.filterVal = 1
    self.details.filterFrac = .33
    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 90)
    self.assertEqual(misCount, 13)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9578, places=4)
    self.assertAlmostEqual(avgBad, .8538, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 54), ((1, 1), 36), ((0, 1), 1), ((1, 0), 12)):
        self.assertEqual(table[cell], expected)
def test5(self):
    """ basics

    Loads 'ferromag_auto_10_3.pkl', screens it against 'ferro_noquant' with
    the default details and compares the returned values with stored
    references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_auto_10_3.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    tpl = ScreenComposite.ScreenFromDetails(compos, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = tpl
    assert nGood == 93, nGood
    assert misCount == 10
    assert nSkipped == 0
    assert feq(avgGood, .9699), avgGood
    assert feq(avgBad, .8100), avgBad
    assert tbl[0, 0] == 48, tbl
    assert tbl[1, 1] == 45
    assert tbl[0, 1] == 7
    assert tbl[1, 0] == 3
def test5(self):
    """Basics: screen the auto composite against the unquantized table."""
    self.details.tableName = 'ferro_noquant'
    pklPath = os.path.join(self.baseDir, 'ferromag_auto_10_3.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 10)

    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 95)
    self.assertEqual(misCount, 8)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9684, places=4)
    self.assertAlmostEqual(avgBad, .8375, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 50), ((1, 1), 45), ((0, 1), 5), ((1, 0), 3)):
        self.assertEqual(table[cell], expected)
def test1(self):
    """ basics

    Loads 'ferromag_quant_10.pkl', screens it against 'ferro_quant' with the
    default details and compares the returned values with stored references.
    """
    self.details.tableName = 'ferro_quant'
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 5
    self.assertEqual(len(compos), tgt)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 93)
    self.assertEqual(misCount, 2)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .9871, 4)
    self.assertAlmostEqual(avgBad, .8000, 4)
    self.assertEqual(tbl[0, 0], 54)
    self.assertEqual(tbl[1, 1], 39)
    self.assertEqual(tbl[0, 1], 2)
    self.assertEqual(tbl[1, 0], 0)
def test10(self):
    """ filtering

    Loads 'ferromag_filt_10_3.pkl', screens it with ``filterVal = 1`` and
    ``filterFrac = .33`` and compares the returned values with stored
    references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_filt_10_3.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    self.details.filterVal = 1
    self.details.filterFrac = .33
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 90
    assert misCount == 13
    assert nSkipped == 0
    assert feq(avgGood, .9578)
    assert feq(avgBad, .8538)
    assert tbl[0, 0] == 54
    assert tbl[1, 1] == 36
    assert tbl[0, 1] == 1
    assert tbl[1, 0] == 12
def test9(self):
    """ shuffle with segmentation2

    Loads 'ferromag_shuffle_10_3.pkl', screens it with
    ``shuffleActivities = 1`` and ``doTraining = 1`` and compares the
    returned values with stored references.
    """
    self.details.tableName = 'ferro_noquant'
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    self.details.shuffleActivities = 1
    self.details.doTraining = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 31, nGood
    assert misCount == 41
    assert nSkipped == 0
    assert feq(avgGood, .7161), avgGood
    assert feq(avgBad, .7707), avgBad
    assert tbl[0, 0] == 18, tbl
    assert tbl[1, 1] == 13
    assert tbl[0, 1] == 19
    assert tbl[1, 0] == 22
def test4(self):
    """Thresholding: threshold 0.80 with ``doHoldout`` and ``doTraining`` off."""
    self.details.tableName = 'ferro_quant'
    self.details.threshold = 0.80
    self.details.doHoldout = 0
    self.details.doTraining = 0
    pklPath = os.path.join(self.baseDir, 'ferromag_quant_10.pkl')
    with open(pklPath, 'rb') as inF:
        composite = pickle.load(inF)
    self.assertEqual(len(composite), 5)

    res = ScreenComposite.ScreenFromDetails(composite, self.details)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, table = res

    self.assertEqual(nGood, 91)
    self.assertEqual(misCount, 1)
    self.assertEqual(nSkipped, 3)
    self.assertAlmostEqual(avgGood, 0.9956, places=4)
    self.assertAlmostEqual(avgBad, 1.000, places=4)
    self.assertAlmostEqual(avgSkip, 0.6000, places=4)
    # (row, col) index -> reference value, checked in the listed order
    for cell, expected in (((0, 0), 54), ((1, 1), 37), ((0, 1), 1), ((1, 0), 0)):
        self.assertEqual(table[cell], expected)
def test3(self):
    """ include training data only

    Loads 'ferromag_quant_10.pkl', screens it with ``doHoldout = 0`` and
    ``doTraining = 1`` and compares the returned values with stored
    references.
    """
    self.details.tableName = 'ferro_quant'
    self.details.doHoldout = 0
    self.details.doTraining = 1
    # context manager: the pickle file is closed as soon as it has been read
    with open(os.path.join(self.baseDir, 'ferromag_quant_10.pkl'), 'rb') as pklF:
        compos = pickle.load(pklF)
    tgt = 7
    assert len(compos) == tgt, 'bad composite loaded: %d != %d' % (len(compos), tgt)
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    assert nGood == 65
    assert misCount == 1
    assert nSkipped == 0
    assert feq(avgGood, .9846), avgGood
    assert feq(avgBad, .7000), avgBad
    assert tbl[0, 0] == 38, tbl
    assert tbl[1, 1] == 27
    assert tbl[0, 1] == 1
    assert tbl[1, 0] == 0
def test12_naiveBayes_composite(self):
    # test the naive bayes composite (doHoldout = 1)
    self.details.tableName = 'ferro_noquant'
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_NaiveBayes.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    self.assertEqual(len(compos), tgt)
    self.details.doHoldout = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 25)
    self.assertEqual(misCount, 6)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, 0.9800, 4)
    self.assertAlmostEqual(avgBad, 0.86667, 4)
    self.assertAlmostEqual(avgSkip, 0, 4)
    self.assertEqual(tbl[0, 0], 9)
    self.assertEqual(tbl[0, 1], 6)
    self.assertEqual(tbl[1, 0], 0)
    self.assertEqual(tbl[1, 1], 16)
def test7_shuffle(self):
    # shuffle (shuffleActivities = 1)
    self.details.tableName = 'ferro_noquant'
    # Text-mode read so CRLF line endings can be normalised to LF before
    # unpickling; the ``with`` statement closes the file on exit.
    with open(os.path.join(self.baseDir, 'ferromag_shuffle_10_3.pkl'), 'r') as pklTF:
        buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pklF:
        compos = pickle.load(pklF)
    tgt = 10
    self.assertEqual(len(compos), tgt)
    self.details.shuffleActivities = 1
    nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, tbl = ScreenComposite.ScreenFromDetails(
        compos, self.details)
    self.assertEqual(nGood, 50)
    self.assertEqual(misCount, 53)
    self.assertEqual(nSkipped, 0)
    self.assertAlmostEqual(avgGood, .7380, 4)
    self.assertAlmostEqual(avgBad, .7660, 4)
    self.assertAlmostEqual(avgSkip, 0, 4)
    self.assertEqual(tbl[0, 0], 30)
    self.assertEqual(tbl[1, 1], 20)
    self.assertEqual(tbl[0, 1], 25)
    self.assertEqual(tbl[1, 0], 28)
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
    """ builds a composite model from a data set and reports its errors

    **Arguments**

      - details: the run-details object; its attributes (lockRandom, splitRun,
        filterFrac, useTrees, nModels, ...) select what is done below.

      - data: the data set; must provide GetNamedData(), GetNVars(),
        GetNPossibleVals(), GetVarNames() and GetNPts().

      - progressCallback: (optional) forwarded to _composite.Grow()_ by the
        tree and signature-tree builders.

      - saveIt: (optional) if nonzero, the composite is pickled to
        _details.outName_.

      - setDescNames: (optional) if nonzero, the composite's input order is set
        from the data set's variable names and its descriptor names from
        _details._descNames_; otherwise the descriptor names come from the
        data set's variable names.

    **Returns**

      the composite model

    **Side effects**

      - _details.model_ is set to a binary holder of the pickled composite and
        _details.internalHoldoutFrac_ is reset to 0.0 when only one model is
        requested.
      - error statistics are stored on the module-level _runDetails object.
      - optionally writes _details.pickleDataFileName_, _details.badName_ and
        the persistent results table.

    """
    if details.lockRandom:
        seed = details.randomSeed
    else:
        import random
        # randint() needs integer bounds; the float literal 1e6 is rejected by
        # recent Python versions
        seed = (random.randint(0, 1000000), random.randint(0, 1000000))
    DataUtils.InitRandomNumbers(seed)

    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    if details.splitRun == 1:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                                   silent=not _verbose)
        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
    else:
        testExamples = []
        trainExamples = namedExamples

    if details.filterFrac != 0.0:
        # if we're doing quantization on the fly, we need to handle that here:
        if hasattr(details, 'activityBounds') and details.activityBounds:
            bounds = details.activityBounds
            tExamples = []
            for pt in trainExamples:
                pt = pt[:]  # work on a copy so the training example is untouched
                act = pt[-1]
                # the quantized activity is the index of the first bound that
                # the raw activity is below, or len(bounds) if there is none
                bound = 0
                while bound < len(bounds) and not (act < bounds[bound]):
                    bound += 1
                pt[-1] = bound
                tExamples.append(pt)
        else:
            bounds = None
            tExamples = trainExamples
        trainIdx, temp = DataUtils.FilterData(tExamples, details.filterVal, details.filterFrac, -1,
                                              indicesOnly=1)
        tmp = [trainExamples[x] for x in trainIdx]
        # the examples in the second index list join the test set
        testExamples += [trainExamples[x] for x in temp]
        trainExamples = tmp

        counts = DataUtils.CountResults(trainExamples, bounds=bounds)
        message('Result Counts in training set:')
        # sorted() works whether keys() returns a list or a view
        for k in sorted(counts.keys()):
            message(str((k, counts[k])))
        counts = DataUtils.CountResults(testExamples, bounds=bounds)
        message('Result Counts in test set:')
        for k in sorted(counts.keys()):
            message(str((k, counts[k])))
    nExamples = len(trainExamples)
    message('Training with %d examples' % (nExamples))

    nVars = data.GetNVars()
    # must be a real list: entries are removed below
    attrs = list(range(1, nVars + 1))
    nPossibleVals = data.GetNPossibleVals()
    for i in range(1, len(nPossibleVals)):
        # variables whose possible-value count is -1 are not used as attributes
        if nPossibleVals[i - 1] == -1:
            attrs.remove(i)

    if details.pickleDataFileName != '':
        with open(details.pickleDataFileName, 'wb+') as pickleDataFile:
            cPickle.dump(trainExamples, pickleDataFile)
            cPickle.dump(testExamples, pickleDataFile)

    if details.bayesModel:
        composite = BayesComposite.BayesComposite()
    else:
        composite = Composite.Composite()

    composite._randomSeed = seed
    composite._splitFrac = details.splitFrac
    composite._shuffleActivities = details.shuffleActivities
    composite._randomizeActivities = details.randomActivities

    if hasattr(details, 'filterFrac'):
        composite._filterFrac = details.filterFrac
    if hasattr(details, 'filterVal'):
        composite._filterVal = details.filterVal

    composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

    composite.SetActivityQuantBounds(details.activityBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
        nPossibleVals[-1] = len(details.activityBounds) + 1

    if setDescNames:
        composite.SetInputOrder(data.GetVarNames())
        composite.SetDescriptorNames(details._descNames)
    else:
        composite.SetDescriptorNames(data.GetVarNames())
    composite.SetActivityQuantBounds(details.activityBounds)
    if details.nModels == 1:
        details.internalHoldoutFrac = 0.0
    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        composite.SetQuantBounds(details.qBounds)
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                       pruner=pruner, nTries=details.nModels, pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy, needsQuantization=0, treeBuilder=builder,
                       nQuantBounds=details.qBounds, startAt=details.startAt,
                       maxDepth=details.limitDepth, progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors, silent=not _verbose)

    elif details.useSigTrees:
        from rdkit.ML.DecTree import CrossValidate
        from rdkit.ML.DecTree import BuildSigTree
        builder = BuildSigTree.SigTreeBuilder
        driver = CrossValidate.CrossValidationDriver
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        # optional settings fall back to their defaults when absent
        biasList = getattr(details, 'sigTreeBiasList', None)
        useCMIM = getattr(details, 'useCMIM', 0)
        allowCollections = getattr(details, 'allowCollections', False)
        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                       nTries=details.nModels, needsQuantization=0, treeBuilder=builder,
                       maxDepth=details.limitDepth, progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors, biasList=biasList,
                       useCMIM=useCMIM, allowCollection=allowCollections, silent=not _verbose)

    elif details.useKNN:
        from rdkit.ML.KNN import CrossValidate
        from rdkit.ML.KNN import DistFunctions

        driver = CrossValidate.CrossValidationDriver
        if (details.knnDistFunc == "Euclidean"):
            dfunc = DistFunctions.EuclideanDist
        elif (details.knnDistFunc == "Tanimoto"):
            dfunc = DistFunctions.TanimotoDist
        else:
            # raised explicitly so the check survives ``python -O``
            raise AssertionError("Bad KNN distance metric value")

        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                       nTries=details.nModels, needsQuantization=0, numNeigh=details.knnNeighs,
                       holdOutFrac=details.internalHoldoutFrac, distFunc=dfunc)

    elif details.useNaiveBayes or details.useSigBayes:
        from rdkit.ML.NaiveBayes import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        if not getattr(details, 'useSigBayes', False):
            composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                           nQuantBounds=details.qBounds, holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           mEstimateVal=details.mEstimateVal, silent=not _verbose)
        else:
            useCMIM = getattr(details, 'useCMIM', 0)
            composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                           nQuantBounds=details.qBounds, mEstimateVal=details.mEstimateVal,
                           useSigs=True, useCMIM=useCMIM,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           silent=not _verbose)

    ## elif details.useSVM:
    ##   from rdkit.ML.SVM import CrossValidate
    ##   driver = CrossValidate.CrossValidationDriver
    ##   composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
    ##                  buildDriver=driver, nTries=details.nModels,
    ##                  needsQuantization=0,
    ##                  cost=details.svmCost,gamma=details.svmGamma,
    ##                  weights=details.svmWeights,degree=details.svmDegree,
    ##                  type=details.svmType,kernelType=details.svmKernel,
    ##                  coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
    ##                  cache_size=details.svmCache,shrinking=details.svmShrink,
    ##                  dataType=details.svmDataType,
    ##                  holdOutFrac=details.internalHoldoutFrac,
    ##                  replacementSelection=details.replacementSelection,
    ##                  silent=not _verbose)

    else:
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                       buildDriver=driver, needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    devs = numpy.sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # blow out the saved examples and then save the composite:
    composite.ClearModelExamples()
    if saveIt:
        composite.Pickle(details.outName)
    details.model = DbModule.binaryHolder(cPickle.dumps(composite))

    badExamples = []
    if not details.detailedRes and (not hasattr(details, 'noScreen') or not details.noScreen):
        if details.splitRun:
            message('Testing all hold-out examples')
            wrong = testall(composite, testExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong), 100. * float(len(wrong)) / float(len(testExamples))))
            _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
        else:
            message('Testing all examples')
            wrong = testall(composite, namedExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
            _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                                 nPossibleVals[-1], details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # NOTE(review): nClass - nPts is only positive when more points were
        # classified than exist; the sign looks inverted - confirm intent
        nRej = nClass - nPts
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

        if details.splitRun:
            message('\nHold-out data:')
            resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)), testExamples,
                                                     composite, nPossibleVals[-1],
                                                     details.threshold)
            nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
            nPts = len(testExamples)
            nClass = nGood + nBad
            _runDetails.holdout_error = float(nBad) / nClass
            _runDetails.holdout_correct_conf = avgGood
            _runDetails.holdout_incorrect_conf = avgBad
            _runDetails.holdout_result_matrix = repr(voteTab)
            # NOTE(review): same sign question as for the overall statistics
            nRej = nClass - nPts
            if nRej > 0:
                _runDetails.holdout_fraction_dropped = float(nRej) / nPts

    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
        details.Store(db=details.dbName, table=details.persistTblName)

    if details.badName != '':
        with open(details.badName, 'w+') as badFile:
            # index loop on purpose: ``wrong`` is only bound when the screening
            # branch above ran, and badExamples is empty otherwise
            for i in range(len(badExamples)):
                ex = badExamples[i]
                vote = wrong[i]
                outStr = '%s\t%s\n' % (ex, vote)
                badFile.write(outStr)

    composite.ClearModelExamples()
    return composite
def ErrorStats(conn, where, enrich=1):
    """ collects error statistics over the runs selected from a results table

    **Arguments**

      - conn: a connection object providing GetData(fields=..., where=...)

      - where: the selection clause handed to _conn.GetData()_

      - enrich: (optional) target passed to _ScreenComposite.CalcEnrichment()_;
        enrichment statistics are skipped when this is negative.

    **Returns**

      None if the query fails or returns no rows; otherwise a dictionary with
      the keys 'oAvg', 'oDev', 'oCorrectConf', 'oIncorrectConf', 'oResultMat',
      'oBestIdx' and 'oBestErr' (plus 'oAvgEnrich'/'oDevEnrich' when enrich is
      non-negative).  If the last row has a hold-out error, the matching
      'h...' keys are included as well.  Averages, deviations, confidences and
      best errors are scaled by 100.

    **Notes**

      - a failing query prints its traceback and yields None (best effort)
      - the result matrices are stored as text and rebuilt with eval(); only
        use this against a trusted database.

    """
    fields = ('overall_error,holdout_error,overall_result_matrix,' +
              'holdout_result_matrix,overall_correct_conf,overall_incorrect_conf,' +
              'holdout_correct_conf,holdout_incorrect_conf')
    try:
        data = conn.GetData(fields=fields, where=where)
    except Exception:
        import traceback
        traceback.print_exc()
        return None
    nPts = len(data)
    if not nPts:
        sys.stderr.write('no runs found\n')
        return None

    # the builtin float replaces the numpy.float alias, which newer NumPy
    # releases no longer provide
    overall = numpy.zeros(nPts, float)
    overallEnrich = numpy.zeros(nPts, float)
    oCorConf = 0.0
    oInCorConf = 0.0
    holdout = numpy.zeros(nPts, float)
    holdoutEnrich = numpy.zeros(nPts, float)
    hCorConf = 0.0
    hInCorConf = 0.0
    overallMatrix = None
    holdoutMatrix = None
    for i in range(nPts):
        row = data[i]
        if row[0] is not None:
            overall[i] = row[0]
            oCorConf += row[4]
            oInCorConf += row[5]
        if row[1] is not None:
            holdout[i] = row[1]
            haveHoldout = 1
        else:
            haveHoldout = 0

        # SECURITY: eval() executes whatever text the table holds
        tmpOverall = 1. * eval(row[2])
        if enrich >= 0:
            overallEnrich[i] = ScreenComposite.CalcEnrichment(tmpOverall, tgt=enrich)
        if overallMatrix is None:
            overallMatrix = tmpOverall
        else:
            overallMatrix += tmpOverall

        if haveHoldout:
            tmpHoldout = 1. * eval(row[3])
            if enrich >= 0:
                holdoutEnrich[i] = ScreenComposite.CalcEnrichment(tmpHoldout, tgt=enrich)
            # start the sum at the first row that has hold-out data, which is
            # not necessarily the first row
            if holdoutMatrix is None:
                holdoutMatrix = tmpHoldout
            else:
                holdoutMatrix += tmpHoldout
            hCorConf += row[6]
            hInCorConf += row[7]

    avgOverall = sum(overall) / nPts
    oCorConf /= nPts
    oInCorConf /= nPts
    overallMatrix /= nPts
    oSort = numpy.argsort(overall)
    oMin = overall[oSort[0]]
    overall -= avgOverall
    devOverall = numpy.sqrt(sum(overall**2) / (nPts - 1))
    res = {}
    res['oAvg'] = 100 * avgOverall
    res['oDev'] = 100 * devOverall
    res['oCorrectConf'] = 100 * oCorConf
    res['oIncorrectConf'] = 100 * oInCorConf
    res['oResultMat'] = overallMatrix
    res['oBestIdx'] = oSort[0]
    res['oBestErr'] = 100 * oMin

    if enrich >= 0:
        mean, dev = Stats.MeanAndDev(overallEnrich)
        res['oAvgEnrich'] = mean
        res['oDevEnrich'] = dev

    # haveHoldout reflects the last row processed
    if haveHoldout:
        avgHoldout = sum(holdout) / nPts
        hCorConf /= nPts
        hInCorConf /= nPts
        holdoutMatrix /= nPts
        hSort = numpy.argsort(holdout)
        hMin = holdout[hSort[0]]
        holdout -= avgHoldout
        devHoldout = numpy.sqrt(sum(holdout**2) / (nPts - 1))
        res['hAvg'] = 100 * avgHoldout
        res['hDev'] = 100 * devHoldout
        res['hCorrectConf'] = 100 * hCorConf
        res['hIncorrectConf'] = 100 * hInCorConf
        res['hResultMat'] = holdoutMatrix
        res['hBestIdx'] = hSort[0]
        res['hBestErr'] = 100 * hMin
        if enrich >= 0:
            mean, dev = Stats.MeanAndDev(holdoutEnrich)
            res['hAvgEnrich'] = mean
            res['hDevEnrich'] = dev
    return res
def GrowIt(details, composite, progressCallback=None,
           saveIt=1, setDescNames=0, data=None):
    """ does the actual work of growing a composite model

    **Arguments**

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - composite: the composite model to grow

      - progressCallback: (optional) handed to _composite.Grow()_ when trees
        are being built (_details.useTrees_); not used for the neural-net
        path.

      - saveIt: (optional) accepted for interface compatibility; the code in
        this function does not read it.

      - setDescNames: (optional) if nonzero and trees are being built, the
        composite's _SetInputOrder()_ method is called with the results of
        the data set's _GetVarNames()_ method before growing.

      - data: (optional) the data set to be used.  If this is not provided,
        the data set described in details will be loaded: from the file
        named by _details.tableName_ when _details.dbName_ is empty, via
        _details.GetDataSet()_ when _details.qBounds_ is non-empty, and via
        _DataUtils.DBToQuantData()_ otherwise.

    **Returns**

      the enlarged composite model

    **Notes**

      - _details.rundate_ is always updated, and _details.outName_ /
        _details.tableName_ may be filled in while loading the data.

      - the overall error results are stored on the module-level
        _\\_runDetails_ object, not on _details_.

    """
    details.rundate = time.asctime()

    if data is None:
        fName = details.tableName.strip()
        if details.outName == '':
            details.outName = fName + '.pkl'
        if details.dbName == '':
            data = DataUtils.BuildQuantDataSet(fName)
        elif details.qBounds != []:
            details.tableName = fName
            data = details.GetDataSet()
        else:
            data = DataUtils.DBToQuantData(details.dbName, fName,
                                           quantName=details.qTableName,
                                           user=details.dbUser,
                                           password=details.dbPassword)

    # reuse the composite's own seed for the random number generators
    seed = composite._randomSeed
    DataUtils.InitRandomNumbers(seed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    # every example is used for training; there is no holdout split here
    trainExamples = namedExamples
    nExamples = len(trainExamples)
    message('Training with %d examples'%(nExamples))
    message('\t%d descriptors'%(len(trainExamples[0])-2))
    nVars = data.GetNVars()
    nPossibleVals = composite.nPossibleVals
    # Materialize the attribute indices: under Python 3 range() is an
    # immutable sequence, not a list that the model builders can copy and
    # modify.
    attrs = list(range(1, nVars+1))

    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        if setDescNames:
            composite.SetInputOrder(data.GetVarNames())
        composite.Grow(trainExamples, attrs, [0]+nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels, pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy, needsQuantization=0,
                       treeBuilder=builder, nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       silent=not _verbose)
    else:
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples, attrs, [0]+nPossibleVals, nTries=details.nModels,
                       buildDriver=driver, needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics
    weightedErrs = counts*avgErrs
    averageErr = sum(weightedErrs)/sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    # sqrt of the square: the magnitude of each count-weighted deviation
    devs = numpy.sqrt(devs*devs)
    avgDev = sum(devs)/sum(counts)
    if _verbose:
        message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f'%(100.*averageErr,100.*avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    badExamples = []
    if not details.detailedRes:
        if _verbose:
            message('Testing all examples')
        wrong = BuildComposite.testall(composite, namedExamples, badExamples)
        if _verbose:
            message('%d examples (%% %5.2f) were misclassified'%(len(wrong),100.*float(len(wrong))/float(len(namedExamples))))
        _runDetails.overall_error = float(len(wrong))/len(namedExamples)

    if details.detailedRes:
        if _verbose:
            message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                                 nPossibleVals[-1], details.threshold)
        nGood, nBad, _nSkip, avgGood, avgBad, _avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood+nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # Rejected examples are the ones that were not classified.  The
        # operands used to be reversed, which can never be positive when
        # nClass counts a subset of nPts, so the dropped fraction was never
        # recorded.
        # NOTE(review): assumes nGood+nBad excludes the skipped examples
        # reported separately by ShowVoteResults -- confirm.
        nRej = nPts-nClass
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej)/nPts

    return composite