def testTreeGrow(self): " testing tree-based composite " self.refCompos = cPickle.load( open( RDConfig.RDCodeDir + '/ML/Composite/test_data/composite_base.pkl', 'rb')) composite = Composite.Composite() composite._varNames = self.varNames composite.SetQuantBounds(self.qBounds, self.nPoss) from rdkit.ML.DecTree import CrossValidate driver = CrossValidate.CrossValidationDriver pruner = None composite.Grow(self.examples, self.attrs, [], buildDriver=driver, pruner=pruner, nTries=100, silent=1) composite.AverageErrors() composite.SortModels() self.treeComposite = composite assert len(composite) == len( self.refCompos), 'length mismatch %d!=%d' % (len(composite), len(self.refCompos)) for i in xrange(len(composite)): t1, c1, e1 = composite[i] t2, c2, e2 = self.refCompos[i] assert e1 == e2, 'error mismatch'
def testTreeGrow(self): " testing tree-based composite " with open( RDConfig.RDCodeDir + '/ML/Composite/test_data/composite_base.pkl', 'rb') as pklF: self.refCompos = cPickle.load(pklF) composite = Composite.Composite() composite._varNames = self.varNames composite.SetQuantBounds(self.qBounds, self.nPoss) from rdkit.ML.DecTree import CrossValidate driver = CrossValidate.CrossValidationDriver pruner = None composite.Grow(self.examples, self.attrs, [], buildDriver=driver, pruner=pruner, nTries=100, silent=1) composite.AverageErrors() composite.SortModels() #with open(RDConfig.RDCodeDir+'/ML/Composite/test_data/composite_base.pkl','wb') as pklF: # cPickle.dump(composite,pklF) self.treeComposite = composite self.assertEqual(len(composite), len(self.refCompos)) for i in xrange(len(composite)): t1, c1, e1 = composite[i] t2, c2, e2 = self.refCompos[i] self.assertEqual(e1, e2)
def testQuantize(self):
  # testing data quantization
  qBounds = [[], [1, 2, 3]]
  examples = [['foo', 0], ['foo', 1.5], ['foo', 5.5], ['foo', 2.5]]
  answers = [['foo', 0], ['foo', 1], ['foo', 3], ['foo', 2]]
  nPoss = [0, 4]
  composite = Composite.Composite()
  composite.SetQuantBounds(qBounds, nPoss)
  for i in range(len(examples)):
    qEx = composite.QuantizeExample(examples[i])
    self.assertEqual(qEx, answers[i])

def testQuantize(self): " testing data quantization " qBounds = [[], [1, 2, 3]] examples = [['foo', 0], ['foo', 1.5], ['foo', 5.5], ['foo', 2.5]] answers = [['foo', 0], ['foo', 1], ['foo', 3], ['foo', 2]] nPoss = [0, 4] composite = Composite.Composite() composite.SetQuantBounds(qBounds, nPoss) for i in xrange(len(examples)): qEx = composite.QuantizeExample(examples[i]) assert qEx == answers[i], 'quantization of %s failed' % (str( examples[i]))
def test_exceptions(self):
  compos = Composite.Composite()
  compos.SetQuantBounds([(0.5, ), (0.5, ), (0.5, ), []], [2, 2, 2, 2])
  compos.SetDescriptorNames(('ID', 'D0', 'D1', 'D2', 'Act'))
  compos.SetInputOrder(('ID', 'D2', 'D1', 'D0', 'Act'))
  self.assertEqual(compos._mapOrder, [0, 3, 2, 1, 4])
  # Probes caught exception for ID
  compos.SetInputOrder(('D2', 'D1', 'D0', 'Act'))
  self.assertEqual(compos._mapOrder, [0, 2, 1, 0, 3])
  # Probes caught exception for Act
  compos.SetInputOrder(('ID', 'D2', 'D1', 'D0'))
  self.assertEqual(compos._mapOrder, [0, 3, 2, 1, -1])
  self.assertRaises(ValueError, compos.SetInputOrder, ('Unknown', 'D0'))

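# Note on the assertions in test_exceptions above: _mapOrder[i] holds the column index in
# incoming examples where the composite's i-th descriptor ('ID', 'D0', 'D1', 'D2', 'Act')
# will be found. As the test shows, a missing 'ID' column falls back to column 0, a missing
# activity column is flagged with -1, and any other descriptor that cannot be located in the
# input column names makes SetInputOrder raise a ValueError.
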
def testTreeGrow(self):
  # testing tree-based composite
  with open(RDConfig.RDCodeDir + '/ML/Composite/test_data/composite_base.pkl', 'r') as pklTF:
    buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
    pklTF.close()
  with io.BytesIO(buf) as pklF:
    self.refCompos = cPickle.load(pklF)

  composite = Composite.Composite()
  composite._varNames = self.varNames
  composite.SetQuantBounds(self.qBounds, self.nPoss)
  from rdkit.ML.DecTree import CrossValidate
  driver = CrossValidate.CrossValidationDriver
  pruner = None
  composite.Grow(self.examples, self.attrs, [], buildDriver=driver, pruner=pruner, nTries=100,
                 silent=1)
  composite.AverageErrors()
  composite.SortModels(sortOnError=False)
  self.assertEqual(composite.countList, sorted(composite.countList))
  self.assertNotEqual(composite.errList, sorted(composite.errList))
  composite.SortModels()
  self.assertNotEqual(composite.countList, sorted(composite.countList))
  self.assertEqual(composite.errList, sorted(composite.errList))
  # with open(RDConfig.RDCodeDir+'/ML/Composite/test_data/composite_base.pkl','wb') as pklF:
  #   cPickle.dump(composite,pklF)
  self.treeComposite = composite

  self.assertEqual(len(composite), len(self.refCompos))
  for i in range(len(composite)):
    t1, c1, e1 = composite[i]
    t2, c2, e2 = self.refCompos[i]
    self.assertEqual(e1, e2)
    # we used to check for equality here, but since there are redundant errors,
    # that's non-trivial.
    # assert t1 == t2, 'tree mismatch'
    # assert c1 == c2, 'count mismatch'

  s = str(composite)
  self.assertIn('Composite', s)
  self.assertIn('Model', s)
  self.assertIn('error', s)

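# A minimal usage sketch (illustrative only, not part of the tests): once a composite has been
# grown, averaged, and sorted as above, individual models can be inspected by index and new
# examples classified by majority vote. `qBounds`, `nPoss`, `examples`, `attrs`, and `example`
# are placeholders; the calls mirror those exercised in the tests in this file.
#
#   composite = Composite.Composite()
#   composite.SetQuantBounds(qBounds, nPoss)
#   composite.Grow(examples, attrs, [], buildDriver=CrossValidate.CrossValidationDriver,
#                  nTries=10, silent=1)
#   composite.AverageErrors()
#   composite.SortModels()
#   tree, count, err = composite[0]                  # best (lowest-error) model after sorting
#   pred, conf = composite.ClassifyExample(example)  # predicted class and vote confidence
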
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
  # Train a composite model on `data` using the options carried by `details`:
  # optionally shuffle/randomize activities, split off hold-out data, grow the
  # requested model type, screen the results, and pickle the final composite.
  nExamples = data.GetNPts()
  if details.lockRandom:
    seed = details.randomSeed
  else:
    import random
    seed = (random.randint(0, 1e6), random.randint(0, 1e6))
  DataUtils.InitRandomNumbers(seed)
  testExamples = []
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

  namedExamples = data.GetNamedData()
  if details.splitRun == 1:
    trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                               silent=not _verbose)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
  else:
    testExamples = []
    testIdx = []
    trainIdx = range(len(namedExamples))
    trainExamples = namedExamples

  if details.filterFrac != 0.0:
    # if we're doing quantization on the fly, we need to handle that here:
    if hasattr(details, 'activityBounds') and details.activityBounds:
      tExamples = []
      bounds = details.activityBounds
      for pt in trainExamples:
        pt = pt[:]
        act = pt[-1]
        placed = 0
        bound = 0
        while not placed and bound < len(bounds):
          if act < bounds[bound]:
            pt[-1] = bound
            placed = 1
          else:
            bound += 1
        if not placed:
          pt[-1] = bound
        tExamples.append(pt)
    else:
      bounds = None
      tExamples = trainExamples
    trainIdx, temp = DataUtils.FilterData(tExamples, details.filterVal, details.filterFrac, -1,
                                          indicesOnly=1)
    tmp = [trainExamples[x] for x in trainIdx]
    testExamples += [trainExamples[x] for x in temp]
    trainExamples = tmp

    counts = DataUtils.CountResults(trainExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in training set:')
    for k in ks:
      message(str((k, counts[k])))
    counts = DataUtils.CountResults(testExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in test set:')
    for k in ks:
      message(str((k, counts[k])))

  nExamples = len(trainExamples)
  message('Training with %d examples' % (nExamples))

  nVars = data.GetNVars()
  attrs = list(range(1, nVars + 1))
  nPossibleVals = data.GetNPossibleVals()
  for i in range(1, len(nPossibleVals)):
    if nPossibleVals[i - 1] == -1:
      attrs.remove(i)

  if details.pickleDataFileName != '':
    pickleDataFile = open(details.pickleDataFileName, 'wb+')
    cPickle.dump(trainExamples, pickleDataFile)
    cPickle.dump(testExamples, pickleDataFile)
    pickleDataFile.close()

  if details.bayesModel:
    composite = BayesComposite.BayesComposite()
  else:
    composite = Composite.Composite()

  composite._randomSeed = seed
  composite._splitFrac = details.splitFrac
  composite._shuffleActivities = details.shuffleActivities
  composite._randomizeActivities = details.randomActivities

  if hasattr(details, 'filterFrac'):
    composite._filterFrac = details.filterFrac
  if hasattr(details, 'filterVal'):
    composite._filterVal = details.filterVal

  composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

  composite.SetActivityQuantBounds(details.activityBounds)
  nPossibleVals = data.GetNPossibleVals()
  if details.activityBounds:
    nPossibleVals[-1] = len(details.activityBounds) + 1

  if setDescNames:
    composite.SetInputOrder(data.GetVarNames())
    composite.SetDescriptorNames(details._descNames)
  else:
    composite.SetDescriptorNames(data.GetVarNames())
  composite.SetActivityQuantBounds(details.activityBounds)

  if details.nModels == 1:
    details.internalHoldoutFrac = 0.0

  if details.useTrees:
    from rdkit.ML.DecTree import CrossValidate, PruneTree
    if details.qBounds != []:
      from rdkit.ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from rdkit.ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    composite.SetQuantBounds(details.qBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   pruner=pruner, nTries=details.nModels, pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy, needsQuantization=0, treeBuilder=builder,
                   nQuantBounds=details.qBounds, startAt=details.startAt,
                   maxDepth=details.limitDepth, progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars, randomDescriptors=details.randomDescriptors,
                   silent=not _verbose)

  elif details.useSigTrees:
    from rdkit.ML.DecTree import CrossValidate
    from rdkit.ML.DecTree import BuildSigTree
    builder = BuildSigTree.SigTreeBuilder
    driver = CrossValidate.CrossValidationDriver

    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    if hasattr(details, 'sigTreeBiasList'):
      biasList = details.sigTreeBiasList
    else:
      biasList = None
    if hasattr(details, 'useCMIM'):
      useCMIM = details.useCMIM
    else:
      useCMIM = 0
    if hasattr(details, 'allowCollections'):
      allowCollections = details.allowCollections
    else:
      allowCollections = False
    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   nTries=details.nModels, needsQuantization=0, treeBuilder=builder,
                   maxDepth=details.limitDepth, progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars, randomDescriptors=details.randomDescriptors,
                   biasList=biasList, useCMIM=useCMIM, allowCollection=allowCollections,
                   silent=not _verbose)

  elif details.useKNN:
    from rdkit.ML.KNN import CrossValidate
    from rdkit.ML.KNN import DistFunctions

    driver = CrossValidate.CrossValidationDriver
    dfunc = ''
    if (details.knnDistFunc == "Euclidean"):
      dfunc = DistFunctions.EuclideanDist
    elif (details.knnDistFunc == "Tanimoto"):
      dfunc = DistFunctions.TanimotoDist
    else:
      assert 0, "Bad KNN distance metric value"

    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   nTries=details.nModels, needsQuantization=0, numNeigh=details.knnNeighs,
                   holdOutFrac=details.internalHoldoutFrac, distFunc=dfunc)

  elif details.useNaiveBayes or details.useSigBayes:
    from rdkit.ML.NaiveBayes import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection,
                     mEstimateVal=details.mEstimateVal, silent=not _verbose)
    else:
      if hasattr(details, 'useCMIM'):
        useCMIM = details.useCMIM
      else:
        useCMIM = 0

      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     mEstimateVal=details.mEstimateVal, useSigs=True, useCMIM=useCMIM,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection, silent=not _verbose)

  ## elif details.useSVM:
  ##   from rdkit.ML.SVM import CrossValidate
  ##   driver = CrossValidate.CrossValidationDriver
  ##   composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
  ##                  buildDriver=driver, nTries=details.nModels,
  ##                  needsQuantization=0,
  ##                  cost=details.svmCost, gamma=details.svmGamma,
  ##                  weights=details.svmWeights, degree=details.svmDegree,
  ##                  type=details.svmType, kernelType=details.svmKernel,
  ##                  coef0=details.svmCoeff, eps=details.svmEps, nu=details.svmNu,
  ##                  cache_size=details.svmCache, shrinking=details.svmShrink,
  ##                  dataType=details.svmDataType,
  ##                  holdOutFrac=details.internalHoldoutFrac,
  ##                  replacementSelection=details.replacementSelection,
  ##                  silent=not _verbose)

  else:
    from rdkit.ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver

    composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                   buildDriver=driver, needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList, counts, avgErrs = composite.GetAllData()
  counts = numpy.array(counts)
  avgErrs = numpy.array(avgErrs)
  composite._varNames = data.GetVarNames()

  for i in range(len(modelList)):
    modelList[i].NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts * avgErrs
  averageErr = sum(weightedErrs) / sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = numpy.sqrt(devs * devs)
  avgDev = sum(devs) / sum(counts)
  message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
          (100. * averageErr, 100. * avgDev))

  if details.bayesModel:
    composite.Train(trainExamples, verbose=0)

  # blow out the saved examples and then save the composite:
  composite.ClearModelExamples()
  if saveIt:
    composite.Pickle(details.outName)
  details.model = DbModule.binaryHolder(cPickle.dumps(composite))

  badExamples = []
  if not details.detailedRes and (not hasattr(details, 'noScreen') or not details.noScreen):
    if details.splitRun:
      message('Testing all hold-out examples')
      wrong = testall(composite, testExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' %
              (len(wrong), 100. * float(len(wrong)) / float(len(testExamples))))
      _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
    else:
      message('Testing all examples')
      wrong = testall(composite, namedExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' %
              (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
      _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

  if details.detailedRes:
    message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                             nPossibleVals[-1], details.threshold)
    nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood + nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    nRej = nClass - nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej) / nPts
    if details.splitRun:
      message('\nHold-out data:')
      resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)), testExamples, composite,
                                               nPossibleVals[-1], details.threshold)
      nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
      nPts = len(testExamples)
      nClass = nGood + nBad
      _runDetails.holdout_error = float(nBad) / nClass
      _runDetails.holdout_correct_conf = avgGood
      _runDetails.holdout_incorrect_conf = avgBad
      _runDetails.holdout_result_matrix = repr(voteTab)
      nRej = nClass - nPts
      if nRej > 0:
        _runDetails.holdout_fraction_dropped = float(nRej) / nPts

  if details.persistTblName and details.dbName:
    message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
    details.Store(db=details.dbName, table=details.persistTblName)

  if details.badName != '':
    badFile = open(details.badName, 'w+')
    for i in range(len(badExamples)):
      ex = badExamples[i]
      vote = wrong[i]
      outStr = '%s\t%s\n' % (ex, vote)
      badFile.write(outStr)
    badFile.close()

  composite.ClearModelExamples()
  return composite

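# A rough usage sketch for RunOnData (illustrative only, with assumptions noted): `details` is
# expected to be a run-details object of the kind this module builds from its command-line
# options, providing the attributes referenced above (useTrees, nModels, splitRun, splitFrac,
# qBounds, outName, ...), and `data` an ML dataset exposing GetNPts(), GetVarNames(),
# GetNamedData(), and GetNPossibleVals(). Both names are placeholders, not a prescribed
# construction recipe.
#
#   details.useTrees = 1      # grow decision trees
#   details.nModels = 10      # number of models in the composite
#   details.splitRun = 1      # hold out a fraction of the data
#   details.splitFrac = 0.7   # 70% training / 30% hold-out
#   composite = RunOnData(details, data, saveIt=0)
#   print(len(composite), 'models grown')
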
def testErrorEstimate(self): " testing out-of-bag error estimates " compos = Composite.Composite() compos.SetQuantBounds([(0.5, ), (0.5, ), (0.5, ), []], [2, 2, 2, 2]) compos.SetDescriptorNames(('D0', 'D1', 'D2', 'Act')) compos.SetInputOrder(('ID', 'D0', 'D1', 'D2', 'Act')) data = [['A', 0, 0, 0, 0], ['B', 1, 0, 0, 1], ['C', 0, 1, 0, 0], ['D', 1, 1, 1, 1]] # # Build and validate three simple trees: # t1 = Node(None, 'D0', 0) n = Node(t1, 'D1', 1) t1.AddChildNode(n) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) n = Node(t1, 'D2', 2) t1.AddChildNode(n) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) assert t1.ClassifyExample(data[0][1:]) == 0 assert t1.ClassifyExample(data[1][1:]) == 1 assert t1.ClassifyExample(data[2][1:]) == 1 assert t1.ClassifyExample(data[3][1:]) == 0 t1._trainIndices = (0, 1) compos.AddModel(t1, .5) t2 = Node(None, 'D1', 1) n = Node(t2, 'D0', 0) t2.AddChildNode(n) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) n = Node(t2, 'D2', 2) t2.AddChildNode(n) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) assert t2.ClassifyExample(data[0][1:]) == 0 assert t2.ClassifyExample(data[1][1:]) == 1 assert t2.ClassifyExample(data[2][1:]) == 0 assert t2.ClassifyExample(data[3][1:]) == 1 t2._trainIndices = (1, 2) compos.AddModel(t2, 0.0) t3 = Node(None, 'D0', 0) n = Node(t3, 'D2', 2) t3.AddChildNode(n) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) n = Node(t3, 'D1', 1) t3.AddChildNode(n) n.AddChildNode(Node(n, '0', 0, isTerminal=1)) n.AddChildNode(Node(n, '1', 1, isTerminal=1)) assert t3.ClassifyExample(data[0][1:]) == 0 assert t3.ClassifyExample(data[1][1:]) == 0 assert t3.ClassifyExample(data[2][1:]) == 0 assert t3.ClassifyExample(data[3][1:]) == 1 t3._trainIndices = (2, 3) compos.AddModel(t3, 0.25) # # validate the composite itself: # pred, conf = compos.ClassifyExample(data[0]) assert pred == 0 assert conf == 1.0 pred, conf = compos.ClassifyExample(data[1]) assert pred == 1 assert conf == 2. / 3. pred, conf = compos.ClassifyExample(data[2]) assert pred == 0 assert conf == 2. / 3. pred, conf = compos.ClassifyExample(data[3]) assert pred == 1 assert conf == 2. / 3. # # now test the out-of-bag calculation: # pred, conf = compos.ClassifyExample(data[0], onlyModels=(1, 2)) assert pred == 0 assert conf == 1.0 pred, conf = compos.ClassifyExample(data[1], onlyModels=(2, )) assert pred == 0 assert conf == 1.0 pred, conf = compos.ClassifyExample(data[2], onlyModels=(0, )) assert pred == 1 assert conf == 1.0 pred, conf = compos.ClassifyExample(data[3], onlyModels=(0, 1)) assert pred == 0 assert conf == 0.5