def test5(self): """ indicesToUse """ probes = [ (.5, 4, 2), (.7, 3, 3), (.75, 3, 3), (.333, 6, 0), (.25, 4, 2), ] nPts = len(self.d1) for frac, nKeep, nRej in probes: DataUtils.InitRandomNumbers((23, 42)) k, r = DataUtils.FilterData(self.d1, 1, frac, indicesToUse=range(nPts)) assert len(k) == nKeep, 'bad nKeep (%d != %d)' % (len(k), nKeep) assert len(r) == nRej, 'bad nRej (%d != %d)' % (len(r), nRej) keep, rej = k, r # make sure the examples are actually correct DataUtils.InitRandomNumbers((23, 42)) tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac) assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep), str(tgtKeep)) assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej), str(tgtRej))
def test4_indicesOnly_indicesToUse(self):
  # """ indicesOnly with indicesToUse """
  probes = [
    (.5, 4, 2),
    (.7, 3, 3),
    (.75, 3, 3),
    (.333, 6, 0),
    (.25, 4, 2),
  ]
  nPts = len(self.d1)
  for frac, nKeep, nRej in probes:
    DataUtils.InitRandomNumbers((23, 42))
    k, r = DataUtils.FilterData(self.d1, 1, frac, indicesToUse=range(nPts), indicesOnly=1)
    assert len(k) == nKeep, 'bad nKeep (%d != %d)' % (len(k), nKeep)
    assert len(r) == nRej, 'bad nRej (%d != %d)' % (len(r), nRej)
    # make sure the indices are actually correct
    keep = [self.d1[x] for x in k]
    rej = [self.d1[x] for x in r]
    DataUtils.InitRandomNumbers((23, 42))
    tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac)
    assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep), str(tgtKeep))
    assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej), str(tgtRej))
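# Illustrative sketch, not part of the original test suite: FilterData downsamples a
# data set so that points whose last column equals the target value make up (roughly)
# the requested fraction of what is kept.  The toy data set and variable names below
# are assumptions for illustration only.
def _exampleFilterDataUsage(self):
  from rdkit.ML.Data import DataUtils
  data = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 1], [5, 1]]  # last column: class label
  DataUtils.InitRandomNumbers((23, 42))
  keep, rej = DataUtils.FilterData(data, 1, 0.5, -1)
  # the kept and rejected points always partition the input
  assert len(keep) + len(rej) == len(data)
  # with two 1s among six points, roughly half of the kept points should now be 1s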
def testPerm1(self): """ tests the descriptor remapping stuff in a packager """ from rdkit.Chem import Descriptors with open(os.path.join(self.dataDir,'Jan9_build3_pkg.pkl'),'r') as pkgTF: buf = pkgTF.read().replace('\r\n', '\n').encode('utf-8') pkgTF.close() with io.BytesIO(buf) as pkgF: pkg = cPickle.load(pkgF) calc = pkg.GetCalculator() names = calc.GetDescriptorNames() ref = {} DataUtils.InitRandomNumbers((23,42)) for smi,pred,conf in self.testD: for desc in names: fn = getattr(Descriptors,desc,lambda x:777) m = Chem.MolFromSmiles(smi) ref[desc] = fn(m) for i in range(5): perm = list(names) random.shuffle(perm,random=random.random) m = Chem.MolFromSmiles(smi) for desc in perm: fn = getattr(Descriptors,desc,lambda x:777) val = fn(m) assert feq(val,ref[desc],1e-4),'%s: %s(%s): %f!=%f'%(str(perm), smi, desc, val, ref[desc])
def testPerm1(self): """ tests the descriptor remapping stuff in a packager """ from rdkit.Chem import Descriptors pkg = cPickle.load( open(os.path.join(self.dataDir, 'Jan9_build3_pkg.pkl'), 'rb')) calc = pkg.GetCalculator() names = calc.GetDescriptorNames() ref = {} DataUtils.InitRandomNumbers((23, 42)) for smi, pred, conf in self.testD: for desc in names: fn = getattr(Descriptors, desc, lambda x: 777) m = Chem.MolFromSmiles(smi) ref[desc] = fn(m) for i in range(5): perm = list(names) random.shuffle(perm) m = Chem.MolFromSmiles(smi) for desc in perm: fn = getattr(Descriptors, desc, lambda x: 777) val = fn(m) assert feq( val, ref[desc], 1e-4), '%s: %s(%s): %f!=%f' % (str(perm), smi, desc, val, ref[desc])
def setUp(self):
  # here is what we are going to do to test this out
  # - generate bit vectors of length nbits
  # - turn on a fraction of the first nbits/2 bits at random
  # - for each bit i turned on in the range (0, nbits/2) turn on the bit
  #   nbits/2 + i
  # - basically the first half of a fingerprint is the same as the second half
  #   of the fingerprint
  # - if we repeat this process often enough we should see strong correlation
  #   between the bits i (i < nbits/2) and (nbits/2 + i)
  DataUtils.InitRandomNumbers((100, 23))
  self.nbits = 200
  self.d = 40
  self.nfp = 1000
  self.blist = list(range(self.nbits))
  self.fps = []
  for fi in range(self.nfp):
    fp = DataStructs.ExplicitBitVect(self.nbits)
    obits = list(range(self.nbits // 2))
    random.shuffle(obits)
    obits = obits[0:self.d]
    for bit in obits:
      fp.SetBit(bit)
      fp.SetBit(bit + self.nbits // 2)
    self.fps.append(fp)
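# Hypothetical helper, not part of the original test class: a minimal sketch that
# checks the symmetry constructed in setUp above -- bit i and bit i + nbits/2 are
# always set (or unset) together, which is what produces the expected correlation.
def _checkMirroredBits(self):
  half = self.nbits // 2
  for fp in self.fps:
    for bit in range(half):
      assert fp.GetBit(bit) == fp.GetBit(bit + half)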
def testPerm1(self): # """ tests the descriptor remapping stuff in a packager """ pkg = self._loadPackage() calc = pkg.GetCalculator() names = calc.GetDescriptorNames() ref = {} DataUtils.InitRandomNumbers((23, 42)) for smi, _, _ in self.testD: for desc in names: fn = getattr(Descriptors, desc, lambda x: 777) m = Chem.MolFromSmiles(smi) ref[desc] = fn(m) for _ in range(5): perm = list(names) random.shuffle(perm, random=random.random) m = Chem.MolFromSmiles(smi) for desc in perm: fn = getattr(Descriptors, desc, lambda x: 777) val = fn(m) assert feq( val, ref[desc], 1e-4), '%s: %s(%s): %f!=%f' % (str(perm), smi, desc, val, ref[desc])
def testPerm2(self): # """ tests the descriptor remapping stuff in a packager """ pkg = self._loadPackage() calc = pkg.GetCalculator() names = calc.GetDescriptorNames() DataUtils.InitRandomNumbers((23, 42)) perm = list(names) random.shuffle(perm, random=random.random) calc.simpleList = perm calc.descriptorNames = perm pkg.Init() self._verify(pkg, self.testD)
def testPerm2(self): """ tests the descriptor remapping stuff in a packager """ pkg = cPickle.load( open(os.path.join(self.dataDir, 'Jan9_build3_pkg.pkl'), 'rb')) calc = pkg.GetCalculator() names = calc.GetDescriptorNames() DataUtils.InitRandomNumbers((23, 42)) perm = list(names) random.shuffle(perm) calc.simpleList = perm calc.descriptorNames = perm pkg.Init() self._verify(pkg, self.testD)
def test_SplitData(self):
  self.assertRaises(ValueError, SplitData.SplitDataSet, None, -1.1)
  self.assertRaises(ValueError, SplitData.SplitDataSet, None, 1.1)

  data = list(range(10))
  DataUtils.InitRandomNumbers((23, 42))
  f = StringIO()
  with redirect_stdout(f):
    result = SplitData.SplitDataSet(data, 0.5)
  self.assertEqual(set(result[0]).intersection(result[1]), set())
  self.assertEqual(len(result[0]), 5)
  s = f.getvalue()
  self.assertIn('Training', s)
  self.assertIn('hold-out', s)
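# Illustrative sketch, not part of the original suite: outside a test, SplitDataSet is
# simply called for its two disjoint return lists (it also prints a short summary of
# the training / hold-out split, which the test above captures).  The data set and
# split fraction here are assumptions for illustration only.
def _exampleSplitUsage(self):
  from rdkit.ML.Data import DataUtils, SplitData
  DataUtils.InitRandomNumbers((23, 42))  # make the split reproducible
  train, holdout = SplitData.SplitDataSet(list(range(10)), 0.5)
  assert len(train) == 5
  assert not set(train).intersection(holdout)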
def setUp(self):
  # print '\n%s: '%self.shortDescription(),
  with open(RDConfig.RDCodeDir + '/ML/Composite/test_data/ferro.pkl', 'rb') as pklF:
    self.examples = cPickle.load(pklF)
  self.varNames = [
    'composition', 'max_atomic', 'has3d', 'has4d', 'has5d', 'elconc', 'atvol', 'isferro'
  ]
  self.qBounds = [[], [1.89, 3.53], [], [], [], [0.55, 0.73], [11.81, 14.52], []]
  self.nPoss = [0, 3, 2, 2, 2, 3, 3, 2]
  self.attrs = list(range(1, len(self.varNames) - 1))
  from rdkit.ML.Data import DataUtils
  DataUtils.InitRandomNumbers((23, 43))
def testPerm2(self): """ tests the descriptor remapping stuff in a packager """ with open(os.path.join(self.dataDir,'Jan9_build3_pkg.pkl'),'r') as pkgTF: buf = pkgTF.read().replace('\r\n', '\n').encode('utf-8') pkgTF.close() with io.BytesIO(buf) as pkgF: pkg = cPickle.load(pkgF) calc = pkg.GetCalculator() names = calc.GetDescriptorNames() DataUtils.InitRandomNumbers((23,42)) perm = list(names) random.shuffle(perm,random=random.random) calc.simpleList = perm calc.descriptorNames = perm pkg.Init() self._verify(pkg,self.testD)
def setUp(self):
  with open(RDConfig.RDCodeDir + '/ML/Composite/test_data/ferro.pkl', 'r') as pklTF:
    buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
  with io.BytesIO(buf) as pklF:
    self.examples = cPickle.load(pklF)
  self.varNames = [
    'composition', 'max_atomic', 'has3d', 'has4d', 'has5d', 'elconc', 'atvol', 'isferro'
  ]
  self.qBounds = [[], [1.89, 3.53], [], [], [], [0.55, 0.73], [11.81, 14.52], []]
  self.nPoss = [0, 3, 2, 2, 2, 3, 3, 2]
  self.attrs = list(range(1, len(self.varNames) - 1))
  from rdkit.ML.Data import DataUtils
  DataUtils.InitRandomNumbers((23, 43))
      import traceback
      print('problems with model %s:' % modelName)
      traceback.print_exc()
    else:
      models.append(model)
  nModels = len(models)

  pickVects = {}
  halfwayPts = [1e8] * len(models)
  for whichModel, model in enumerate(models):
    tmpD = dataSet
    # re-seed the RNG with the seed stored on the model so that its original
    # data split can be reproduced
    try:
      seed = model._randomSeed
    except AttributeError:
      pass
    else:
      DataUtils.InitRandomNumbers(seed)

    if details.shuffleActivities:
      DataUtils.RandomizeActivities(tmpD, shuffle=1)

    # reproduce the training / hold-out split (and any filtering) that was used
    # when this model was built
    if hasattr(model, '_splitFrac') and (details.doHoldout or details.doTraining):
      trainIdx, testIdx = SplitData.SplitIndices(tmpD.GetNPts(), model._splitFrac, silent=1)
      if details.filterFrac != 0.0:
        trainFilt, temp = DataUtils.FilterData(tmpD, details.filterVal, details.filterFrac, -1,
                                               indicesToUse=trainIdx, indicesOnly=1)
        testIdx += temp
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
  """ builds a composite model for the data set using the options in details;
    if saveIt is nonzero, the composite is also pickled to details.outName
  """
  nExamples = data.GetNPts()
  if details.lockRandom:
    seed = details.randomSeed
  else:
    import random
    seed = (random.randint(0, 1e6), random.randint(0, 1e6))
  DataUtils.InitRandomNumbers(seed)
  testExamples = []
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

  namedExamples = data.GetNamedData()
  # split the data into training and hold-out sets
  if details.splitRun == 1:
    trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                               silent=not _verbose)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
  else:
    testExamples = []
    testIdx = []
    trainIdx = list(range(len(namedExamples)))
    trainExamples = namedExamples

  if details.filterFrac != 0.0:
    # if we're doing quantization on the fly, we need to handle that here:
    if hasattr(details, 'activityBounds') and details.activityBounds:
      tExamples = []
      bounds = details.activityBounds
      for pt in trainExamples:
        pt = pt[:]
        act = pt[-1]
        placed = 0
        bound = 0
        while not placed and bound < len(bounds):
          if act < bounds[bound]:
            pt[-1] = bound
            placed = 1
          else:
            bound += 1
        if not placed:
          pt[-1] = bound
        tExamples.append(pt)
    else:
      bounds = None
      tExamples = trainExamples
    trainIdx, temp = DataUtils.FilterData(tExamples, details.filterVal, details.filterFrac, -1,
                                          indicesOnly=1)
    tmp = [trainExamples[x] for x in trainIdx]
    testExamples += [trainExamples[x] for x in temp]
    trainExamples = tmp

    counts = DataUtils.CountResults(trainExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in training set:')
    for k in ks:
      message(str((k, counts[k])))
    counts = DataUtils.CountResults(testExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in test set:')
    for k in ks:
      message(str((k, counts[k])))
  nExamples = len(trainExamples)
  message('Training with %d examples' % (nExamples))

  nVars = data.GetNVars()
  attrs = list(range(1, nVars + 1))
  nPossibleVals = data.GetNPossibleVals()
  for i in range(1, len(nPossibleVals)):
    if nPossibleVals[i - 1] == -1:
      attrs.remove(i)

  if details.pickleDataFileName != '':
    pickleDataFile = open(details.pickleDataFileName, 'wb+')
    cPickle.dump(trainExamples, pickleDataFile)
    cPickle.dump(testExamples, pickleDataFile)
    pickleDataFile.close()

  # set up the composite and record the run parameters on it
  if details.bayesModel:
    composite = BayesComposite.BayesComposite()
  else:
    composite = Composite.Composite()
  composite._randomSeed = seed
  composite._splitFrac = details.splitFrac
  composite._shuffleActivities = details.shuffleActivities
  composite._randomizeActivities = details.randomActivities

  if hasattr(details, 'filterFrac'):
    composite._filterFrac = details.filterFrac
  if hasattr(details, 'filterVal'):
    composite._filterVal = details.filterVal

  composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

  composite.SetActivityQuantBounds(details.activityBounds)
  nPossibleVals = data.GetNPossibleVals()
  if details.activityBounds:
    nPossibleVals[-1] = len(details.activityBounds) + 1

  if setDescNames:
    composite.SetInputOrder(data.GetVarNames())
    composite.SetDescriptorNames(details._descNames)
  else:
    composite.SetDescriptorNames(data.GetVarNames())
  composite.SetActivityQuantBounds(details.activityBounds)
  if details.nModels == 1:
    details.internalHoldoutFrac = 0.0

  # grow the individual models using the requested learner
  if details.useTrees:
    from rdkit.ML.DecTree import CrossValidate, PruneTree
    if details.qBounds != []:
      from rdkit.ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from rdkit.ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    composite.SetQuantBounds(details.qBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   pruner=pruner, nTries=details.nModels, pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy, needsQuantization=0, treeBuilder=builder,
                   nQuantBounds=details.qBounds, startAt=details.startAt,
                   maxDepth=details.limitDepth, progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars, randomDescriptors=details.randomDescriptors,
                   silent=not _verbose)
  elif details.useSigTrees:
    from rdkit.ML.DecTree import CrossValidate
    from rdkit.ML.DecTree import BuildSigTree
    builder = BuildSigTree.SigTreeBuilder
    driver = CrossValidate.CrossValidationDriver
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    if hasattr(details, 'sigTreeBiasList'):
      biasList = details.sigTreeBiasList
    else:
      biasList = None
    if hasattr(details, 'useCMIM'):
      useCMIM = details.useCMIM
    else:
      useCMIM = 0
    if hasattr(details, 'allowCollections'):
      allowCollections = details.allowCollections
    else:
      allowCollections = False
    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   nTries=details.nModels, needsQuantization=0, treeBuilder=builder,
                   maxDepth=details.limitDepth, progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars, randomDescriptors=details.randomDescriptors,
                   biasList=biasList, useCMIM=useCMIM, allowCollection=allowCollections,
                   silent=not _verbose)
  elif details.useKNN:
    from rdkit.ML.KNN import CrossValidate
    from rdkit.ML.KNN import DistFunctions

    driver = CrossValidate.CrossValidationDriver
    dfunc = ''
    if details.knnDistFunc == "Euclidean":
      dfunc = DistFunctions.EuclideanDist
    elif details.knnDistFunc == "Tanimoto":
      dfunc = DistFunctions.TanimotoDist
    else:
      assert 0, "Bad KNN distance metric value"

    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   nTries=details.nModels, needsQuantization=0, numNeigh=details.knnNeighs,
                   holdOutFrac=details.internalHoldoutFrac, distFunc=dfunc)
  elif details.useNaiveBayes or details.useSigBayes:
    from rdkit.ML.NaiveBayes import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection,
                     mEstimateVal=details.mEstimateVal, silent=not _verbose)
    else:
      if hasattr(details, 'useCMIM'):
        useCMIM = details.useCMIM
      else:
        useCMIM = 0
      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     mEstimateVal=details.mEstimateVal, useSigs=True, useCMIM=useCMIM,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection, silent=not _verbose)
  ## elif details.useSVM:
  ##   from rdkit.ML.SVM import CrossValidate
  ##   driver = CrossValidate.CrossValidationDriver
  ##   composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
  ##                  buildDriver=driver, nTries=details.nModels,
  ##                  needsQuantization=0,
  ##                  cost=details.svmCost, gamma=details.svmGamma,
  ##                  weights=details.svmWeights, degree=details.svmDegree,
  ##                  type=details.svmType, kernelType=details.svmKernel,
  ##                  coef0=details.svmCoeff, eps=details.svmEps, nu=details.svmNu,
  ##                  cache_size=details.svmCache, shrinking=details.svmShrink,
  ##                  dataType=details.svmDataType,
  ##                  holdOutFrac=details.internalHoldoutFrac,
  ##                  replacementSelection=details.replacementSelection,
  ##                  silent=not _verbose)
  else:
    from rdkit.ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                   buildDriver=driver, needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList, counts, avgErrs = composite.GetAllData()
  counts = numpy.array(counts)
  avgErrs = numpy.array(avgErrs)
  composite._varNames = data.GetVarNames()

  for i in range(len(modelList)):
    modelList[i].NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts * avgErrs
  averageErr = sum(weightedErrs) / sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = numpy.sqrt(devs * devs)
  avgDev = sum(devs) / sum(counts)
  message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
          (100. * averageErr, 100. * avgDev))

  if details.bayesModel:
    composite.Train(trainExamples, verbose=0)

  # blow out the saved examples and then save the composite:
  composite.ClearModelExamples()
  if saveIt:
    composite.Pickle(details.outName)
  details.model = DbModule.binaryHolder(cPickle.dumps(composite))

  badExamples = []
  if not details.detailedRes and (not hasattr(details, 'noScreen') or not details.noScreen):
    if details.splitRun:
      message('Testing all hold-out examples')
      wrong = testall(composite, testExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' %
              (len(wrong), 100. * float(len(wrong)) / float(len(testExamples))))
      _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
    else:
      message('Testing all examples')
      wrong = testall(composite, namedExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' %
              (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
      _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

  if details.detailedRes:
    message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                             nPossibleVals[-1], details.threshold)
    nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood + nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    nRej = nClass - nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej) / nPts

    if details.splitRun:
      message('\nHold-out data:')
      resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)), testExamples, composite,
                                               nPossibleVals[-1], details.threshold)
      nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
      nPts = len(testExamples)
      nClass = nGood + nBad
      _runDetails.holdout_error = float(nBad) / nClass
      _runDetails.holdout_correct_conf = avgGood
      _runDetails.holdout_incorrect_conf = avgBad
      _runDetails.holdout_result_matrix = repr(voteTab)
      nRej = nClass - nPts
      if nRej > 0:
        _runDetails.holdout_fraction_dropped = float(nRej) / nPts

  if details.persistTblName and details.dbName:
    message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
    details.Store(db=details.dbName, table=details.persistTblName)

  if details.badName != '':
    badFile = open(details.badName, 'w+')
    for i in range(len(badExamples)):
      ex = badExamples[i]
      vote = wrong[i]
      outStr = '%s\t%s\n' % (ex, vote)
      badFile.write(outStr)
    badFile.close()

  composite.ClearModelExamples()
  return composite
def BalanceComposite(details, composite, data1=None, data2=None):
  """ balances the composite using the parameters provided in details

  **Arguments**

    - details: a _CompositeRun.RunDetails_ object

    - composite: the composite model to be balanced

    - data1: (optional) if provided, this should be the data set used to
      construct the original models

    - data2: (optional) if provided, this should be the data set used to
      construct the new individual models

  """
  if not details.balCnt or details.balCnt > len(composite):
    return composite
  message("Balancing Composite")

  #
  # start by getting data set 1: which is the data set used to build the
  #  original models
  #
  if data1 is None:
    message("\tReading First Data Set")
    fName = details.balTable.strip()
    tmp = details.tableName
    details.tableName = fName
    dbName = details.dbName
    details.dbName = details.balDb
    data1 = details.GetDataSet()
    details.tableName = tmp
    details.dbName = dbName
  if data1 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data1, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data1, shuffle=0, runDetails=details)
  namedExamples = data1.GetNamedData()
  if details.balDoHoldout or details.balDoTrain:
    trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac, silent=1)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
    if details.filterFrac != 0.0:
      trainIdx, temp = DataUtils.FilterData(trainExamples, details.filterVal, details.filterFrac,
                                            -1, indicesOnly=1)
      tmp = [trainExamples[x] for x in trainIdx]
      testExamples += [trainExamples[x] for x in temp]
      trainExamples = tmp
    if details.balDoHoldout:
      testExamples, trainExamples = trainExamples, testExamples
  else:
    trainExamples = namedExamples
  dataSet1 = trainExamples
  cols1 = [x.upper() for x in data1.GetVarNames()]
  data1 = None

  #
  # now grab data set 2: the data used to build the new individual models
  #
  if data2 is None:
    message("\tReading Second Data Set")
    data2 = details.GetDataSet()
  if data2 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data2, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data2, shuffle=0, runDetails=details)
  dataSet2 = data2.GetNamedData()
  cols2 = [x.upper() for x in data2.GetVarNames()]
  data2 = None

  # and balance it:
  res = []
  weights = details.balWeight
  if not isinstance(weights, (tuple, list)):
    weights = (weights, )
  for weight in weights:
    message("\tBalancing with Weight: %.4f" % (weight))
    res.append(
      AdjustComposite.BalanceComposite(composite, dataSet1, dataSet2, weight, details.balCnt,
                                       names1=cols1, names2=cols2))
  return res
def GrowIt(details, composite, progressCallback=None, saveIt=1, setDescNames=0, data=None):
  """ does the actual work of building a composite model

  **Arguments**

    - details: a _CompositeRun.CompositeRun_ object containing details
      (options, parameters, etc.) about the run

    - composite: the composite model to grow

    - progressCallback: (optional) a function which is called with a single
      argument (the number of models built so far) after each model is built.

    - saveIt: (optional) if this is nonzero, the resulting model will be pickled
      and dumped to the filename specified in _details.outName_

    - setDescNames: (optional) if nonzero, the composite's _SetInputOrder()_ method
      will be called using the results of the data set's _GetVarNames()_ method;
      it is assumed that the details object has a _descNames attribute which
      is passed to the composite's _SetDescriptorNames()_ method.  Otherwise
      (the default), _SetDescriptorNames()_ gets the results of _GetVarNames()_.

    - data: (optional) the data set to be used.  If this is not provided, the
      data set described in details will be used.

  **Returns**

    the enlarged composite model

  """
  details.rundate = time.asctime()

  if data is None:
    fName = details.tableName.strip()
    if details.outName == '':
      details.outName = fName + '.pkl'
    if details.dbName == '':
      data = DataUtils.BuildQuantDataSet(fName)
    elif details.qBounds != []:
      details.tableName = fName
      data = details.GetDataSet()
    else:
      data = DataUtils.DBToQuantData(details.dbName, fName, quantName=details.qTableName,
                                     user=details.dbUser, password=details.dbPassword)

  nExamples = data.GetNPts()
  seed = composite._randomSeed
  DataUtils.InitRandomNumbers(seed)
  testExamples = []
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

  namedExamples = data.GetNamedData()
  trainExamples = namedExamples
  nExamples = len(trainExamples)
  message('Training with %d examples' % (nExamples))
  message('\t%d descriptors' % (len(trainExamples[0]) - 2))
  nVars = data.GetNVars()
  nPossibleVals = composite.nPossibleVals
  attrs = list(range(1, nVars + 1))

  if details.useTrees:
    from rdkit.ML.DecTree import CrossValidate, PruneTree
    if details.qBounds != []:
      from rdkit.ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from rdkit.ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    if setDescNames:
      composite.SetInputOrder(data.GetVarNames())
    composite.Grow(trainExamples, attrs, [0] + nPossibleVals, buildDriver=driver, pruner=pruner,
                   nTries=details.nModels, pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy, needsQuantization=0, treeBuilder=builder,
                   nQuantBounds=details.qBounds, startAt=details.startAt,
                   maxDepth=details.limitDepth, progressCallback=progressCallback,
                   silent=not _verbose)
  else:
    from rdkit.ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                   buildDriver=driver, needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList, counts, avgErrs = composite.GetAllData()
  counts = numpy.array(counts)
  avgErrs = numpy.array(avgErrs)
  composite._varNames = data.GetVarNames()

  for i in range(len(modelList)):
    modelList[i].NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts * avgErrs
  averageErr = sum(weightedErrs) / sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = numpy.sqrt(devs * devs)
  avgDev = sum(devs) / sum(counts)
  if _verbose:
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

  if details.bayesModel:
    composite.Train(trainExamples, verbose=0)

  badExamples = []
  if not details.detailedRes:
    if _verbose:
      message('Testing all examples')
    wrong = BuildComposite.testall(composite, namedExamples, badExamples)
    if _verbose:
      message('%d examples (%% %5.2f) were misclassified' %
              (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
    _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

  if details.detailedRes:
    if _verbose:
      message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                             nPossibleVals[-1], details.threshold)
    nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood + nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    nRej = nClass - nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej) / nPts

  return composite
def setUp(self):
  DataUtils.InitRandomNumbers((25, 25))