def test5(self):
    """ indicesToUse: offering every index must match an unrestricted filter """
    expected = (
        (.5, 4, 2),
        (.7, 3, 3),
        (.75, 3, 3),
        (.333, 6, 0),
        (.25, 4, 2),
    )
    rngSeed = (23, 42)
    nPts = len(self.d1)
    for frac, nKeep, nRej in expected:
        # Filter while explicitly offering every index of the data set.
        DataUtils.InitRandomNumbers(rngSeed)
        keep, rej = DataUtils.FilterData(self.d1, 1, frac, indicesToUse=range(nPts))
        assert len(keep) == nKeep, 'bad nKeep (%d != %d)' % (len(keep), nKeep)
        assert len(rej) == nRej, 'bad nRej (%d != %d)' % (len(rej), nRej)

        # Reference run: same seed, no index restriction; the kept and
        # rejected examples must be identical to the run above.
        DataUtils.InitRandomNumbers(rngSeed)
        tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac)
        assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep), str(tgtKeep))
        assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej), str(tgtRej))
def test4_indicesOnly_indicesToUse(self):
    # indicesOnly combined with indicesToUse: the returned indices, mapped
    # back onto self.d1, must reproduce what a plain FilterData call returns.
    expected = (
        (.5, 4, 2),
        (.7, 3, 3),
        (.75, 3, 3),
        (.333, 6, 0),
        (.25, 4, 2),
    )
    rngSeed = (23, 42)
    nPts = len(self.d1)
    for frac, nKeep, nRej in expected:
        DataUtils.InitRandomNumbers(rngSeed)
        keepIdx, rejIdx = DataUtils.FilterData(self.d1, 1, frac, indicesToUse=range(nPts),
                                               indicesOnly=1)
        assert len(keepIdx) == nKeep, 'bad nKeep (%d != %d)' % (len(keepIdx), nKeep)
        assert len(rejIdx) == nRej, 'bad nRej (%d != %d)' % (len(rejIdx), nRej)

        # Translate the indices into examples before comparing.
        keep = [self.d1[idx] for idx in keepIdx]
        rej = [self.d1[idx] for idx in rejIdx]

        # Reference run with the same seed, returning the examples directly.
        DataUtils.InitRandomNumbers(rngSeed)
        tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac)
        assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep), str(tgtKeep))
        assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej), str(tgtRej))
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask,
                                   X_argsorted, seed, verbose):
    """Private function used to build a batch of trees within a job.

    For each of the ``n_trees`` trees a fresh estimator is created from
    ``forest``, given its own random state derived from ``seed``, and fitted
    on ``X``/``y``.  When ``forest.bootstrap`` is true the tree is fitted on a
    resampled subset: candidate indices come from ``DataUtils.FilterData``
    applied to ``(index, label)`` pairs, are then drawn with replacement, and
    are expressed through per-sample weights and a boolean sample mask.

    Parameters
    ----------
    n_trees : number of trees to build in this batch.
    forest : the owning ensemble; must provide ``_make_estimator``,
        ``compute_importances`` and ``bootstrap``.
    X, y : training data and labels, passed to ``tree.fit``.
    sample_weight : per-sample weights or ``None`` (treated as all ones when
        bootstrapping).
    sample_mask : mask passed to ``tree.fit``; copied before being modified.
    X_argsorted : passed through to ``tree.fit`` unchanged.
    seed : seed (or random state) accepted by ``check_random_state``.
    verbose : progress is printed when greater than 1.

    Returns
    -------
    list of the fitted trees, in construction order.
    """
    from sklearn.utils import check_random_state
    from sklearn.utils.fixes import bincount

    MAX_INT = numpy.iinfo(numpy.int32).max
    random_state = check_random_state(seed)

    trees = []
    # range() instead of the Python-2-only xrange(): identical iteration, and
    # the function no longer raises NameError on Python 3.
    for i in range(n_trees):
        if verbose > 1:
            print("building tree %d of %d" % (i + 1, n_trees))

        # Each tree gets an independent random state drawn from the batch's.
        seed = random_state.randint(MAX_INT)
        tree = forest._make_estimator(append=False)
        tree.set_params(compute_importances=forest.compute_importances)
        tree.set_params(random_state=check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = numpy.ones((n_samples, ), dtype=numpy.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            # Pair every label with its row index so the filter can hand back
            # row indices (column 1 holds the label).
            # NOTE(review): presumably this balances label value 1 to a 50%
            # fraction -- confirm against DataUtils.FilterData; indicesToUse=0
            # is passed through exactly as before.
            ty = list(enumerate(y))
            indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0,
                                           indicesOnly=1)[0]

            # Draw from the kept indices with replacement.
            indices2 = random_state.randint(0, len(indices), len(indices))
            indices = [indices[j] for j in indices2]

            # Express the resample as multiplicities on the original rows;
            # rows never drawn are masked out.
            sample_counts = bincount(indices, minlength=n_samples)
            curr_sample_weight *= sample_counts
            curr_sample_mask = sample_mask.copy()
            curr_sample_mask[sample_counts == 0] = False

            tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask,
                     X_argsorted=X_argsorted, check_input=False)
            # Remember which rows this tree was trained on.
            tree.indices = curr_sample_mask
        else:
            tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask,
                     X_argsorted=X_argsorted, check_input=False)

        trees.append(tree)

    return trees
def test1(self):
    """ basics: kept/rejected counts for a range of filter fractions """
    expected = (
        (.5, 4, 2),
        (.7, 3, 3),
        (.75, 3, 3),
        (.333, 6, 0),
        (.25, 4, 2),
    )
    for frac, nKeep, nRej in expected:
        keep, rej = DataUtils.FilterData(self.d1, 1, frac)
        # Only the sizes of the two partitions are checked here.
        assert len(keep) == nKeep, 'bad nKeep (%d != %d)' % (len(keep), nKeep)
        assert len(rej) == nRej, 'bad nRej (%d != %d)' % (len(rej), nRej)
seed = model._randomSeed except AttributeError: pass else: DataUtils.InitRandomNumbers(seed) if details.shuffleActivities: DataUtils.RandomizeActivities(tmpD, shuffle=1) if hasattr(model, '_splitFrac') and (details.doHoldout or details.doTraining): trainIdx, testIdx = SplitData.SplitIndices(tmpD.GetNPts(), model._splitFrac, silent=1) if details.filterFrac != 0.0: trainFilt, temp = DataUtils.FilterData(tmpD, details.filterVal, details.filterFrac, -1, indicesToUse=trainIdx, indicesOnly=1) testIdx += temp trainIdx = trainFilt if details.doTraining: testIdx, trainIdx = trainIdx, testIdx else: testIdx = range(tmpD.GetNPts()) message('screening %d examples' % (len(testIdx))) nTrueActives, screenRes = ScreenModel( model, descs, tmpD, picking=details.activeTgt,
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
    """Build a composite model from a data set, score it and optionally save it.

    **Arguments**

      - details: the run-parameters object.  Its attributes select the random
        seed, train/test splitting, activity filtering, the model type
        (trees, signature trees, KNN, naive Bayes or, by default, neural
        nets) and the output locations.

      - data: the data set; must provide GetNPts(), GetNamedData(),
        GetNVars(), GetNPossibleVals() and GetVarNames().

      - progressCallback: (optional) forwarded to composite.Grow() for the
        tree and signature-tree model types only.

      - saveIt: if true the composite is pickled to details.outName.

      - setDescNames: if true the data's variable names become the
        composite's input order and details._descNames its descriptor names;
        otherwise the data's variable names are used as descriptor names.

    **Returns**

      the composite model that was built

    **Notes**

      - the random number generators are (re)seeded via
        DataUtils.InitRandomNumbers()

      - details.model is set to a pickled copy of the composite, and
        details.internalHoldoutFrac is forced to 0.0 when a single model is
        requested

      - error statistics are recorded on the module-level _runDetails object
        (NOTE(review): not on `details` -- confirm these are the same object
        wherever this function is called)

      - depending on details, writes the pickle-data file, the bad-examples
        file and the results database table

    """
    nExamples = data.GetNPts()
    if details.lockRandom:
        seed = details.randomSeed
    else:
        import random
        # Integer bounds: randint() rejects float arguments on Python >= 3.12.
        seed = (random.randint(0, 1000000), random.randint(0, 1000000))
    DataUtils.InitRandomNumbers(seed)
    testExamples = []
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    if details.splitRun == 1:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                                   silent=not _verbose)
        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
    else:
        testExamples = []
        testIdx = []
        trainIdx = list(range(len(namedExamples)))
        trainExamples = namedExamples

    if details.filterFrac != 0.0:
        # if we're doing quantization on the fly, we need to handle that here:
        if hasattr(details, 'activityBounds') and details.activityBounds:
            tExamples = []
            bounds = details.activityBounds
            for pt in trainExamples:
                # work on a copy so the training examples keep their raw activity
                pt = pt[:]
                act = pt[-1]
                # the activity bin is the index of the first bound the
                # activity is below, or len(bounds) if there is none
                bound = 0
                while bound < len(bounds) and not act < bounds[bound]:
                    bound += 1
                pt[-1] = bound
                tExamples.append(pt)
        else:
            bounds = None
            tExamples = trainExamples
        trainIdx, temp = DataUtils.FilterData(tExamples, details.filterVal, details.filterFrac, -1,
                                              indicesOnly=1)
        # examples rejected by the filter are moved to the test set
        tmp = [trainExamples[x] for x in trainIdx]
        testExamples += [trainExamples[x] for x in temp]
        trainExamples = tmp

        counts = DataUtils.CountResults(trainExamples, bounds=bounds)
        # sorted() works for both Python 2 key lists and Python 3 key views
        ks = sorted(counts.keys())
        message('Result Counts in training set:')
        for k in ks:
            message(str((k, counts[k])))
        counts = DataUtils.CountResults(testExamples, bounds=bounds)
        ks = sorted(counts.keys())
        message('Result Counts in test set:')
        for k in ks:
            message(str((k, counts[k])))
    nExamples = len(trainExamples)
    message('Training with %d examples' % (nExamples))

    nVars = data.GetNVars()
    # must be a real list: entries are removed below
    attrs = list(range(1, nVars + 1))
    nPossibleVals = data.GetNPossibleVals()
    # drop variables flagged with -1 possible values from the attribute list
    for i in range(1, len(nPossibleVals)):
        if nPossibleVals[i - 1] == -1:
            attrs.remove(i)

    if details.pickleDataFileName != '':
        # the context manager closes the file even if pickling fails
        with open(details.pickleDataFileName, 'wb+') as pickleDataFile:
            cPickle.dump(trainExamples, pickleDataFile)
            cPickle.dump(testExamples, pickleDataFile)

    if details.bayesModel:
        composite = BayesComposite.BayesComposite()
    else:
        composite = Composite.Composite()

    # record how the data were prepared so the run can be reproduced later
    composite._randomSeed = seed
    composite._splitFrac = details.splitFrac
    composite._shuffleActivities = details.shuffleActivities
    composite._randomizeActivities = details.randomActivities

    if hasattr(details, 'filterFrac'):
        composite._filterFrac = details.filterFrac
    if hasattr(details, 'filterVal'):
        composite._filterVal = details.filterVal

    composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

    composite.SetActivityQuantBounds(details.activityBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
        nPossibleVals[-1] = len(details.activityBounds) + 1

    if setDescNames:
        composite.SetInputOrder(data.GetVarNames())
        composite.SetDescriptorNames(details._descNames)
    else:
        composite.SetDescriptorNames(data.GetVarNames())
    composite.SetActivityQuantBounds(details.activityBounds)
    if details.nModels == 1:
        details.internalHoldoutFrac = 0.0

    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        composite.SetQuantBounds(details.qBounds)
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver, pruner=pruner, nTries=details.nModels,
                       pruneIt=details.pruneIt, lessGreedy=details.lessGreedy,
                       needsQuantization=0, treeBuilder=builder, nQuantBounds=details.qBounds,
                       startAt=details.startAt, maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors, silent=not _verbose)

    elif details.useSigTrees:
        from rdkit.ML.DecTree import CrossValidate
        from rdkit.ML.DecTree import BuildSigTree
        builder = BuildSigTree.SigTreeBuilder
        driver = CrossValidate.CrossValidationDriver
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        # optional settings fall back to defaults when details lacks them
        biasList = getattr(details, 'sigTreeBiasList', None)
        useCMIM = getattr(details, 'useCMIM', 0)
        allowCollections = getattr(details, 'allowCollections', False)
        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                       treeBuilder=builder, maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors, biasList=biasList,
                       useCMIM=useCMIM, allowCollection=allowCollections, silent=not _verbose)

    elif details.useKNN:
        from rdkit.ML.KNN import CrossValidate
        from rdkit.ML.KNN import DistFunctions

        driver = CrossValidate.CrossValidationDriver
        dfunc = ''
        if (details.knnDistFunc == "Euclidean"):
            dfunc = DistFunctions.EuclideanDist
        elif (details.knnDistFunc == "Tanimoto"):
            dfunc = DistFunctions.TanimotoDist
        else:
            assert 0, "Bad KNN distance metric value"

        composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                       numNeigh=details.knnNeighs, holdOutFrac=details.internalHoldoutFrac,
                       distFunc=dfunc)

    elif details.useNaiveBayes or details.useSigBayes:
        from rdkit.ML.NaiveBayes import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
            composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           mEstimateVal=details.mEstimateVal, silent=not _verbose)
        else:
            useCMIM = getattr(details, 'useCMIM', 0)
            composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver, nTries=details.nModels, needsQuantization=0,
                           nQuantBounds=details.qBounds, mEstimateVal=details.mEstimateVal,
                           useSigs=True, useCMIM=useCMIM,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           silent=not _verbose)

    ## elif details.useSVM:
    ##   from rdkit.ML.SVM import CrossValidate
    ##   driver = CrossValidate.CrossValidationDriver
    ##   composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
    ##                  buildDriver=driver, nTries=details.nModels,
    ##                  needsQuantization=0,
    ##                  cost=details.svmCost,gamma=details.svmGamma,
    ##                  weights=details.svmWeights,degree=details.svmDegree,
    ##                  type=details.svmType,kernelType=details.svmKernel,
    ##                  coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
    ##                  cache_size=details.svmCache,shrinking=details.svmShrink,
    ##                  dataType=details.svmDataType,
    ##                  holdOutFrac=details.internalHoldoutFrac,
    ##                  replacementSelection=details.replacementSelection,
    ##                  silent=not _verbose)

    else:
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                       buildDriver=driver, needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    devs = numpy.sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # blow out the saved examples and then save the composite:
    composite.ClearModelExamples()
    if saveIt:
        composite.Pickle(details.outName)
    details.model = DbModule.binaryHolder(cPickle.dumps(composite))

    badExamples = []
    if not details.detailedRes and (not hasattr(details, 'noScreen') or not details.noScreen):
        if details.splitRun:
            message('Testing all hold-out examples')
            wrong = testall(composite, testExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong), 100. * float(len(wrong)) / float(len(testExamples))))
            _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
        else:
            message('Testing all examples')
            wrong = testall(composite, namedExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
            _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(list(range(data.GetNPts())), data, composite,
                                                 nPossibleVals[-1], details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # NOTE(review): this looks inverted -- if nGood + nBad can never exceed
        # nPts the branch below is unreachable and the dropped fraction is
        # never recorded (nPts - nClass may have been intended).  Left as-is;
        # confirm against ScreenComposite.ShowVoteResults.
        nRej = nClass - nPts
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

        if details.splitRun:
            message('\nHold-out data:')
            resTup = ScreenComposite.ShowVoteResults(list(range(len(testExamples))),
                                                     testExamples, composite,
                                                     nPossibleVals[-1], details.threshold)
            nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
            nPts = len(testExamples)
            nClass = nGood + nBad
            _runDetails.holdout_error = float(nBad) / nClass
            _runDetails.holdout_correct_conf = avgGood
            _runDetails.holdout_incorrect_conf = avgBad
            _runDetails.holdout_result_matrix = repr(voteTab)
            # NOTE(review): same apparent inversion as for the full data set.
            nRej = nClass - nPts
            if nRej > 0:
                _runDetails.holdout_fraction_dropped = float(nRej) / nPts

    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
        details.Store(db=details.dbName, table=details.persistTblName)

    if details.badName != '':
        with open(details.badName, 'w+') as badFile:
            # `wrong` is indexed in step with badExamples; badExamples stays
            # empty (and `wrong` may be unset) when no screening was done.
            for i, ex in enumerate(badExamples):
                vote = wrong[i]
                outStr = '%s\t%s\n' % (ex, vote)
                badFile.write(outStr)

    composite.ClearModelExamples()
    return composite
def BalanceComposite(details, composite, data1=None, data2=None):
    """ balances the composite using the parameters provided in details

     **Arguments**

       - details a  _CompositeRun.RunDetails_ object

       - composite: the composite model to be balanced

       - data1: (optional) if provided, this should be the
         data set used to construct the original models

       - data2: (optional) if provided, this should be the
         data set used to construct the new individual models

     **Returns**

       - the unmodified _composite_ if no balancing is requested
         (details.balCnt is false or larger than len(composite)) or if
         either data set could not be obtained

       - otherwise a list with one result of
         AdjustComposite.BalanceComposite() per weight in details.balWeight

     **Notes**

       - details.splitFrac and details.randomSeed are overwritten with the
         values stored on the composite, and the random number generators
         are re-seeded from that seed before each data set is prepared

    """
    if not details.balCnt or details.balCnt > len(composite):
        return composite
    message("Balancing Composite")

    #
    # start by getting data set 1: which is the data set used to build the
    #  original models
    #
    if data1 is None:
        message("\tReading First Data Set")
        balTable = details.balTable.strip()
        balDb = details.balDb
        savedTable = details.tableName
        savedDb = details.dbName
        # temporarily point details at the balance table; always restore the
        # caller's settings, even if loading the data set fails
        details.tableName = balTable
        details.dbName = balDb
        try:
            data1 = details.GetDataSet()
        finally:
            details.tableName = savedTable
            details.dbName = savedDb
    if data1 is None:
        return composite

    # reproduce the data preparation that was used to build the composite
    details.splitFrac = composite._splitFrac
    details.randomSeed = composite._randomSeed
    DataUtils.InitRandomNumbers(details.randomSeed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data1, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data1, shuffle=0, runDetails=details)
    namedExamples = data1.GetNamedData()
    if details.balDoHoldout or details.balDoTrain:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                                   silent=1)
        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
        if details.filterFrac != 0.0:
            trainIdx, temp = DataUtils.FilterData(trainExamples, details.filterVal,
                                                  details.filterFrac, -1,
                                                  indicesOnly=1)
            # examples rejected by the filter join the hold-out set
            tmp = [trainExamples[x] for x in trainIdx]
            testExamples += [trainExamples[x] for x in temp]
            trainExamples = tmp
        if details.balDoHoldout:
            # balance against the hold-out examples instead of the training ones
            testExamples, trainExamples = trainExamples, testExamples
    else:
        trainExamples = namedExamples
    dataSet1 = trainExamples
    cols1 = [x.upper() for x in data1.GetVarNames()]
    # drop the reference to the data set; only the examples and names are needed
    data1 = None

    #
    # now grab data set 2: the data used to build the new individual models
    #
    if data2 is None:
        message("\tReading Second Data Set")
        data2 = details.GetDataSet()
    if data2 is None:
        return composite
    details.splitFrac = composite._splitFrac
    details.randomSeed = composite._randomSeed
    DataUtils.InitRandomNumbers(details.randomSeed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data2, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data2, shuffle=0, runDetails=details)
    dataSet2 = data2.GetNamedData()
    cols2 = [x.upper() for x in data2.GetVarNames()]
    data2 = None

    # and balance it:
    res = []
    weights = details.balWeight
    # a single weight is treated as a one-element sequence; isinstance() is
    # used because types.TupleType/ListType do not exist on Python 3
    if not isinstance(weights, (tuple, list)):
        weights = (weights, )
    for weight in weights:
        message("\tBalancing with Weight: %.4f" % (weight))
        res.append(AdjustComposite.BalanceComposite(composite, dataSet1, dataSet2,
                                                    weight,
                                                    details.balCnt,
                                                    names1=cols1, names2=cols2))
    return res
def Grow(self, examples, attrs, nPossibleVals, buildDriver, pruner=None,
         nTries=10, pruneIt=0,
         needsQuantization=1, progressCallback=None,
         **buildArgs):
    """ Grows the composite

      **Arguments**

        - examples: a list of examples to be used in training

        - attrs: a list of the variables to be used in training

        - nPossibleVals: this is used to provide a list of the number
          of possible values for each variable.  It is used if the
          local quantBounds have not been set (for example for when you
          are working with data which is already quantized).

        - buildDriver: the function to call to build the new models

        - pruner: a function used to "prune" (reduce the complexity of)
          the resulting model.

        - nTries: the number of new models to add

        - pruneIt: toggles whether or not pruning is done

        - needsQuantization: used to indicate whether or not this type of model
          requires quantized data

        - progressCallback: (optional) called with the cycle index after
          each model has been added

        - **buildArgs: all other keyword args are passed to _buildDriver_.
          A 'silent' entry is consumed here to control the progress
          printout; the driver itself is always called with silent=1 and
          calcTotalError=1.

      **Note**

        - new models are *added* to the existing ones

        - when activity quantization bounds are set, the entries of
          _examples_ are replaced in place and the last entry of
          _nPossibleVals_ is overwritten (unless the examples were remapped
          first, in which case a new list of examples is modified)

    """
    silent = buildArgs.get('silent', 0)
    buildArgs['silent'] = 1
    buildArgs['calcTotalError'] = 1

    if self._mapOrder is not None:
        # build a real list: the examples are measured, indexed and re-used
        # for every try below, which a one-shot map() iterator cannot support
        examples = [self._RemapInput(ex) for ex in examples]
    if self.GetActivityQuantBounds():
        for i in range(len(examples)):
            examples[i] = self.QuantizeActivity(examples[i])
            nPossibleVals[-1] = len(self.GetActivityQuantBounds()) + 1
    if self.nPossibleVals is None:
        self.nPossibleVals = nPossibleVals[:]

    if needsQuantization:
        # from here on use the composite's own value counts
        nPossibleVals = self.nPossibleVals
        trainExamples = [self.QuantizeExample(ex, self.quantBounds) for ex in examples]
    else:
        trainExamples = examples

    for i in range(nTries):
        useModelFilter = hasattr(self, '_modelFilterFrac') and self._modelFilterFrac != 0
        if useModelFilter:
            # each model is trained on its own filtered subset of the examples
            trainIdx, temp = DataUtils.FilterData(trainExamples, self._modelFilterVal,
                                                  self._modelFilterFrac, -1, indicesOnly=1)
            trainSet = [trainExamples[x] for x in trainIdx]
        else:
            trainSet = trainExamples

        model, frac = buildDriver(*(trainSet, attrs, nPossibleVals), **buildArgs)
        if pruneIt:
            model, frac2 = pruner(model, model.GetTrainingExamples(), model.GetTestExamples(),
                                  minimizeTestErrorOnly=0)
            frac = frac2
        if useModelFilter and hasattr(model, '_trainIndices'):
            # correct the model's training indices: they refer to positions in
            # the filtered subset and are mapped back to the full example list
            model._trainIndices = [trainIdx[x] for x in model._trainIndices]

        self.AddModel(model, frac, needsQuantization)
        # report roughly every tenth cycle; floor division keeps the step an
        # integer on Python 3 as it was under Python 2's integer `/`
        if not silent and (nTries < 10 or i % (nTries // 10) == 0):
            print('Cycle: % 4d' % (i))
        if progressCallback is not None:
            progressCallback(i)