def RandomizeMolBlock(molB):
    """ returns a copy of a V2000 mol block with the atoms in random order

    The three header lines and the counts line are copied through unchanged,
    the atom lines are shuffled, and the two atom indices at the start of
    each bond line are rewritten to point at the atoms' new positions.
    Everything after the bond block is dropped and replaced by a single
    ``M  END`` terminator.

    **Arguments**

      - molB: the mol block as a single newline-separated string

    **Returns**

      the randomized mol block (a string)

    **Notes**

      - the shuffle uses the module-level _random_ generator, so seed that
        for reproducible output

    """
    lines = molB.split('\n')
    res = list(lines[0:3])
    countsLine = lines[3]
    res.append(countsLine)
    # fixed-width counts line: columns 0-2 hold nAtoms, 3-5 hold nBonds
    nAts = int(countsLine[0:3])
    nBonds = int(countsLine[3:6])

    atomStart = 4
    atLines = lines[atomStart:atomStart + nAts]
    order = list(range(nAts))
    # the optional ``random`` argument of shuffle() was removed in Python 3.11
    random.shuffle(order)
    for oldIdx in order:
        res.append(atLines[oldIdx])

    bondStart = atomStart + nAts
    for lineNo in range(bondStart, bondStart + nBonds):
        bondLine = lines[lineNo]
        # order[new] == old, so order.index(old) is the atom's new position
        begin = order.index(int(bondLine[0:3]) - 1)
        end = order.index(int(bondLine[3:6]) - 1)
        res.append('% 3d% 3d' % (begin + 1, end + 1) + bondLine[6:])

    # the CTfile terminator is "M", two spaces, "END"
    res.append('M  END')
    return '\n'.join(res)
def RandomizeMolBlock(molB):
    """ returns a copy of a V2000 mol block with the atoms in random order

    Header and counts lines are kept as they are, the atom block is
    shuffled, and the leading pair of atom indices on every bond line is
    remapped to the shuffled numbering.  Lines following the bond block are
    not copied; the result always ends with ``M  END``.

    **Arguments**

      - molB: the mol block as a single newline-separated string

    **Returns**

      the randomized mol block (a string)

    """
    splitB = molB.split('\n')
    res = []
    res.extend(splitB[0:3])
    idx = 3
    counts = splitB[idx]
    res.append(counts)
    # fixed-width counts line: columns 0-2 hold nAtoms, 3-5 hold nBonds
    nAts = int(counts[0:3])
    nBonds = int(counts[3:6])

    idx += 1
    atLines = splitB[idx:idx + nAts]
    order = list(range(nAts))
    # shuffle(x, random=...) is gone as of Python 3.11; the default
    # generator is what was being passed in anyway
    random.shuffle(order)
    res.extend(atLines[i] for i in order)

    idx += nAts
    for _ in range(nBonds):
        bond = splitB[idx]
        # order[new] == old: look up where each old index ended up
        a1 = order.index(int(bond[0:3]) - 1)
        a2 = order.index(int(bond[3:6]) - 1)
        res.append('% 3d% 3d' % (a1 + 1, a2 + 1) + bond[6:])
        idx += 1

    # the CTfile terminator is "M", two spaces, "END"
    res.append('M  END')
    return '\n'.join(res)
def _testSpecific():
    """ manual check: build an ID3 tree on 5 points, prune it, print errors

    The hold-out set is the training set plus two extra copies of
    [0, 1, 1, 0].  Prints the tree and its errors before and after pruning,
    the misclassified hold-out examples, and the sizes of both trees.
    """
    from rdkit.ML.DecTree import ID3
    trainPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    holdoutPts = trainPts + [[0, 1, 1, 0], [0, 1, 1, 0]]

    tree = ID3.ID3Boot(trainPts, attrs=range(3), nPossibleVals=[2] * 4)
    tree.Print()
    err, _ = CrossValidate.CrossValidate(tree, trainPts)
    print('original error:', err)
    err, _ = CrossValidate.CrossValidate(tree, holdoutPts)
    print('original holdout error:', err)

    # the second element of PruneTree's result is not used here
    newTree, _ = PruneTree(tree, trainPts, holdoutPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, holdoutPts)
    print('pruned holdout error is:', err)
    print(badEx)
    print(len(tree), len(newTree))
def _testChain():
    """ manual check: build and prune an ID3 tree on a 13-point data set

    The same list is used for training and as the hold-out set.  Prints the
    tree and its errors before and after pruning, then the examples the
    pruned tree misclassifies.
    """
    from rdkit.ML.DecTree import ID3
    pts = [[1, 0, 0, 0, 1] for _ in range(7)]
    pts += [
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
    ]
    holdout = pts
    nCols = len(pts[0])

    tree = ID3.ID3Boot(pts, attrs=range(nCols - 1), nPossibleVals=[2] * nCols)
    tree.Print()
    err, _ = CrossValidate.CrossValidate(tree, pts)
    print('original error:', err)
    err, _ = CrossValidate.CrossValidate(tree, holdout)
    print('original holdout error:', err)

    newTree, _ = PruneTree(tree, pts, holdout)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, holdout)
    print('pruned holdout error is:', err)
    print(badEx)
def TestQuantTree():  # pragma: nocover
    """ Testing code for named trees

    Builds a quantized tree on eight named examples twice (once without a
    depth limit, once with maxDepth=1), pickling and printing each one.

    The created pkl file is required by the unit test code.
    """
    examples1 = [
        ['p1', 0, 1, 0.1, 0],
        ['p2', 0, 0, 0.1, 1],
        ['p3', 0, 0, 1.1, 2],
        ['p4', 0, 1, 1.1, 2],
        ['p5', 1, 0, 0.1, 2],
        ['p6', 1, 0, 1.1, 2],
        ['p7', 1, 1, 0.1, 2],
        ['p8', 1, 1, 1.1, 0],
    ]
    # skip the name in column 0 and the result code in the last column
    attrs = list(range(1, len(examples1[0]) - 1))
    nPossibleVals = [0, 2, 2, 0, 3]
    boundsPerVar = [0, 0, 0, 1, 0]

    # NOTE(review): both passes pickle to the same path, so the depth-limited
    # tree overwrites the base one -- confirm that is intended.
    for banner, extraArgs in (('base', {}), ('depth limit', {'maxDepth': 1})):
        print(banner)
        t1 = QuantTreeBoot(examples1, attrs, nPossibleVals, boundsPerVar, **extraArgs)
        t1.Pickle('test_data/QuantTree1.pkl')
        t1.Print()
def TestQuantTree():
    """ testing code for named trees

    Grows a quantized tree on eight named examples, first without and then
    with a depth limit of 1; each tree is pickled and printed.
    """
    rows = [
        ['p1', 0, 1, 0.1, 0],
        ['p2', 0, 0, 0.1, 1],
        ['p3', 0, 0, 1.1, 2],
        ['p4', 0, 1, 1.1, 2],
        ['p5', 1, 0, 0.1, 2],
        ['p6', 1, 0, 1.1, 2],
        ['p7', 1, 1, 0.1, 2],
        ['p8', 1, 1, 1.1, 0],
    ]
    # usable variables: everything between the name and the result code
    usable = list(range(1, len(rows[0]) - 1))
    valCounts = [0, 2, 2, 0, 3]
    bounds = [0, 0, 0, 1, 0]

    print('base')
    baseTree = QuantTreeBoot(rows, usable, valCounts, bounds)
    baseTree.Pickle('test_data/QuantTree1.pkl')
    baseTree.Print()

    print('depth limit')
    shallowTree = QuantTreeBoot(rows, usable, valCounts, bounds, maxDepth=1)
    shallowTree.Pickle('test_data/QuantTree1.pkl')
    shallowTree.Print()
def _testChain():
    """ manual check: build and prune an ID3 tree on a 13-point data set

    Training and hold-out data are the same list.  Prints the original tree
    and its errors, the pruned tree, the second value returned by PruneTree,
    and the pruned tree's hold-out error and misclassified examples.
    """
    from rdkit.ML.DecTree import ID3
    data = [[1, 0, 0, 0, 1] for _ in range(7)] + [
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
    ]
    holdout = data
    width = len(data[0])

    tree = ID3.ID3Boot(data, attrs=range(width - 1), nPossibleVals=[2] * width)
    tree.Print()
    for label, pts in (('original error:', data), ('original holdout error:', holdout)):
        err, _ = CrossValidate.CrossValidate(tree, pts)
        print(label, err)

    newTree, frac2 = PruneTree(tree, data, holdout)
    newTree.Print()
    print('best error of pruned tree:', frac2)
    err, badEx = CrossValidate.CrossValidate(newTree, holdout)
    print('pruned holdout error is:', err)
    print(badEx)
def CheckCanonicalization(mol, nReps=10):
    """ checks that randomized copies of a molecule give the same SMILES

    **Arguments**

      - mol: the molecule to test

      - nReps: (optional) the number of randomized copies to try

    **Raises**

      ValueError (with both SMILES in the message) at the first randomized
      copy whose SMILES differs from the reference SMILES of _mol_

    """
    expected = Chem.MolToSmiles(mol, False)
    for _ in range(nReps):
        shuffled = RandomizeMol(mol)
        observed = Chem.MolToSmiles(shuffled, False)
        if observed == expected:
            continue
        raise ValueError('\nRef: %s\n : %s' % (expected, observed))
def CheckCanonicalization(mol, nReps=10):
    """ verifies that SMILES generation is independent of atom ordering

    Generates the SMILES of _mol_ once, then _nReps_ times randomizes the
    molecule with RandomizeMol and regenerates the SMILES.

    **Arguments**

      - mol: the molecule to test

      - nReps: (optional) how many randomizations to check

    **Raises**

      ValueError as soon as a randomized copy yields a different SMILES

    """
    refSmi = Chem.MolToSmiles(mol, False)
    remaining = nReps
    while remaining > 0:
        smi = Chem.MolToSmiles(RandomizeMol(mol), False)
        if smi != refSmi:
            raise ValueError('\nRef: %s\n : %s' % (refSmi, smi))
        remaining -= 1
def _testSpecific():
    """ manual check: prune a small ID3 tree and report what changed

    Trains on five 4-column points, evaluates on those points plus two
    copies of [0, 1, 1, 0], prunes against that hold-out set, and prints the
    trees, the errors, the misclassified examples and both tree sizes.
    """
    from rdkit.ML.DecTree import ID3
    oPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    extra = [[0, 1, 1, 0], [0, 1, 1, 0]]
    tPts = oPts + extra

    tree = ID3.ID3Boot(oPts, attrs=range(3), nPossibleVals=[2] * 4)
    tree.Print()
    for label, pts in (('original error:', oPts), ('original holdout error:', tPts)):
        err, _ = CrossValidate.CrossValidate(tree, pts)
        print(label, err)

    # the second element of PruneTree's result is not used here
    newTree, _ = PruneTree(tree, oPts, tPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, tPts)
    print('pruned holdout error is:', err)
    print(badEx)
    print(len(tree), len(newTree))
def TestQuantTree():
    """ testing code for named trees

    Two quantized trees are built from the same eight named examples: an
    unrestricted one and one limited to maxDepth=1.  Each is pickled and
    printed.
    """
    names = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8']
    values = [
        [0, 1, 0.1, 0],
        [0, 0, 0.1, 1],
        [0, 0, 1.1, 2],
        [0, 1, 1.1, 2],
        [1, 0, 0.1, 2],
        [1, 0, 1.1, 2],
        [1, 1, 0.1, 2],
        [1, 1, 1.1, 0],
    ]
    examples1 = [[name] + row for name, row in zip(names, values)]
    # columns between the name and the result code are the variables
    attrs = list(range(1, len(examples1[0]) - 1))
    nPossibleVals = [0, 2, 2, 0, 3]
    boundsPerVar = [0, 0, 0, 1, 0]

    def _buildAndReport(banner, **limits):
        print(banner)
        built = QuantTreeBoot(examples1, attrs, nPossibleVals, boundsPerVar, **limits)
        built.Pickle('test_data/QuantTree1.pkl')
        built.Print()

    _buildAndReport('base')
    _buildAndReport('depth limit', maxDepth=1)
def RandomizeMolBlock(molB):
    """ returns a copy of a V2000 mol block with the atoms in random order

    The three header lines and the counts line are copied through, the atom
    lines are shuffled, and atom indices are remapped in the bond block and
    in any ``M  CHG`` property lines that follow it.  Other lines after the
    bond block are dropped; the result ends with a single ``M  END``.

    **Arguments**

      - molB: the mol block as a single newline-separated string

    **Returns**

      the randomized mol block (a string)

    **Notes**

      - charge values are copied unchanged; only the atom index of each
        ``M  CHG`` entry is rewritten

    """
    lines = molB.split('\n')
    res = list(lines[0:3])
    countsLine = lines[3]
    res.append(countsLine)
    # fixed-width counts line: columns 0-2 hold nAtoms, 3-5 hold nBonds
    nAts = int(countsLine[0:3])
    nBonds = int(countsLine[3:6])

    idx = 4
    atLines = lines[idx:idx + nAts]
    order = list(range(nAts))
    # the optional ``random`` argument of shuffle() was removed in Python 3.11
    random.shuffle(order)
    for oldIdx in order:
        res.append(atLines[oldIdx])

    idx += nAts
    for _ in range(nBonds):
        bondLine = lines[idx]
        # order[new] == old, so order.index(old) is the atom's new position
        begin = order.index(int(bondLine[0:3]) - 1)
        end = order.index(int(bondLine[3:6]) - 1)
        res.append('% 3d% 3d' % (begin + 1, end + 1) + bondLine[6:])
        idx += 1

    # Charges
    for line in lines[idx:]:
        # the tag is 6 characters wide: "M", two spaces, "CHG"
        if line[0:6] != 'M  CHG':
            continue
        fields = line.split()
        # keep the tag and the 3-column entry count, rebuild the pairs
        newLine = line[0:9]
        for pos in range(3, len(fields), 2):
            newAtom = order.index(int(fields[pos]) - 1) + 1
            charge = int(fields[pos + 1])
            newLine += '%4i%4i' % (newAtom, charge)
        res.append(newLine)

    # the CTfile terminator is "M", two spaces, "END"
    res.append('M  END')
    return '\n'.join(res)
def FindVarMultQuantBounds(vals, nBounds, results, nPossibleRes):
    """ finds multiple quantization bounds for a single variable

    **Arguments**

      - vals: sequence of variable values (assumed to be floats)

      - nBounds: the number of quantization bounds to find

      - results: a list of result codes (should be integers)

      - nPossibleRes: an integer with the number of possible values of the
        result variable

    **Returns**

      - a 2-tuple containing:

        1) a list of the quantization bounds (floats)

        2) the information gain associated with this quantization

    **Notes**

      - empty input returns ([], -1e8); if no candidate start points are
        found the result is ([0], 0.0)

    """
    assert len(vals) == len(results), 'vals/results length mismatch'
    nData = len(vals)
    if nData == 0:
        return [], -1e8

    # sort the (value, result) pairs by value
    pairs = sorted(zip(vals, results))
    sortVals, sortResults = zip(*pairs)

    startNext = _FindStartPoints(sortVals, sortResults, nData)
    nStarts = len(startNext)
    if nStarts == 0:
        return [0], 0.0
    if nStarts < nBounds:
        nBounds = nStarts - 1
    if nBounds == 0:
        nBounds = 1

    initCuts = list(range(nBounds))
    maxGain, bestCuts = _RecurseOnBounds(sortVals, initCuts, 0, startNext, sortResults,
                                         nPossibleRes)

    nVs = len(sortVals)
    quantBounds = []
    for cut in bestCuts:
        idx = startNext[cut]
        if idx == nVs:
            bound = sortVals[-1]
        elif idx == 0:
            bound = sortVals[idx]
        else:
            # midpoint between the two values that straddle the cut
            bound = (sortVals[idx] + sortVals[idx - 1]) / 2.
        quantBounds.append(bound)
    return quantBounds, maxGain
def FindVarMultQuantBounds(vals, nBounds, results, nPossibleRes):
    """ finds multiple quantization bounds for a single variable

    **Arguments**

      - vals: sequence of variable values (assumed to be floats)

      - nBounds: the number of quantization bounds to find

      - results: a list of result codes (should be integers)

      - nPossibleRes: an integer with the number of possible values of the
        result variable

    **Returns**

      - a 2-tuple containing:

        1) a list of the quantization bounds (floats)

        2) the information gain associated with this quantization

    """
    assert len(vals) == len(results), 'vals/results length mismatch'
    nData = len(vals)
    if nData == 0:
        return [], -1e8

    # order the data by variable value, keeping results alongside
    ordered = list(zip(vals, results))
    ordered.sort()
    sortVals, sortResults = zip(*ordered)

    startNext = _FindStartPoints(sortVals, sortResults, nData)
    if len(startNext) == 0:
        return [0], 0.0
    if len(startNext) < nBounds:
        nBounds = len(startNext) - 1
    nBounds = nBounds if nBounds != 0 else 1

    maxGain, bestCuts = _RecurseOnBounds(sortVals, list(range(nBounds)), 0, startNext,
                                         sortResults, nPossibleRes)

    def _boundFor(cut):
        # translate an index into startNext into an actual boundary value
        idx = startNext[cut]
        if idx == len(sortVals):
            return sortVals[-1]
        if idx == 0:
            return sortVals[idx]
        return (sortVals[idx] + sortVals[idx - 1]) / 2.

    quantBounds = [_boundFor(cut) for cut in bestCuts]
    return quantBounds, maxGain
def TestTree():
    """ testing code for named trees

    Builds an ID3 tree (maxDepth=1) on eight named examples and prints it.
    """
    named = [
        ['p1', 0, 1, 0, 0],
        ['p2', 0, 0, 0, 1],
        ['p3', 0, 0, 1, 2],
        ['p4', 0, 1, 1, 2],
        ['p5', 1, 0, 0, 2],
        ['p6', 1, 0, 1, 2],
        ['p7', 1, 1, 0, 2],
        ['p8', 1, 1, 1, 0],
    ]
    # column 0 is the name and the last column the result code
    lastVar = len(named[0]) - 1
    varIdxs = list(range(1, lastVar))
    valCounts = [0, 2, 2, 2, 3]
    tree = ID3.ID3Boot(named, varIdxs, valCounts, maxDepth=1)
    tree.Print()
def GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode, compatFunc):
    """ builds the symmetric feature-feature "distance" matrix used for merging

    Entries start at 1e8 and are only overwritten for compatible pairs
    (per _compatFunc_) whose value is below the cutoff:

      - MergeMetric.Distance: the squared distance, cutoff mergeTol**2
      - MergeMetric.Overlap: minus the feat-feat score times the second
        feature's weight, cutoff mergeTol
      - MergeMetric.NoMerge: the matrix is returned untouched

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

    _dirMergeMode_ is accepted but not used here.

    Raises ValueError for an unrecognized mergeMetric.
    """
    nFeats = fm.GetNumFeatures()
    dists = [[1e8] * nFeats for _ in range(nFeats)]
    if mergeMetric == MergeMetric.NoMerge:
        return dists

    if mergeMetric == MergeMetric.Distance:
        cutoff = mergeTol * mergeTol

        def measure(ptA, ptB):
            return ptA.GetDist2(ptB)
    elif mergeMetric == MergeMetric.Overlap:
        cutoff = mergeTol

        def measure(ptA, ptB):
            score = fm.GetFeatFeatScore(ptA, ptB, typeMatch=False)
            return score * (-1 * ptB.weight)
    else:
        raise ValueError('unrecognized mergeMetric')

    # visit each unordered pair once and mirror the value
    for i in range(nFeats):
        ptI = fm.GetFeature(i)
        for j in range(i + 1, nFeats):
            ptJ = fm.GetFeature(j)
            if not compatFunc(ptI, ptJ):
                continue
            value = measure(ptI, ptJ)
            if value < cutoff:
                dists[i][j] = value
                dists[j][i] = value
    return dists
def GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode, compatFunc):
    """ returns an nFeatures x nFeatures list-of-lists of merge "distances"

    Every entry defaults to 1e8.  For pairs that _compatFunc_ accepts, the
    entry (and its mirror) is replaced when the pair's value is below the
    cutoff: squared distance against mergeTol**2 for MergeMetric.Distance,
    or the negated, weight-scaled feat-feat score against mergeTol for
    MergeMetric.Overlap.  MergeMetric.NoMerge returns the defaults.

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

    _dirMergeMode_ is not read by this function.

    Raises ValueError for an unrecognized mergeMetric.
    """
    count = fm.GetNumFeatures()
    dists = [[1e8] * count for _ in range(count)]
    if mergeMetric == MergeMetric.NoMerge:
        return dists

    useDistance = mergeMetric == MergeMetric.Distance
    if not useDistance and not mergeMetric == MergeMetric.Overlap:
        raise ValueError('unrecognized mergeMetric')
    limit = mergeTol * mergeTol if useDistance else mergeTol

    for i in range(count):
        first = fm.GetFeature(i)
        for j in range(i + 1, count):
            second = fm.GetFeature(j)
            if not compatFunc(first, second):
                continue
            if useDistance:
                entry = first.GetDist2(second)
            else:
                entry = fm.GetFeatFeatScore(first, second, typeMatch=False)
                entry *= -1 * second.weight
            if entry < limit:
                dists[i][j] = dists[j][i] = entry
    return dists
def TestQuantTree2():
    """ testing code for named trees

    Builds a quantized tree on eight named examples, prints and pickles it,
    then prints each example next to the tree's classification of it.
    """
    rows = [
        ['p1', 0.1, 1, 0.1, 0],
        ['p2', 0.1, 0, 0.1, 1],
        ['p3', 0.1, 0, 1.1, 2],
        ['p4', 0.1, 1, 1.1, 2],
        ['p5', 1.1, 0, 0.1, 2],
        ['p6', 1.1, 0, 1.1, 2],
        ['p7', 1.1, 1, 0.1, 2],
        ['p8', 1.1, 1, 1.1, 0],
    ]
    # variables are the columns between the name and the result code
    varIdxs = list(range(1, len(rows[0]) - 1))
    valCounts = [0, 0, 2, 0, 3]
    bounds = [0, 1, 0, 1, 0]

    tree = QuantTreeBoot(rows, varIdxs, valCounts, bounds)
    tree.Print()
    tree.Pickle('test_data/QuantTree2.pkl')

    for row in rows:
        prediction = tree.ClassifyExample(row)
        print(row, prediction)
def MaxCount(examples):
    """ given a set of examples, returns the most common result code

    **Arguments**

       examples: a list of examples to be counted; the result code is the
         last element of each example

    **Returns**

       the most common result code (ties go to the lowest code)

    **Notes**

      - only codes in the range 0..max(code) are counted

      - an empty _examples_ list raises ValueError (from max())

    """
    from collections import Counter

    resList = [x[-1] for x in examples]
    maxVal = max(resList)
    # one pass over the results instead of one full scan per possible code
    tally = Counter(resList)
    counts = [tally[code] for code in range(maxVal + 1)]
    return numpy.argmax(counts)
def _GenVarTable(vals, cuts, starts, results, nPossibleRes): """ Primarily intended for internal use constructs a variable table for the data passed in The table for a given variable records the number of times each possible value of that variable appears for each possible result of the function. **Arguments** - vals: a 1D Numeric array with the values of the variables - cuts: a list with the indices of the quantization bounds (indices are into _starts_ ) - starts: a list of potential starting points for quantization bounds - results: a 1D Numeric array of integer result codes - nPossibleRes: an integer with the number of possible result codes **Returns** the varTable, a 2D Numeric array which is nVarValues x nPossibleRes **Notes** - _vals_ should be sorted! """ nVals = len(cuts) + 1 varTable = numpy.zeros((nVals, nPossibleRes), 'i') idx = 0 for i in range(nVals - 1): cut = cuts[i] while idx < starts[cut]: varTable[i, results[idx]] += 1 idx += 1 while idx < len(vals): varTable[-1, results[idx]] += 1 idx += 1 return varTable
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs,
             exIndices=None, **kwargs):
    """ finds the variable (and its quantization bounds) with the best gain

    **Arguments**

      - resCodes: result codes for the examples selected by _exIndices_

      - examples: the full list of examples

      - nBoundsPerVar: per variable, the number of quantization bounds to
        find; 0 means the variable is used as is, a negative value means the
        variable is skipped

      - nPossibleRes: the number of possible result codes

      - nPossibleVals: the number of possible values of each variable

      - attrs: the indices of the variables to consider

      - exIndices: (optional) indices into _examples_ to use; defaults to
        all of them

      - randomDescriptors: (optional keyword) if > 0 and smaller than
        len(attrs), only that many randomly chosen attrs are considered

    **Returns**

      a 3-tuple: (best variable index, its gain, its quantization bounds).
      If nothing is selected this is (-1, -1e6, []).

    **Notes**

      - on equal gain, the variable with fewer bounds wins

    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))
    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if 0 < nToTake < len(attrs):
        ids = list(range(len(attrs)))
        # the optional ``random`` argument of shuffle() was removed in
        # Python 3.11
        random.shuffle(ids)
        attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes,
                                                                nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere and len(qBounds) < len(bestBounds):
            # same gain: prefer the simpler split
            best = var
            bestBounds = qBounds

    if best == -1:
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
def _PyRecurseOnBounds(vals, cuts, which, starts, results, nPossibleRes, varTable=None):
    """ Primarily intended for internal use

    Recursively finds the best quantization boundaries

    **Arguments**

      - vals: a 1D Numeric array with the values of the variables,
        this should be sorted

      - cuts: a list with the indices of the quantization bounds
        (indices are into _starts_ ); modified in place

      - which: an integer indicating which bound is being adjusted here
        (and index into _cuts_ )

      - starts: a list of potential starting points for quantization bounds

      - results: a 1D Numeric array of integer result codes

      - nPossibleRes: an integer with the number of possible result codes

    **Returns**

      - a 2-tuple containing:

        1) the best information gain found so far

        2) a list of the quantization bound indices ( _cuts_ for the best case)

    **Notes**

      - this is not even remotely efficient, which is why a C replacement
        was written

    """
    nCuts = len(cuts)
    topGain = -1e6
    topCuts = None
    # highest position this cut may take while leaving room for later ones
    lastAllowed = len(starts) - nCuts + which
    if varTable is None:
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)

    while not cuts[which] > lastAllowed:
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
        gain = entropy.InfoGain(varTable)
        if gain > topGain:
            topGain, topCuts = gain, cuts[:]

        # recurse on the next vars if needed
        if which < nCuts - 1:
            gain, deeperCuts = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results,
                                                nPossibleRes, varTable=varTable)
            if gain > topGain:
                topGain, topCuts = gain, deeperCuts

        # update this cut, pushing later cuts along so none coincide
        cuts[which] += 1
        for later in range(which + 1, nCuts):
            if cuts[later] == cuts[later - 1]:
                cuts[later] += 1

    return topGain, topCuts
def _Pruner(node, level=0):
    """Recursively finds and removes the nodes whose removals improve
    classification

    **Arguments**

      - node: the tree to be pruned.  The pruning data should already be
        contained within node (i.e. node.GetExamples() should return the
        pruning data)

      - level: (optional) the level of recursion, used only in _verbose
        printing

    **Returns**

      the pruned version of node

    **Notes**

      - This uses a greedy algorithm which basically does a DFS traversal of
        the tree, removing nodes whenever possible.

      - If removing a node does not affect the accuracy, it *will be*
        removed.  We favor smaller trees.

      - Children with no examples in the pruning data, and terminal
        children, are left intact.

    """

    def _trace(*parts):
        # debug output, prefixed with indentation and the recursion level
        if _verbose:
            print(' ' * level, '<%d> ' % level, *parts)

    _trace('>>> Pruner')
    children = node.GetChildren()[:]

    bestTree = copy.deepcopy(node)
    bestErr = 1e6
    #
    # Loop over the children of this node, removing them when doing so
    # either improves the local error or leaves it unchanged (we're
    # introducing a bias for simpler trees).
    #
    for i, child in enumerate(children):
        examples = child.GetExamples()
        if _verbose:
            _trace(' Child:', i, child.GetLabel())
            bestTree.Print()
            print()

        if not len(examples):
            # FIX: we need to figure out what to do here (nodes that contain
            # no examples in the testing set).  I can concoct arguments for
            # leaving them in and for removing them.  At the moment they are
            # left intact.
            _trace(' No Examples', len(examples))
            continue
        _trace(' Examples', len(examples))

        if child.GetTerminal():
            _trace(' Terminal')
            continue
        _trace(' Nonterminal')

        workTree = copy.deepcopy(bestTree)
        #
        # First recurse on the child (try removing things below it)
        #
        newNode = _Pruner(child, level=level + 1)
        workTree.ReplaceChildIndex(i, newNode)
        tempErr = _GetLocalError(workTree)
        if tempErr <= bestErr:
            bestErr = tempErr
            bestTree = copy.deepcopy(workTree)
            if _verbose:
                _trace('>->->->->->')
                _trace('replacing:', i, child.GetLabel())
                child.Print()
                _trace('with:')
                newNode.Print()
                _trace('<-<-<-<-<-<')
        else:
            workTree.ReplaceChildIndex(i, child)

        #
        # Now try replacing the child entirely
        #
        bestGuess = MaxCount(child.GetExamples())
        newNode = DecTree.DecTreeNode(workTree, 'L:%d' % (bestGuess), label=bestGuess,
                                      isTerminal=1)
        newNode.SetExamples(child.GetExamples())
        workTree.ReplaceChildIndex(i, newNode)
        if _verbose:
            _trace('ATTEMPT:')
            workTree.Print()
        newErr = _GetLocalError(workTree)
        _trace('---> ', newErr, bestErr)
        if newErr <= bestErr:
            bestErr = newErr
            bestTree = copy.deepcopy(workTree)
            if _verbose:
                _trace('PRUNING:')
                workTree.Print()
        else:
            _trace('FAIL')
            # whoops... put the child back in:
            workTree.ReplaceChildIndex(i, child)

    _trace('<<< out')
    return bestTree
def BuildQuantTree(examples, target, attrs, nPossibleVals, nBoundsPerVar, depth=0, maxDepth=-1,
                   exIndices=None, **kwargs):
    """ recursively builds a quantized decision tree

    **Arguments**

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - target: an int (not read by this function)

      - attrs: a list of ints indicating which variables can be used in
        the tree

      - nPossibleVals: a list containing the number of possible values of
        every variable.

      - nBoundsPerVar: the number of bounds to include for each variable

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
        will be grown

      - exIndices: (optional) indices of the examples to use; defaults to
        all of them

      - recycleVars: (optional keyword) if true, the chosen variable stays
        available to the subtrees

    **Returns**

      a QuantTree.QuantTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
      use _QuantTreeBoot_ (below) for that.

    """
    tree = QuantTree.QuantTreeNode(None, 'node')
    tree.SetData(-666)
    nPossibleRes = nPossibleVals[-1]

    if exIndices is None:
        exIndices = list(range(len(examples)))

    # counts of each result code:
    resCodes = [int(examples[y][-1]) for y in exIndices]
    counts = [0] * nPossibleRes
    for code in resCodes:
        counts[code] += 1
    nzCounts = numpy.nonzero(counts)[0]

    if len(nzCounts) == 1:
        # bottomed out because there is only one result code left with any
        # counts (i.e. there's only one type of example left... this is
        # GOOD!).
        only = nzCounts[0]
        tree.SetLabel(only)
        tree.SetName(str(only))
        tree.SetTerminal(1)
        return tree

    if len(attrs) == 0 or (maxDepth >= 0 and depth > maxDepth):
        # Bottomed out: no variables left or max depth hit.  We don't really
        # know what to do here, so use the heuristic of picking the most
        # prevalent result
        v = numpy.argmax(counts)
        tree.SetLabel(v)
        tree.SetName('%d?' % v)
        tree.SetTerminal(1)
        return tree

    # find the variable which gives us the largest information gain
    best, _, bestBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                   nPossibleVals, attrs, exIndices=exIndices, **kwargs)

    # remove that variable from the lists of possible variables
    nextAttrs = attrs[:]
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    # set some info at this node
    tree.SetName('Var: %d' % (best))
    tree.SetLabel(best)
    tree.SetQuantBounds(bestBounds)
    tree.SetTerminal(0)

    def _addBranch(subset):
        # an empty subset gets a majority-vote leaf; this can (and does)
        # happen.  Anything else gets a recursively built subtree.
        if len(subset) == 0:
            v = numpy.argmax(counts)
            tree.AddChild('%d' % v, label=v, data=0.0, isTerminal=1)
        else:
            tree.AddChildNode(
                BuildQuantTree(examples, best, nextAttrs, nPossibleVals, nBoundsPerVar,
                               depth=depth + 1, maxDepth=maxDepth, exIndices=subset, **kwargs))

    # loop over possible values of the new variable and build a subtree for
    # each one
    indices = exIndices[:]
    if len(bestBounds) > 0:
        for bound in bestBounds:
            below, remaining = [], []
            for index in indices:
                if examples[index][best] < bound:
                    below.append(index)
                else:
                    remaining.append(index)
            indices = remaining
            _addBranch(below)
        # add the last points remaining
        _addBranch(list(indices))
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([idx for idx in exIndices if examples[idx][best] == val])
    return tree
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically choose the
    first variable in the tree (the standard greedy approach).  Otherwise,
    _initialVar_ will be used as the first split.

    **Arguments**

      - examples: a list of lists of variable values + instance values

      - attrs: the variables that can be used in the tree; variables whose
        entry in _nBoundsPerVar_ is -1 are removed from a local copy

      - nPossibleVals: the number of possible values of every variable
        (the last entry is the number of result codes)

      - nBoundsPerVar: the number of bounds to include for each variable

      - initialVar: (optional) the variable to use for the first split

      - maxDepth: (optional) passed on to BuildQuantTree

      - recycleVars: (optional keyword) if true, the root variable stays
        available to the subtrees

    **Returns**

      a QuantTree.QuantTreeNode with the decision tree

    """
    attrs = list(attrs)
    for i, nBounds in enumerate(nBoundsPerVar):
        if nBounds == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                           nPossibleVals, attrs, **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # must be a real list: FindVarMultQuantBounds takes len() of it,
            # which fails on the iterator map() returns under Python 3
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best],
                                                                resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)

    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(subset):
        # a non-empty subset gets a subtree, an empty one a majority-vote leaf
        if len(subset):
            tree.AddChildNode(
                BuildQuantTree(subset, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                               maxDepth=maxDepth, **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        remaining = [examples[index] for index in range(len(examples))]
        for bound in qBounds:
            # peel off everything below this bound, in original order
            below, rest = [], []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    rest.append(ex)
            remaining = rest
            _addBranch(below)
        # add the last points remaining
        _addBranch(remaining)
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([example for example in examples if example[best] == val])
    return tree
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs,
             exIndices=None, **kwargs):
    """ picks the variable with the largest information gain

    **Arguments**

      - resCodes: result codes for the examples selected by _exIndices_

      - examples: the full list of examples

      - nBoundsPerVar: per variable, the number of quantization bounds to
        find; 0 means the variable is used as is, a negative value means the
        variable is skipped

      - nPossibleRes: the number of possible result codes

      - nPossibleVals: the number of possible values of each variable

      - attrs: the indices of the variables to consider

      - exIndices: (optional) indices into _examples_ to use; defaults to
        all of them

      - randomDescriptors: (optional keyword) if > 0 and smaller than
        len(attrs), only that many randomly chosen attrs are considered

    **Returns**

      a 3-tuple: (best variable index, its gain, its quantization bounds).
      If nothing is selected this is (-1, -1e6, []).

    **Notes**

      - on equal gain, the variable with fewer bounds wins

    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))
    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if nToTake > 0:
        nAttrs = len(attrs)
        if nToTake < nAttrs:
            ids = list(range(nAttrs))
            # shuffle(x, random=...) no longer exists as of Python 3.11
            random.shuffle(ids)
            attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes,
                                                                nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

        if gainHere > bestGain:
            best, bestGain, bestBounds = var, gainHere, qBounds
        elif bestGain == gainHere and len(qBounds) < len(bestBounds):
            # same gain: prefer the simpler split
            best, bestBounds = var, qBounds

    if best == -1:
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
def MergeFeatPoints(fm, mergeMetric=MergeMetric.NoMerge, mergeTol=1.5,
                    dirMergeMode=DirMergeMode.NoMerge, mergeMethod=MergeMethod.WeightedAverage,
                    compatFunc=familiesMatch):
    """ merges features in a feature map that are mutual nearest neighbors

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

    **Arguments**

      - fm: the feature map; modified in place

      - mergeMetric: how "distances" between features are measured;
        MergeMetric.NoMerge returns immediately

      - mergeTol: the merging tolerance (see the note above)

      - dirMergeMode: passed through to GetFeatFeatDistMatrix

      - mergeMethod: how position and weight of a merged feature are
        computed (WeightedAverage, Average or UseLarger)

      - compatFunc: called with two features, returns whether they may be
        merged

    **Returns**

      whether or not any points were actually merged

    **Raises**

      ValueError for an unrecognized _mergeMethod_

    **Notes**

      - when two features merge, the first keeps the merged position and
        weight and the second is dropped from the map at the end

    """
    res = False
    if mergeMetric == MergeMetric.NoMerge:
        return res

    dists = GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode, compatFunc)

    # NOTE(review): for MergeMetric.Distance the matrix holds *squared*
    # distances, but they are filtered against mergeTol (not mergeTol**2)
    # here -- confirm that this is intended.
    distOrders = [
        sorted((dist, j) for j, dist in enumerate(distV) if dist < mergeTol) for distV in dists
    ]

    # we now know the "distances" and have rank-ordered list of
    # each point's neighbors. Work with that.

    # progressively merge nearest neighbors until there
    # are no more points left to merge
    featsInPlay = list(range(fm.GetNumFeatures()))
    featsToRemove = []
    while featsInPlay:
        # find two features who are mutual nearest neighbors:
        mergeThem = False
        for fi in featsInPlay[:]:
            mergeThem = False
            if not distOrders[fi]:
                # no neighbors (left): this feature is done
                featsInPlay.remove(fi)
                continue
            dist, nbr = distOrders[fi][0]
            if nbr not in featsInPlay:
                continue
            if distOrders[nbr][0][1] == fi:
                mergeThem = True
            elif feq(distOrders[nbr][0][0], dist):
                # it may be that there are several points at about the same
                # distance, check for that now
                for distJ, nbrJ in distOrders[nbr][1:]:
                    if not feq(dist, distJ):
                        break
                    if nbrJ == fi:
                        mergeThem = True
                        break
            if mergeThem:
                break

        if not mergeThem:
            # nothing found, abort
            break

        res = True
        featI = fm.GetFeature(fi)
        nbrFeat = fm.GetFeature(nbr)

        if mergeMethod == MergeMethod.WeightedAverage:
            newPos = featI.GetPos() * featI.weight + nbrFeat.GetPos() * nbrFeat.weight
            newPos /= (featI.weight + nbrFeat.weight)
            newWeight = (featI.weight + nbrFeat.weight) / 2
        elif mergeMethod == MergeMethod.Average:
            newPos = featI.GetPos() + nbrFeat.GetPos()
            newPos /= 2
            newWeight = (featI.weight + nbrFeat.weight) / 2
        elif mergeMethod == MergeMethod.UseLarger:
            if featI.weight > nbrFeat.weight:
                newPos = featI.GetPos()
                newWeight = featI.weight
            else:
                newPos = nbrFeat.GetPos()
                newWeight = nbrFeat.weight
        else:
            raise ValueError("bad mergeMethod")

        featI.SetPos(newPos)
        featI.weight = newWeight

        # nbr and fi are no longer valid targets:
        featsToRemove.append(nbr)
        featsInPlay.remove(fi)
        featsInPlay.remove(nbr)
        # The neighbor lists hold (dist, index) tuples, so entries have to be
        # matched on the index; calling list.remove() with the bare index
        # never matched anything and left stale neighbors behind.
        gone = (fi, nbr)
        for nbrList in distOrders:
            nbrList[:] = [entry for entry in nbrList if entry[1] not in gone]

    # drop in ascending order; each earlier drop shifts later indices by one
    featsToRemove.sort()
    for i, fIdx in enumerate(featsToRemove):
        fm.DropFeature(fIdx - i)
    return res
def FindBRICSBonds(mol, randomizeOrder=False, silent=True):
    """ returns the bonds in a molecule that BRICS would cleave

    Each result is a 2-tuple: the pair of atom indices of the bond, and the
    pair of environment labels (letters stripped) that matched at each end.
    A bond is reported at most once, whichever direction it is matched in.
    _silent_ is accepted but not used.

    >>> from rdkit import Chem
    >>> m = Chem.MolFromSmiles('CCCOCC')
    >>> res = list(FindBRICSBonds(m))
    >>> res
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4'))]

    a more complicated case:

    >>> m = Chem.MolFromSmiles('CCCOCCC(=O)c1ccccc1')
    >>> res = list(FindBRICSBonds(m))
    >>> res
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

    we can also randomize the order of the results:

    >>> random.seed(23)
    >>> res = list(FindBRICSBonds(m,randomizeOrder=True))
    >>> sorted(res)
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

    Note that this is a generator function :

    >>> res = FindBRICSBonds(m)
    >>> res
    <generator object ...>
    >>> next(res)
    ((3, 2), ('3', '4'))

    >>> m = Chem.MolFromSmiles('CC=CC')
    >>> res = list(FindBRICSBonds(m))
    >>> sorted(res)
    [((1, 2), ('7', '7'))]

    make sure we don't match ring bonds:

    >>> m = Chem.MolFromSmiles('O=C1NCCC1')
    >>> list(FindBRICSBonds(m))
    []

    another nice one, make sure environment 8 doesn't match something
    connected to a ring atom:

    >>> m = Chem.MolFromSmiles('CC1(C)CCCCC1')
    >>> list(FindBRICSBonds(m))
    []

    """
    letter = re.compile('[a-z,A-Z]')
    indices = list(range(len(bondMatchers)))
    bondsDone = set()
    if randomizeOrder:
        # the optional ``random`` argument of shuffle() was removed in
        # Python 3.11
        random.shuffle(indices)

    # test each environment once up front so that bond patterns whose
    # environments are absent can be skipped cheaply
    envMatches = {env: mol.HasSubstructMatch(patt) for env, patt in environMatchers.items()}

    for gpIdx in indices:
        if randomizeOrder:
            # shuffle a copy so the module-level table keeps its order
            compats = bondMatchers[gpIdx][:]
            random.shuffle(compats)
        else:
            compats = bondMatchers[gpIdx]
        for i1, i2, bType, patt in compats:
            if not envMatches['L' + i1] or not envMatches['L' + i2]:
                continue
            matches = mol.GetSubstructMatches(patt)
            i1 = letter.sub('', i1)
            i2 = letter.sub('', i2)
            for match in matches:
                if match not in bondsDone and (match[1], match[0]) not in bondsDone:
                    bondsDone.add(match)
                    yield (((match[0], match[1]), (i1, i2)))
def _Pruner(node, level=0):
    """Greedy, depth-first pruning of the tree rooted at _node_

    **Arguments**

      - node: the tree to be pruned.  The pruning data is read from the
        tree itself: every child is asked for it via GetExamples()

      - level: (optional) the recursion depth, only used to prefix the
        diagnostics printed when _verbose is set

    **Returns**

      a pruned deep copy of _node_

    **Notes**

      - For every nonterminal child that holds examples two replacements
        are tried in turn: the recursively pruned child, then a terminal
        node labelled with MaxCount() of the child's examples.

      - A replacement is kept when the local error is no worse than the
        best error seen so far, so ties go to the smaller tree.

      - Terminal children and children without examples are left alone.

    """

    def _trace(*parts):
        # every diagnostic line carries the same indentation/level prefix
        if _verbose:
            print(' ' * level, '<%d> ' % level, *parts)

    _trace('>>> Pruner')
    kids = node.GetChildren()[:]
    bestTree = copy.deepcopy(node)
    bestErr = 1e6

    for pos, kid in enumerate(kids):
        kidExamples = kid.GetExamples()
        if _verbose:
            _trace(' Child:', pos, kid.GetLabel())
            bestTree.Print()
            print()

        if not len(kidExamples):
            _trace(' No Examples', len(kidExamples))
            #
            # FIX: we need to figure out what to do here (nodes that contain
            #   no examples in the testing set).  I can concoct arguments for
            #   leaving them in and for removing them.  At the moment they are
            #   left intact.
            #
            continue

        _trace(' Examples', len(kidExamples))
        if kid.GetTerminal():
            _trace(' Terminal')
            continue
        _trace(' Nonterminal')

        trial = copy.deepcopy(bestTree)

        # step 1: swap in the recursively pruned version of the child
        prunedKid = _Pruner(kid, level=level + 1)
        trial.ReplaceChildIndex(pos, prunedKid)
        trialErr = _GetLocalError(trial)
        if trialErr <= bestErr:
            bestErr = trialErr
            bestTree = copy.deepcopy(trial)
            if _verbose:
                _trace('>->->->->->')
                _trace('replacing:', pos, kid.GetLabel())
                kid.Print()
                _trace('with:')
                prunedKid.Print()
                _trace('<-<-<-<-<-<')
        else:
            trial.ReplaceChildIndex(pos, kid)

        # step 2: swap the child for a single terminal node
        majority = MaxCount(kid.GetExamples())
        leaf = DecTree.DecTreeNode(trial, 'L:%d' % (majority), label=majority, isTerminal=1)
        leaf.SetExamples(kid.GetExamples())
        trial.ReplaceChildIndex(pos, leaf)
        if _verbose:
            _trace('ATTEMPT:')
            trial.Print()
        leafErr = _GetLocalError(trial)
        _trace('---> ', leafErr, bestErr)
        if leafErr <= bestErr:
            bestErr = leafErr
            bestTree = copy.deepcopy(trial)
            if _verbose:
                _trace('PRUNING:')
                trial.Print()
        else:
            _trace('FAIL')
            # the terminal node made things worse: restore the child
            trial.ReplaceChildIndex(pos, kid)

    _trace('<<< out')
    return bestTree
def test11(self):
    # test coordinate preservation:
    # Checks that the fragments obtained from BreakBRICSBonds() carry the
    # atom positions of the parent molecule, first for a molecule with a
    # single (3D) conformer and then for one with two conformers.
    #
    # NOTE(review): the whitespace inside the two mol blocks below was
    # re-flowed to the fixed-width V2000 column layout; the field values
    # are unchanged - confirm the layout against the upstream test file.
    molblock = """
     RDKit          3D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2004    0.5900    0.6110 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.2328    1.3173    0.0343 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.4299    0.6533   -0.1500 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.3633   -0.7217   -0.3299 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.1552   -1.3791   -0.2207 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.1425   -0.7969    0.5335 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.1458   -1.4244    0.4108 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2976   -0.7398   -0.1026 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.4889   -0.7939    0.5501 N   0  0  0  0  0  0  0  0  0  0  0  0
    3.4615    0.1460    0.3535 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.0116    1.4034   -0.0296 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.9786    1.4264   -0.9435 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.1399    0.3193   -0.9885 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
    m = Chem.MolFromMolBlock(molblock)
    pieces = BreakBRICSBonds(m)
    frags = Chem.GetMolFrags(pieces, asMols=True)
    # three fragments with 7, 3 and 7 atoms are expected
    self.assertEqual(len(frags), 3)
    self.assertEqual(frags[0].GetNumAtoms(), 7)
    self.assertEqual(frags[1].GetNumAtoms(), 3)
    self.assertEqual(frags[2].GetNumAtoms(), 7)
    c1 = m.GetConformer()
    # fragment 0: its atoms 0-5 must coincide with parent atoms 0-5 ...
    c2 = frags[0].GetConformer()
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    # ... and its atom 6 with parent atom 6
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    # fragment 2: its atoms 0-5 are compared with parent atoms 7-12,
    # its atom 6 with parent atom 6
    c2 = frags[2].GetConformer()
    for i in range(6):
        p1 = c1.GetAtomPosition(i + 7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    # fragment 1: atom 0 is compared with parent atom 6 (range(1) only
    # visits i == 0), atom 1 with parent atom 5, then atom 0 once more
    c2 = frags[1].GetConformer()
    for i in range(1):
        p1 = c1.GetAtomPosition(i + 6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1 - p2).Length(), 0.0)

    # make sure multiple conformations (include 2D) also work:
    molblock = """
     RDKit          2D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2990   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000    1.3846    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990   -0.8654    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
    m2 = Chem.MolFromMolBlock(molblock)
    # attach the coordinates of m2 to m as an additional conformer
    m.AddConformer(m2.GetConformer(), assignId=True)
    self.assertEqual(m.GetNumConformers(), 2)

    pieces = BreakBRICSBonds(m)
    frags = Chem.GetMolFrags(pieces, asMols=True)
    self.assertEqual(len(frags), 3)
    self.assertEqual(frags[0].GetNumAtoms(), 7)
    self.assertEqual(frags[1].GetNumAtoms(), 3)
    self.assertEqual(frags[2].GetNumAtoms(), 7)
    # every fragment must carry both conformers
    self.assertEqual(frags[0].GetNumConformers(), 2)
    self.assertEqual(frags[1].GetNumConformers(), 2)
    self.assertEqual(frags[2].GetNumConformers(), 2)

    # conformer 0: same atom pairs as in the single-conformer checks
    c1 = m.GetConformer(0)
    c2 = frags[0].GetConformer(0)
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    c2 = frags[2].GetConformer(0)
    for i in range(6):
        p1 = c1.GetAtomPosition(i + 7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    c2 = frags[1].GetConformer(0)
    for i in range(1):
        p1 = c1.GetAtomPosition(i + 6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1 - p2).Length(), 0.0)

    # conformer 1: the same comparisons again
    c1 = m.GetConformer(1)
    c2 = frags[0].GetConformer(1)
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    c2 = frags[2].GetConformer(1)
    for i in range(6):
        p1 = c1.GetAtomPosition(i + 7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1 - p2).Length(), 0.0)
    c2 = frags[1].GetConformer(1)
    for i in range(1):
        p1 = c1.GetAtomPosition(i + 6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1 - p2).Length(), 0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1 - p2).Length(), 0.0)
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy
    approach).  Otherwise, _initialVar_ will be used as the first
    split.

    **Arguments**

      - examples: the training examples; the last entry of each example
        is converted to an int and used as its result code

      - attrs: the indices of the variables that may be split on

      - nPossibleVals: the number of possible values of each variable;
        the last entry is taken as the number of possible result codes

      - nBoundsPerVar: per variable: -1 removes the variable from
        _attrs_; for _initialVar_, a positive value requests that many
        quantization bounds and 0 means the variable is split on its
        discrete values

      - initialVar: (optional) index of the variable to use for the
        first split

      - maxDepth: (optional) passed through to BuildQuantTree()

      - kwargs: passed through to FindBest() and BuildQuantTree();
        unless 'recycleVars' is set the chosen variable is not offered
        to the subtrees again

    **Returns**

      the root QuantTreeNode of the tree

    """
    attrs = list(attrs)
    for i in range(len(nBoundsPerVar)):
        if nBoundsPerVar[i] == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                           nPossibleVals, attrs, **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # a real list is needed here: on Python 3 map() is a lazy
            # iterator that has no len() and can be consumed only once
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best],
                                                                resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addChild(subset):
        # grow a subtree from the examples of one branch; an empty branch
        # gets a terminal node predicting the overall most common result
        if len(subset) != 0:
            tree.AddChildNode(
                BuildQuantTree(subset, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                               maxDepth=maxDepth, **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # quantized split: each bound takes the not yet assigned examples
        # lying below it, in their original order
        remaining = list(examples)
        for bound in qBounds:
            below = []
            rest = []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    rest.append(ex)
            remaining = rest
            _addChild(below)
        # add the last points remaining
        _addChild(remaining)
    else:
        # discrete split: one branch per possible value of the variable
        for val in range(nPossibleVals[best]):
            _addChild([example for example in examples if example[best] == val])
    return tree
def FindBRICSBonds(mol, randomizeOrder=False, silent=True):
    """ returns the bonds in a molecule that BRICS would cleave

    **Arguments**

      - mol: the molecule to examine; it must provide HasSubstructMatch()
        and GetSubstructMatches()

      - randomizeOrder: (optional) if set, both the order in which the
        groups of bond matchers are visited and the order of the matchers
        within each group are shuffled with the `random` module

      - silent: (optional) accepted for interface compatibility; it is
        not read anywhere in this function

    **Returns**

      a generator of ((beginAtomIdx, endAtomIdx), (envLabel1, envLabel2))
      tuples, where the labels are the environment names with everything
      matched by '[a-z,A-Z]' stripped out

    >>> from rdkit import Chem
    >>> m = Chem.MolFromSmiles('CCCOCC')
    >>> res = list(FindBRICSBonds(m))
    >>> res
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4'))]

    a more complicated case:

    >>> m = Chem.MolFromSmiles('CCCOCCC(=O)c1ccccc1')
    >>> res = list(FindBRICSBonds(m))
    >>> res
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

    we can also randomize the order of the results:

    >>> random.seed(23)
    >>> res = list(FindBRICSBonds(m,randomizeOrder=True))
    >>> sorted(res)
    [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

    Note that this is a generator function :

    >>> res = FindBRICSBonds(m)
    >>> res
    <generator object ...>
    >>> next(res)
    ((3, 2), ('3', '4'))

    >>> m = Chem.MolFromSmiles('CC=CC')
    >>> res = list(FindBRICSBonds(m))
    >>> sorted(res)
    [((1, 2), ('7', '7'))]

    make sure we don't match ring bonds:

    >>> m = Chem.MolFromSmiles('O=C1NCCC1')
    >>> list(FindBRICSBonds(m))
    []

    another nice one, make sure environment 8 doesn't match something connected
    to a ring atom:

    >>> m = Chem.MolFromSmiles('CC1(C)CCCCC1')
    >>> list(FindBRICSBonds(m))
    []

    """
    letter = re.compile('[a-z,A-Z]')
    indices = list(range(len(bondMatchers)))
    bondsDone = set()
    if randomizeOrder:
        # NOTE: the `random=` keyword of random.shuffle() was removed in
        # Python 3.11; the default generator is random.random anyway.
        random.shuffle(indices)

    # find out once which environments occur in the molecule at all, so
    # that bond patterns needing an absent environment can be skipped
    envMatches = {}
    for env, patt in iteritems(environMatchers):
        envMatches[env] = mol.HasSubstructMatch(patt)

    for gpIdx in indices:
        if randomizeOrder:
            # shuffle a copy so the module-level table keeps its order
            compats = bondMatchers[gpIdx][:]
            random.shuffle(compats)
        else:
            compats = bondMatchers[gpIdx]
        for i1, i2, bType, patt in compats:
            if not envMatches['L' + i1] or not envMatches['L' + i2]:
                continue
            matches = mol.GetSubstructMatches(patt)
            i1 = letter.sub('', i1)
            i2 = letter.sub('', i2)
            for match in matches:
                # report every bond only once, whichever direction it was
                # matched in
                if match not in bondsDone and (match[1], match[0]) not in bondsDone:
                    bondsDone.add(match)
                    yield (((match[0], match[1]), (i1, i2)))
def MergeFeatPoints(fm, mergeMetric=MergeMetric.NoMerge, mergeTol=1.5,
                    dirMergeMode=DirMergeMode.NoMerge,
                    mergeMethod=MergeMethod.WeightedAverage, compatFunc=familiesMatch):
    """ Merges features of a feature map that are mutual nearest neighbors

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.  Within this
    function a pair is a merge candidate when its entry in the matrix
    returned by GetFeatFeatDistMatrix() is strictly below mergeTol.

    **Arguments**

      - fm: the feature map, modified in place; it is accessed through
        GetNumFeatures(), GetFeature() and DropFeature()

      - mergeMetric: the metric handed to GetFeatFeatDistMatrix();
        MergeMetric.NoMerge returns immediately

      - mergeTol: the tolerance described above

      - dirMergeMode, compatFunc: passed through to
        GetFeatFeatDistMatrix()

      - mergeMethod: how position and weight of a merged feature are
        computed:
          - WeightedAverage: weight-averaged position, mean weight
          - Average: mean position, mean weight
          - UseLarger: position and weight of the heavier feature (the
            neighbor wins ties)

    **Returns**

      whether or not any points were actually merged

    **Raises**

      ValueError if mergeMethod is not one of the values listed above

    """
    res = False
    if mergeMetric == MergeMetric.NoMerge:
        return res
    dists = GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode, compatFunc)

    # for every feature: the (dist, index) pairs of all features closer
    # than mergeTol, nearest first
    distOrders = [
        sorted((dist, j) for j, dist in enumerate(dists[i]) if dist < mergeTol)
        for i in range(len(dists))
    ]

    # we now know the "distances" and have rank-ordered list of
    # each point's neighbors. Work with that.

    # progressively merge nearest neighbors until there
    # are no more points left to merge
    featsInPlay = list(range(fm.GetNumFeatures()))
    featsToRemove = []
    while featsInPlay:
        # find two features who are mutual nearest neighbors:
        fipCopy = featsInPlay[:]
        for fi in fipCopy:
            mergeThem = False
            if not distOrders[fi]:
                # nothing left within tolerance: fi can never merge
                featsInPlay.remove(fi)
                continue
            dist, nbr = distOrders[fi][0]
            if nbr not in featsInPlay:
                continue
            if distOrders[nbr][0][1] == fi:
                mergeThem = True
            else:
                # it may be that there are several points at about the same
                # distance, check for that now
                if feq(distOrders[nbr][0][0], dist):
                    for distJ, nbrJ in distOrders[nbr][1:]:
                        if feq(dist, distJ):
                            if nbrJ == fi:
                                mergeThem = True
                                break
                        else:
                            # the list is sorted: no further ties possible
                            break
            if mergeThem:
                break

        if mergeThem:
            res = True
            featI = fm.GetFeature(fi)
            nbrFeat = fm.GetFeature(nbr)

            if mergeMethod == MergeMethod.WeightedAverage:
                newPos = featI.GetPos() * featI.weight + nbrFeat.GetPos() * nbrFeat.weight
                newPos /= (featI.weight + nbrFeat.weight)
                newWeight = (featI.weight + nbrFeat.weight) / 2
            elif mergeMethod == MergeMethod.Average:
                newPos = featI.GetPos() + nbrFeat.GetPos()
                newPos /= 2
                newWeight = (featI.weight + nbrFeat.weight) / 2
            elif mergeMethod == MergeMethod.UseLarger:
                if featI.weight > nbrFeat.weight:
                    newPos = featI.GetPos()
                    newWeight = featI.weight
                else:
                    newPos = nbrFeat.GetPos()
                    newWeight = nbrFeat.weight
            else:
                raise ValueError("bad mergeMethod")

            # the merged feature lives on in fi; nbr is dropped at the end
            featI.SetPos(newPos)
            featI.weight = newWeight

            # nbr and fi are no longer valid targets:
            featsToRemove.append(nbr)
            featsInPlay.remove(fi)
            featsInPlay.remove(nbr)
            # The neighbor lists hold (dist, index) tuples, so the entries
            # have to be selected by their index; calling list.remove() with
            # the bare index never matches anything and would leave the
            # stale neighbors in place.
            merged = (fi, nbr)
            for nbrList in distOrders:
                nbrList[:] = [entry for entry in nbrList if entry[1] not in merged]
        else:
            # nothing found, abort
            break

    # drop in ascending order; every earlier drop shifts the indices of
    # the remaining features down by one
    featsToRemove.sort()
    for i, fIdx in enumerate(featsToRemove):
        fm.DropFeature(fIdx - i)
    return res
def test11(self):
    # test coordinate preservation:
    # Checks that the fragments obtained from BreakBRICSBonds() carry the
    # atom positions of the parent molecule, first for a molecule with a
    # single (3D) conformer and then for one with two conformers.
    #
    # NOTE(review): the whitespace inside the two mol blocks below was
    # re-flowed to the fixed-width V2000 column layout; the field values
    # are unchanged - confirm the layout against the upstream test file.
    molblock="""
     RDKit          3D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2004    0.5900    0.6110 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.2328    1.3173    0.0343 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.4299    0.6533   -0.1500 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.3633   -0.7217   -0.3299 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.1552   -1.3791   -0.2207 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.1425   -0.7969    0.5335 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.1458   -1.4244    0.4108 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2976   -0.7398   -0.1026 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.4889   -0.7939    0.5501 N   0  0  0  0  0  0  0  0  0  0  0  0
    3.4615    0.1460    0.3535 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.0116    1.4034   -0.0296 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.9786    1.4264   -0.9435 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.1399    0.3193   -0.9885 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
    m = Chem.MolFromMolBlock(molblock)
    pieces = BreakBRICSBonds(m)
    frags = Chem.GetMolFrags(pieces,asMols=True)
    # three fragments with 7, 3 and 7 atoms are expected
    self.assertEqual(len(frags),3)
    self.assertEqual(frags[0].GetNumAtoms(),7)
    self.assertEqual(frags[1].GetNumAtoms(),3)
    self.assertEqual(frags[2].GetNumAtoms(),7)
    c1 = m.GetConformer()
    # fragment 0: its atoms 0-5 must coincide with parent atoms 0-5 ...
    c2 = frags[0].GetConformer()
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    # ... and its atom 6 with parent atom 6
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    # fragment 2: its atoms 0-5 are compared with parent atoms 7-12,
    # its atom 6 with parent atom 6
    c2 = frags[2].GetConformer()
    for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    # fragment 1: atom 0 is compared with parent atom 6 (range(1) only
    # visits i == 0), atom 1 with parent atom 5, then atom 0 once more
    c2 = frags[1].GetConformer()
    for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1-p2).Length(),0.0)

    # make sure multiple conformations (include 2D) also work:
    molblock="""
     RDKit          2D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2990   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000    1.3846    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990   -0.8654    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
    m2 = Chem.MolFromMolBlock(molblock)
    # attach the coordinates of m2 to m as an additional conformer
    m.AddConformer(m2.GetConformer(),assignId=True)
    self.assertEqual(m.GetNumConformers(),2)

    pieces = BreakBRICSBonds(m)
    frags = Chem.GetMolFrags(pieces,asMols=True)
    self.assertEqual(len(frags),3)
    self.assertEqual(frags[0].GetNumAtoms(),7)
    self.assertEqual(frags[1].GetNumAtoms(),3)
    self.assertEqual(frags[2].GetNumAtoms(),7)
    # every fragment must carry both conformers
    self.assertEqual(frags[0].GetNumConformers(),2)
    self.assertEqual(frags[1].GetNumConformers(),2)
    self.assertEqual(frags[2].GetNumConformers(),2)

    # conformer 0: same atom pairs as in the single-conformer checks
    c1 = m.GetConformer(0)
    c2 = frags[0].GetConformer(0)
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    c2 = frags[2].GetConformer(0)
    for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    c2 = frags[1].GetConformer(0)
    for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1-p2).Length(),0.0)

    # conformer 1: the same comparisons again
    c1 = m.GetConformer(1)
    c2 = frags[0].GetConformer(1)
    for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    c2 = frags[2].GetConformer(1)
    for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(6)
    self.assertEqual((p1-p2).Length(),0.0)
    c2 = frags[1].GetConformer(1)
    for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(5)
    p2 = c2.GetAtomPosition(1)
    self.assertEqual((p1-p2).Length(),0.0)
    p1 = c1.GetAtomPosition(6)
    p2 = c2.GetAtomPosition(0)
    self.assertEqual((p1-p2).Length(),0.0)