def CalcGains(suppl, catalog, topN=-1, actName='', acts=None, nActs=2, reportFreq=10,
              biasList=None, collectFps=0):
  """ Calculates info gains for the catalog's bits by fingerprinting each molecule.

  **Arguments**

    - suppl: a sequence (or supplier) of molecules

    - catalog: the FragmentCatalog whose bits are to be scored

    - topN: (optional) number of top-ranked bits to return; a negative value
      (the default) means all bits

    - actName: (optional) name of the molecule property holding the integer
      activity.  If neither actName nor acts is provided, the last property
      name of the first molecule in suppl is used.

    - acts: (optional) an explicit sequence of integer activities, parallel to
      suppl.  When provided, actName is ignored.

    - nActs: (optional) the number of possible activity values (default: 2)

    - reportFreq: (optional) progress is reported every reportFreq molecules

    - biasList: (optional) when provided, a BIASENTROPY ranker is used with
      this bias list; otherwise a plain ENTROPY ranker is used

    - collectFps: (optional) if nonzero, the generated fingerprints are
      collected and returned

  **Returns** a 2-tuple:

     1) gains matrix (the ranker's GetTopN result)

     2) list of fingerprints (empty unless collectFps is set)

  """
  nBits = catalog.GetFPLength()
  if topN < 0:
    topN = nBits
  if not actName and not acts:
    actName = suppl[0].GetPropNames()[-1]
  # a streaming supplier may not know its length up front
  if hasattr(suppl, '__len__'):
    nMols = len(suppl)
  else:
    nMols = -1
  fpgen = FragmentCatalog.FragFPGenerator()
  if biasList:
    ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
    ranker.SetBiasList(biasList)
  else:
    ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)
  fps = []
  for i, mol in enumerate(suppl):
    if not acts:
      try:
        act = int(mol.GetProp(actName))
      except KeyError:
        message('ERROR: Molecule has no property: %s\n' % (actName))
        message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
        raise KeyError(actName)
    else:
      act = acts[i]
    if i and not i % reportFreq:
      if nMols > 0:
        message('Done %d of %d.\n' % (i, nMols))
      else:
        message('Done %d.\n' % (i))
    fp = fpgen.GetFPForMol(mol, catalog)
    ranker.AccumulateVotes(fp, act)
    if collectFps:
      fps.append(fp)
  gains = ranker.GetTopN(topN)
  return gains, fps
def CalcGainsFromFps(suppl, fps, topN=-1, actName='', acts=None, nActs=2, reportFreq=10,
                     biasList=None):
  """ Calculates info gains from a pre-computed set of fingerprints.

  **Arguments**

    - suppl: a sequence (or supplier) of molecules, parallel to fps

    - fps: the sequence of fingerprints to score

    - topN: (optional) number of top-ranked bits to return; a negative value
      (the default) means all bits

    - actName: (optional) name of the molecule property holding the integer
      activity.  If neither actName nor acts is provided, the last property
      name of the first molecule in suppl is used.

    - acts: (optional) an explicit sequence of integer activities, parallel to
      suppl.  When provided, actName is ignored.

    - nActs: (optional) the number of possible activity values (default: 2)

    - reportFreq: (optional) progress is reported every reportFreq molecules

    - biasList: (optional) when provided, a BIASENTROPY ranker is used with
      this bias list; otherwise a plain ENTROPY ranker is used

  **Returns** the gains matrix (the ranker's GetTopN result)

  """
  nBits = len(fps[0])
  if topN < 0:
    topN = nBits
  if not actName and not acts:
    actName = suppl[0].GetPropNames()[-1]
  # a streaming supplier may not know its length up front
  if hasattr(suppl, '__len__'):
    nMols = len(suppl)
  else:
    nMols = -1
  if biasList:
    ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
    ranker.SetBiasList(biasList)
  else:
    ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)
  for i, mol in enumerate(suppl):
    if not acts:
      try:
        act = int(mol.GetProp(actName))
      except KeyError:
        message('ERROR: Molecule has no property: %s\n' % (actName))
        message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
        # Python 3 raise form; the original "raise KeyError, actName" is a
        # Python 2-only syntax and a SyntaxError under Python 3.
        raise KeyError(actName)
    else:
      act = acts[i]
    if i and not i % reportFreq:
      if nMols > 0:
        message('Done %d of %d.\n' % (i, nMols))
      else:
        message('Done %d.\n' % (i))
    ranker.AccumulateVotes(fps[i], act)
  gains = ranker.GetTopN(topN)
  return gains
def test1BiasRanker(self):
  """ Rank bits with a class-0-biased BIASENTROPY ranker and check the
  resulting info contents against the reference combichem ranking. """
  nbits = 5000
  dbName = os.path.join('../', 'test_data', 'FEW_CDK2.GDB')
  conn = DbConnect(dbName)
  fps = getFingerprints(conn)
  nameAct = getNameAct(conn)
  # dict views are not indexable under Python 3; materialize first
  # (matches the idiom already used in test0Ranker)
  sl = len(list(fps.values())[0])
  rnkr = InfoTheory.InfoBitRanker(sl, 2, InfoTheory.InfoType.BIASENTROPY)
  rnkr.SetBiasList([0])
  print("Collecting Votes ....")
  for key in nameAct.keys():
    if nameAct[key] == 100:
      rnkr.AccumulateVotes(fps[key], 0)
    if nameAct[key] == 0:
      rnkr.AccumulateVotes(fps[key], 1)
  # now do the ranking
  print("ranking bits ....")
  topN = rnkr.GetTopN(nbits)
  # get the combichem ranked list from a file
  cfile = os.path.join('test_data', 'combiRank.out')
  combiInfo = ReadCombiInfo(cfile)
  # now check if the infocontents are the same as the combichem stuff
  print("Comparing bit info contents ....")
  for i in range(nbits):
    assert feq(topN[i, 1], combiInfo[i])
def test0Ranker(self):
  """ Exercise the plain ENTROPY ranker and compare its bit info contents
  against the stored combichem results, then dump the top bits to a file. """
  nbits = 5000
  conn = DbConnect(_testDatabase)
  fingerprints = getFingerprints(conn)
  activities = getNameAct(conn)
  fpLen = len(list(fingerprints.values())[0])
  ranker = InfoTheory.InfoBitRanker(fpLen, 2, InfoTheory.InfoType.ENTROPY)
  print("Collecting Votes ....")
  # actives (activity 100) vote as class 0, inactives (activity 0) as class 1
  for name, act in activities.items():
    if act == 100:
      ranker.AccumulateVotes(fingerprints[name], 0)
    if act == 0:
      ranker.AccumulateVotes(fingerprints[name], 1)
  print("ranking bits ....")
  topN = ranker.GetTopN(nbits)
  # compare against the reference combichem ranking stored on disk
  combiInfo = ReadCombiInfo(os.path.join('test_data', 'combiRank.out'))
  print("Comparing bit info contents ....")
  for idx in range(900):
    assert feq(topN[idx, 1], combiInfo[idx])
  ranker.WriteTopBitsToFile(os.path.join('test_data', 'rdTopBits.txt'))
def BuildSigTree(examples, nPossibleRes, ensemble=None, random=0,
                 metric=InfoTheory.InfoType.BIASENTROPY, biasList=[1], depth=0, maxDepth=-1,
                 useCMIM=0, allowCollections=False, verbose=0, **kwargs):
  """ Recursively build a signature-based decision tree.

    **Arguments**

      - examples: the examples to be classified.  Each example
        should be a sequence at least three entries long, with
        entry 0 being a label, entry 1 a BitVector and entry -1
        an activity value

      - nPossibleRes: the number of result codes possible

      - ensemble: (optional) if this argument is provided, it
        should be a sequence which is used to limit the bits
        which are actually considered as potential descriptors.
        The default is None (use all bits).

      - random: (optional) If this argument is nonzero, it
        specifies the number of bits to be randomly selected
        for consideration at this node (i.e. this toggles the
        growth of Random Trees).
        The default is 0 (no random descriptor selection)

      - metric: (optional) This is an _InfoTheory.InfoType_ and
        sets the metric used to rank the bits.
        The default is _InfoTheory.InfoType.BIASENTROPY_

      - biasList: (optional) If provided, this provides a bias
        list for the bit ranker.  See the _InfoTheory.InfoBitRanker_
        docs for an explanation of bias.
        The default value is [1], which biases towards actives.
        (The list is only read, never mutated, so the shared
        mutable default is safe here.)

      - maxDepth: (optional) the maximum depth to which the tree
        will be grown.  The default is -1 (no depth limit).

      - useCMIM: (optional) if this is >0, the CMIM algorithm
        (conditional mutual information maximization) will be
        used to select the descriptors used to build the trees.
        The value of the variable should be set to the number
        of descriptors to be used.  This option and the ensemble
        option are mutually exclusive (CMIM will not be used if
        the ensemble is set), but it happily coexists with the
        random argument (to only consider random subsets of the
        top N CMIM bits).
        The default is 0 (do not use CMIM)

      - depth: (optional) the current depth in the tree.  This is
        used in the recursion and should not be set by the client.

    **Returns**

      a SigTree.SigTreeNode with the root of the decision tree

  """
  if verbose:
    print(' ' * depth, 'Build')
  tree = SigTree.SigTreeNode(None, 'node', level=depth)
  tree.SetData(-666)
  # counts of each result code:
  resCodes = [int(x[-1]) for x in examples]
  counts = [0] * nPossibleRes
  for res in resCodes:
    counts[res] += 1
  nzCounts = numpy.nonzero(counts)[0]
  if verbose:
    print(' ' * depth, '\tcounts:', counts)
  if len(nzCounts) == 1:
    # bottomed out because there is only one result code left
    # with any counts (i.e. there's only one type of example
    # left... this is GOOD!).
    res = nzCounts[0]
    tree.SetLabel(res)
    tree.SetName(str(res))
    tree.SetTerminal(1)
  elif maxDepth >= 0 and depth > maxDepth:
    # Bottomed out: max depth hit.  We don't really know what to do
    # here, so use the heuristic of picking the most prevalent result.
    v = numpy.argmax(counts)
    tree.SetLabel(v)
    tree.SetName('%d?' % v)
    tree.SetTerminal(1)
  else:
    # find the variable which gives us the best improvement
    # We do this with an InfoBitRanker:
    fp = examples[0][1]
    nBits = fp.GetNumBits()
    ranker = InfoTheory.InfoBitRanker(nBits, nPossibleRes, metric)
    if biasList:
      ranker.SetBiasList(biasList)
    if CMIM is not None and useCMIM > 0 and not ensemble:
      ensemble = CMIM.SelectFeatures(examples, useCMIM, bvCol=1)
    if random:
      if ensemble:
        if len(ensemble) > random:
          picks = _GenerateRandomEnsemble(random, len(ensemble))
          availBits = list(take(ensemble, picks))
        else:
          # list() so SetMaskBits receives a concrete list under Python 3.
          # NOTE(review): this masks bits 0..len(ensemble)-1 rather than the
          # ensemble's own bit ids -- looks inconsistent with the branch
          # above; confirm intent.
          availBits = list(range(len(ensemble)))
      else:
        availBits = _GenerateRandomEnsemble(random, nBits)
    else:
      availBits = None
    if availBits:
      ranker.SetMaskBits(availBits)
    useCollections = isinstance(examples[0][1], VectCollection)
    for example in examples:
      if not useCollections:
        ranker.AccumulateVotes(example[1], example[-1])
      else:
        # vote with the OR of all vectors in the collection
        example[1].Reset()
        ranker.AccumulateVotes(example[1].orVect, example[-1])
    try:
      bitInfo = ranker.GetTopN(1)[0]
      best = int(bitInfo[0])
      gain = bitInfo[1]
    except Exception:
      import traceback
      traceback.print_exc()
      print('get top n failed')
      gain = -1.0
    if gain <= 0.0:
      # no bit provides any gain: terminal node with the most prevalent result
      v = numpy.argmax(counts)
      tree.SetLabel(v)
      tree.SetName('?%d?' % v)
      tree.SetTerminal(1)
      return tree
    if verbose:
      print(' ' * depth, '\tbest:', bitInfo)
    # set some info at this node
    tree.SetName('Bit-%d' % (best))
    tree.SetLabel(best)
    tree.SetTerminal(0)
    # loop over possible values of the new variable and
    # build a subtree for each one
    onExamples = []
    offExamples = []
    for example in examples:
      if example[1][best]:
        if allowCollections and useCollections:
          # keep only the vectors in the collection that match this bit
          sig = copy.copy(example[1])
          sig.DetachVectsNotMatchingBit(best)
          ex = [example[0], sig]
          if len(example) > 2:
            ex.extend(example[2:])
          example = ex
        onExamples.append(example)
      else:
        offExamples.append(example)
    for ex in (offExamples, onExamples):
      if len(ex) == 0:
        # an empty branch gets a terminal child with the most prevalent result
        v = numpy.argmax(counts)
        tree.AddChild('%d??' % v, label=v, data=0.0, isTerminal=1)
      else:
        child = BuildSigTree(ex, nPossibleRes, random=random, ensemble=ensemble, metric=metric,
                             biasList=biasList, depth=depth + 1, maxDepth=maxDepth,
                             verbose=verbose)
        if child is None:
          v = numpy.argmax(counts)
          tree.AddChild('%d???' % v, label=v, data=0.0, isTerminal=1)
        else:
          tree.AddChildNode(child)
  return tree
fpgen = FragmentCatalog.FragFPGenerator() # 汇总所有片段 for m in sdms: fcgen.AddFragsFromMol(m, fcat) # 生成指纹片段 fps = [fpgen.GetFPForMol(x, fcat) for x in sdms] # 信息增益(infoGain)分析,实例化一个排序对象: # InfoBitRanker( # nBits, nBits:指纹长度 # nClasses, nClasses:类别数量,需要和标签满足的关系:0 <= 标签 < 类别数量 # infoType infoType:度量指标。默认使用rdInfoTheory.InfoType.ENTROPY,即信息增益作为比较标准,它反映了使用某个特征进行分类后,系统混乱程度降低的多少,数值越大表明特征越重要。 # ) ranker = InfoTheory.InfoBitRanker(len(fps[0]), 2) # 获取每个分子的活性信息 : GetDoubleProp('ACTIVITY') # 以7为标准对活性离散化 : 大于7为1 , 小于7为0 # 根据指纹和类别进行投票 : AccumulateVotes(fp, act) # 获取前5个重要特征 : GetTopN(5) # 依次输出特征id、信息增益、特征为0类别中的无活性分子数、特征为1类别中的有活性分子数。 acts = [x.GetDoubleProp('ACTIVITY') for x in sdms] print(acts) [ 6.87, 7.7, 7.74, 6.45, 6.89, 8.74, 7.23, 8.74, 6.51, 6.68, 7.47, 8.09, 8.07, 8.51, 8.42, 7.04, 8.46, 8.92, 6.06, 8.32, 8.0, 8.03, 7.74, 7.03, 6.96, 6.77, 5.0, 7.43, 5.0, 7.89, 6.46, 7.4, 6.41, 5.0, 7.06, 5.0, 5.0, 6.49, 8.46, 5.0, 5.0, 6.34, 7.68, 8.82, 7.85, 6.42, 7.12, 7.77, 8.13, 8.29, 5.0, 5.0, 7.31, 6.37, 8.08, 7.61, 8.8, 8.39, 7.72, 5.0, 5.0, 7.55, 7.15,