示例#1
0
文件: BitRank.py 项目: Kaziaa/rdkit-1
def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
  """  Calculates the information gain of every bit for a set of points

  **Arguments**

    - bitVects: a *sequence* containing *IntVectors*; the number of bits is
      taken from the length of the first entry

    - actVals: a *sequence* with one activity value per entry of _bitVects_

    - nPossibleActs: the (integer) number of possible activity values.

    - nPossibleBitVals: (optional) if specified, this integer provides the maximum
      value attainable by the (increasingly inaccurately named) bits in _bitVects_.

  **Returns**

    a numpy array of floats holding one information gain per bit
    (an empty array if _bitVects_ is empty)

  **Raises**

    - ValueError: if _bitVects_ and _actVals_ have different lengths

  """
  if len(bitVects) != len(actVals):
    raise ValueError('var and activity lists should be the same length')
  # With no points there are no bits to rank; this also avoids an IndexError
  # from looking at bitVects[0].
  nBits = len(bitVects[0]) if len(bitVects) else 0
  # numpy.float was a plain alias of the builtin float and has been removed
  # from NumPy, so the builtin is used as the dtype.
  res = numpy.zeros(nBits, float)

  for bit in range(nBits):
    counts = FormCounts(bitVects, actVals, bit, nPossibleActs, nPossibleBitVals=nPossibleBitVals)
    res[bit] = entropy.InfoGain(counts)
  return res
示例#2
0
def AnalyzeSparseVects(bitVects, actVals):
    """ Calculates per-bit information gains for sparse bit vectors

  **Arguments**

    - bitVects: a *sequence* containing SBVs (objects providing _GetSize()_
      and _GetOnBits()_)

    - actVals: a *sequence* with one activity per entry of _bitVects_; each
      value is only tested for truth (active vs. inactive)

   **Returns**

     a 2-tuple containing:

       1) a list of (bit, gain, nActivesWithBit, nInactivesWithBit) tuples,
          one for every bit that is set in at least one vector

       2) a list with just the gains, in the same order

   **Raises**

     - ValueError: if _bitVects_ and _actVals_ have different lengths

   **Notes**

      - these need to be bit vects and binary activities

      - an empty input produces two empty lists

  """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    if not nPts:
        # nothing to analyze; avoids an IndexError on bitVects[0]
        return [], []
    nBits = bitVects[0].GetSize()

    # numpy.integer is an abstract scalar type, not a concrete dtype; use int
    actives = numpy.zeros(nBits, int)
    inactives = numpy.zeros(nBits, int)
    nActives, nInactives = 0, 0
    for sig, act in zip(bitVects, actVals):
        onBitList = sig.GetOnBits()
        if act:
            for bit in onBitList:
                actives[bit] += 1
            nActives += 1
        else:
            for bit in onBitList:
                inactives[bit] += 1
            nInactives += 1

    # the same 2x2 table is refilled for each bit and handed to InfoGain
    resTbl = numpy.zeros((2, 2), int)
    res = []
    gains = []
    for bit in range(nBits):
        nAct, nInact = actives[bit], inactives[bit]
        if nAct or nInact:
            resTbl[0, 0] = nAct
            # NOTE(review): the complements below are taken against nPts
            # rather than nActives / nInactives (which are tallied above but
            # never used) -- confirm this is the intended contingency table.
            resTbl[1, 0] = nPts - nAct
            resTbl[0, 1] = nInact
            resTbl[1, 1] = nPts - nInact
            gain = entropy.InfoGain(resTbl)
            gains.append(gain)
            res.append((bit, gain, nAct, nInact))
    return res, gains
示例#3
0
文件: ID3.py 项目: Kaziaa/rdkit-1
def ID3Boot(examples,
            attrs,
            nPossibleVals,
            initialVar=None,
            depth=0,
            maxDepth=-1,
            **kwargs):
    """ Builds the root of an ID3 decision tree and grows the children with ID3

    see ID3 for descriptions of the arguments

    When _initialVar_ is None the root splits on the variable from _attrs_
     with the largest information gain (the standard greedy approach).
     Otherwise _initialVar_ is used for the first split.

    The keyword argument _recycleVars_ (read from _kwargs_) keeps the split
     variable available to the subtrees when it is true.

    **Returns**

     the DecTree.DecTreeNode at the root of the tree

  """
    totEntropy = CalcTotalEntropy(examples, nPossibleVals)
    varTable = GenVarTable(examples, nPossibleVals, attrs)

    root = DecTree.DecTreeNode(None, 'node')
    # the last entry of nPossibleVals is stored as the number of result codes
    root._nResultCodes = nPossibleVals[-1]

    if initialVar is not None:
        splitVar = initialVar
    else:
        # greedy choice: the attribute whose table has the highest gain
        gains = [entropy.InfoGain(tbl) for tbl in varTable]
        splitVar = attrs[numpy.argmax(gains)]

    root.SetName('Var: %d' % splitVar)
    root.SetData(totEntropy)
    root.SetLabel(splitVar)
    root.SetTerminal(0)

    remainingAttrs = list(attrs)
    recycle = kwargs.get('recycleVars', 0)
    if not recycle:
        remainingAttrs.remove(splitVar)

    # one child per possible value of the split variable
    for val in range(nPossibleVals[splitVar]):
        subset = [ex for ex in examples if ex[splitVar] == val]
        child = ID3(subset, splitVar, remainingAttrs, nPossibleVals, depth,
                    maxDepth, **kwargs)
        root.AddChildNode(child)
    return root
示例#4
0
def QuantTreeBoot(examples,
                  attrs,
                  nPossibleVals,
                  nBoundsPerVar,
                  initialVar=None,
                  maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
     choose the first variable in the tree (the standard greedy
     approach).  Otherwise, _initialVar_ will be used as the first
     split.

    **Arguments**

      - examples: a sequence of examples; the last element of each one is
        converted to an int and used as its result code

      - attrs: the variables which may be used in the tree; variables whose
        entry in _nBoundsPerVar_ is -1 are dropped

      - nPossibleVals: a sequence with the number of possible values of each
        variable; its last entry is the number of possible result codes

      - nBoundsPerVar: for each variable, the number of quantization bounds:
        >0 means the variable is quantized, 0 means its values are used as
        they are, -1 means the variable is not used

      - initialVar: (optional) the variable to use for the first split

      - maxDepth: (optional) forwarded to BuildQuantTree

      - kwargs: _recycleVars_ is read here (keeps the split variable
        available to the subtrees); everything is forwarded to FindBest and
        BuildQuantTree

    **Returns**

      the QuantTree.QuantTreeNode at the root of the tree

  """
    attrs = list(attrs)
    for i in range(len(nBoundsPerVar)):
        if nBoundsPerVar[i] == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    # tally the result codes; used to label branches that get no examples
    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar,
                                           nPossibleRes, nPossibleVals, attrs,
                                           **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # a real list (not a lazy Python 3 map object), matching what
            # FindBest hands to the same quantization routine
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBoundsPerVar[best], resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(subset):
        # Grow a subtree from a non-empty subset; otherwise add a terminal
        # child labelled with the most common result code overall.
        if len(subset) != 0:
            tree.AddChildNode(
                BuildQuantTree(subset,
                               best,
                               nextAttrs,
                               nPossibleVals,
                               nBoundsPerVar,
                               depth=1,
                               maxDepth=maxDepth,
                               **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # Walk the bounds in order; each example goes to the branch of the
        # first bound it falls below, the rest are carried forward.
        remaining = list(examples)
        for bound in qBounds:
            below, rest = [], []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    rest.append(ex)
            remaining = rest
            _addBranch(below)
        # add the last points remaining
        _addBranch(remaining)
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([ex for ex in examples if ex[best] == val])
    return tree
示例#5
0
def FindBest(resCodes,
             examples,
             nBoundsPerVar,
             nPossibleRes,
             nPossibleVals,
             attrs,
             exIndices=None,
             **kwargs):
    """ Finds the variable (and quantization bounds) with the best info gain

    **Arguments**

      - resCodes: the result codes, passed unchanged to the quantization
        routine

      - examples: a sequence of examples, indexable by variable

      - nBoundsPerVar: for each variable, the number of quantization bounds:
        >0 means the variable is quantized, 0 means its values are used as
        they are, anything else means the variable gets a gain of -1e6

      - nPossibleRes: the number of possible result codes

      - nPossibleVals: a sequence with the number of possible values of each
        variable

      - attrs: the variables to consider

      - exIndices: (optional) indices of the examples to use; all examples
        are used if this is None

      - kwargs: if _randomDescriptors_ is a positive number smaller than
        the number of attrs, only that many randomly chosen attrs are
        considered

    **Returns**

      a 3-tuple containing:

        1) the best variable (-1 if no variable beat a gain of -1e6)

        2) the information gain for that variable

        3) the quantization bounds for that variable (an empty list when
           the variable is not quantized)

    **Notes**

      - when two variables have equal gain, the one with fewer quantization
        bounds wins

      - a diagnostic dump is printed if no variable could be selected

  """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if nToTake > 0:
        nAttrs = len(attrs)
        if nToTake < nAttrs:
            ids = list(range(nAttrs))
            # the optional `random` argument of shuffle() no longer exists in
            # current Python versions; the default source is random.random
            random.shuffle(ids)
            attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            # NOTE(review): vTable is restricted to exIndices while resCodes
            # is passed whole -- presumably callers supply matching result
            # codes; confirm against the callers.
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBounds, resCodes, nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices),
                                     nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the variable needing fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds
    if best == -1:
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
示例#6
0
def _PyRecurseOnBounds(vals,
                       cuts,
                       which,
                       starts,
                       results,
                       nPossibleRes,
                       varTable=None):
    """ Primarily intended for internal use

   Recursively searches for the quantization boundaries giving the
   highest information gain

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ ); it is advanced in place during
       the search

     - which: an integer indicating which bound is being adjusted here
       (and index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: (optional) a precomputed variable table; one is generated
       if this is None

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far

       2) a list of the quantization bound indices ( _cuts_ for the best case),
          or None if no position was examined

   **Notes**

    - this is not even remotely efficient, which is why a C replacement
      was written

    - deeper bounds are handled by calling _RecurseOnBounds_ on a copy
      of _cuts_

  """
    nCuts = len(cuts)
    # furthest position this cut may take while leaving room for later cuts
    lastAllowed = len(starts) - nCuts + which
    hasDeeperCut = which < nCuts - 1
    topGain, topCuts = -1e6, None

    if varTable is None:
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)

    while cuts[which] <= lastAllowed:
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
        gain = entropy.InfoGain(varTable)
        if gain > topGain:
            topGain, topCuts = gain, cuts[:]

        if hasDeeperCut:
            # let the later bounds explore their own positions
            subGain, subCuts = _RecurseOnBounds(vals,
                                                cuts[:],
                                                which + 1,
                                                starts,
                                                results,
                                                nPossibleRes,
                                                varTable=varTable)
            if subGain > topGain:
                topGain, topCuts = subGain, subCuts

        # advance this cut, pushing any later cut it has caught up with
        cuts[which] += 1
        for later in range(which + 1, nCuts):
            if cuts[later] == cuts[later - 1]:
                cuts[later] += 1

    return topGain, topCuts
示例#7
0
文件: ID3.py 项目: Kaziaa/rdkit-1
def ID3(examples,
        target,
        attrs,
        nPossibleVals,
        depth=0,
        maxDepth=-1,
        **kwargs):
    """ Recursively constructs a decision tree using the ID3 algorithm.

    From Mitchell's book, page 56

    This is *slightly* modified from Mitchell's book because it supports
      multivalued (non-binary) results.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values + instance
              values

      - target: an int

      - attrs: a list of ints indicating which variables can be used in the tree

      - nPossibleVals: a list containing the number of possible values of
                   every variable.

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
                   will be grown

      - kwargs: _recycleVars_ keeps the chosen variable available to the
                   subtrees when it is true

    **Returns**

     a DecTree.DecTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
          use _ID3Boot_ (below) for that.
  """
    varTable = GenVarTable(examples, nPossibleVals, attrs)
    node = DecTree.DecTreeNode(None, 'node')

    # keep the total entropy on the node in case it is of interest
    node.SetData(CalcTotalEntropy(examples, nPossibleVals))

    # results matrix for the target, summed into per-result-code counts
    targetTable = GenVarTable(examples, nPossibleVals, [target])[0]
    counts = sum(targetTable)
    populated = numpy.nonzero(counts)[0]

    if len(populated) == 1:
        # only one result code still has examples: a clean leaf
        onlyCode = populated[0]
        node.SetLabel(onlyCode)
        node.SetName(str(onlyCode))
        node.SetTerminal(1)
        return node

    if len(attrs) == 0 or (maxDepth >= 0 and depth >= maxDepth):
        # no variables left or depth limit reached: fall back to the
        # most prevalent result code
        majority = numpy.argmax(counts)
        node.SetLabel(majority)
        node.SetName('%d?' % majority)
        node.SetTerminal(1)
        return node

    # split on the variable with the largest information gain
    gains = [entropy.InfoGain(tbl) for tbl in varTable]
    splitVar = attrs[numpy.argmax(gains)]

    # drop that variable from the candidates unless it may be recycled
    remainingAttrs = attrs[:]
    if not kwargs.get('recycleVars', 0):
        remainingAttrs.remove(splitVar)

    node.SetName('Var: %d' % splitVar)
    node.SetLabel(splitVar)
    node.SetTerminal(0)

    # one subtree (or fallback leaf) per possible value of the split variable
    for val in range(nPossibleVals[splitVar]):
        subset = [ex for ex in examples if ex[splitVar] == val]
        if len(subset) != 0:
            node.AddChildNode(
                ID3(subset, splitVar, remainingAttrs, nPossibleVals,
                    depth + 1, maxDepth, **kwargs))
        else:
            # no examples take this value, so recursing is pointless;
            # label the leaf with the most prevalent result code here
            majority = numpy.argmax(counts)
            node.AddChild('%d' % majority, label=majority, data=0.0, isTerminal=1)
    return node