Exemplo n.º 1
0
    def test1(self):
        # testing pruning with known results
        #
        # Builds an ID3 tree on five 4-column points, checks that it
        # classifies them perfectly, then prunes it twice: once against the
        # same points and once against those points plus two extra copies of
        # [0, 1, 1, 0].
        oPts = [
            [0, 0, 1, 0],
            [0, 1, 1, 1],
            [1, 0, 1, 1],
            [1, 1, 0, 0],
            [1, 1, 1, 1],
        ]
        tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]
        tree = ID3.ID3Boot(oPts, attrs=range(3), nPossibleVals=[2] * 4)
        err, badEx = CrossValidate.CrossValidate(tree, oPts)
        # unittest assertions are used instead of bare `assert` so the checks
        # still run when Python is started with -O.
        self.assertEqual(err, 0.0, 'bad initial error')
        self.assertEqual(len(badEx), 0, 'bad initial error')

        # prune with original data, shouldn't do anything
        f = StringIO()
        PruneTree._verbose = True
        try:
            with redirect_stdout(f):
                newTree, err = PruneTree.PruneTree(tree, [], oPts)
        finally:
            # Always restore the module-level flag, even if pruning raises,
            # so a failure here cannot make later tests verbose.
            PruneTree._verbose = False
        self.assertIn('Pruner', f.getvalue())
        self.assertEqual(newTree, tree, 'improper pruning')

        # prune with the extended data set
        newTree, err = PruneTree.PruneTree(tree, [], tPts)
        self.assertNotEqual(newTree, tree, 'bad pruning')
        self.assertTrue(feq(err, 0.14286), 'bad error result')
Exemplo n.º 2
0
def _testChain():
    """ Ad hoc driver that prunes a tree built from a 13-point data set.

    An ID3 tree is built from thirteen 5-column points (the first four
    columns are offered as attributes), printed, and cross-validated.  The
    very same list is then handed to PruneTree for both of its data
    arguments, and the pruned tree, its error and its misclassified
    examples are printed.

    """
    from rdkit.ML.DecTree import ID3

    points = [[1, 0, 0, 0, 1] for _ in range(7)]
    points.extend([
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
    ])
    # the "holdout" set is deliberately the same list object
    holdout = points

    nColumns = len(points[0])
    tree = ID3.ID3Boot(points, attrs=range(nColumns - 1), nPossibleVals=[2] * nColumns)
    tree.Print()

    trainErr, _ = CrossValidate.CrossValidate(tree, points)
    print('original error:', trainErr)

    holdoutErr, _ = CrossValidate.CrossValidate(tree, holdout)
    print('original holdout error:', holdoutErr)

    pruned, _ = PruneTree(tree, points, holdout)
    pruned.Print()
    prunedErr, misclassified = CrossValidate.CrossValidate(pruned, holdout)
    print('pruned holdout error is:', prunedErr)
    print(misclassified)
Exemplo n.º 3
0
def _testSpecific():
    """ Ad hoc driver that prunes a small tree against an extended data set.

    An ID3 tree is built from five 4-column points (columns 0-2 are the
    attributes).  The holdout set is those points plus two copies of
    [0, 1, 1, 0].  The tree, its errors before and after pruning, the
    misclassified holdout examples and the sizes of both trees are printed.

    """
    from rdkit.ML.DecTree import ID3

    trainPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    holdoutPts = trainPts + [[0, 1, 1, 0] for _ in range(2)]

    tree = ID3.ID3Boot(trainPts, attrs=range(3), nPossibleVals=[2] * 4)
    tree.Print()

    trainErr, _ = CrossValidate.CrossValidate(tree, trainPts)
    print('original error:', trainErr)

    holdoutErr, _ = CrossValidate.CrossValidate(tree, holdoutPts)
    print('original holdout error:', holdoutErr)

    pruned, _ = PruneTree(tree, trainPts, holdoutPts)
    pruned.Print()
    prunedErr, misclassified = CrossValidate.CrossValidate(pruned, holdoutPts)
    print('pruned holdout error is:', prunedErr)
    print(misclassified)

    print(len(tree), len(pruned))
Exemplo n.º 4
0
    def _setupMultiTree(self):
        """ Build an ID3 tree from eight 4-column examples.

        The examples are wrapped in an MLQuantDataSet, and every variable
        index it reports is offered as an attribute.  The resulting tree is
        stored on self.t1 and the raw example rows on self.examples.

        """
        rows = [
            [0, 1, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 2],
            [0, 1, 1, 2],
            [1, 0, 0, 2],
            [1, 0, 1, 2],
            [1, 1, 0, 2],
            [1, 1, 1, 0],
        ]

        dataset = MLData.MLQuantDataSet(rows)
        varIds = range(dataset.GetNVars())
        self.t1 = ID3.ID3Boot(dataset.GetAllData(), varIds, dataset.GetNPossibleVals())
        self.examples = rows
Exemplo n.º 5
0
def TestTree():
    """ testing code for named trees

    Builds eight examples whose first column is a name ('p1' .. 'p8'),
    grows an ID3 tree of maximum depth 1 over columns 1-3 and prints it.

    """
    values = [
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 2],
        [0, 1, 1, 2],
        [1, 0, 0, 2],
        [1, 0, 1, 2],
        [1, 1, 0, 2],
        [1, 1, 1, 0],
    ]
    # prepend the point name to each row
    namedExamples = [['p%d' % (pos + 1)] + row for pos, row in enumerate(values)]

    # skip the name column (0) and the result column (last)
    lastCol = len(namedExamples[0]) - 1
    candidateAttrs = list(range(1, lastCol))
    possibleVals = [0, 2, 2, 2, 3]

    tree = ID3.ID3Boot(namedExamples, candidateAttrs, possibleVals, maxDepth=1)
    tree.Print()
Exemplo n.º 6
0
    def _setupPyMultiTree(self):
        """ Build the multi-valued test tree using the Py* entropy functions.

        ID3.entropy.InfoEntropy and ID3.entropy.InfoGain are rebound to
        entropy.PyInfoEntropy and entropy.PyInfoGain before the tree is
        grown.  The rebinding is NOT undone here.  The tree is stored on
        self.t1 and the raw example rows on self.examples.

        """
        from rdkit.ML.InfoTheory import entropy

        ID3.entropy.InfoEntropy = entropy.PyInfoEntropy
        ID3.entropy.InfoGain = entropy.PyInfoGain

        rows = [
            [0, 1, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 2],
            [0, 1, 1, 2],
            [1, 0, 0, 2],
            [1, 0, 1, 2],
            [1, 1, 0, 2],
            [1, 1, 1, 0],
        ]

        dataset = MLData.MLQuantDataSet(rows)
        varIds = range(dataset.GetNVars())
        self.t1 = ID3.ID3Boot(dataset.GetAllData(), varIds, dataset.GetNPossibleVals())
        self.examples = rows
Exemplo n.º 7
0
  def test1(self):
    """ testing pruning with known results

    Grows an ID3 tree on five points, verifies it has zero error on them,
    then checks that pruning against the same points leaves the tree
    unchanged while pruning against an extended set changes it.
    """
    trainPts = [
      [0, 0, 1, 0],
      [0, 1, 1, 1],
      [1, 0, 1, 1],
      [1, 1, 0, 0],
      [1, 1, 1, 1],
    ]
    extendedPts = trainPts + [[0, 1, 1, 0] for _ in range(2)]

    tree = ID3.ID3Boot(trainPts, attrs=range(3), nPossibleVals=[2] * 4)
    initialErr, misclassified = CrossValidate.CrossValidate(tree, trainPts)
    assert initialErr == 0.0, 'bad initial error'
    assert len(misclassified) == 0, 'bad initial error'

    # pruning against the data the tree was grown on must be a no-op
    sameTree, _ = PruneTree.PruneTree(tree, [], trainPts)
    assert sameTree == tree, 'improper pruning'

    # pruning against the extended data must alter the tree
    prunedTree, prunedErr = PruneTree.PruneTree(tree, [], extendedPts)
    assert prunedTree != tree, 'bad pruning'
    assert feq(prunedErr, 0.14286), 'bad error result'
Exemplo n.º 8
0
def QuantTreeBoot(examples,
                  attrs,
                  nPossibleVals,
                  nBoundsPerVar,
                  initialVar=None,
                  maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy
    approach).  Otherwise, _initialVar_ will be used as the first
    split.

    **Arguments**

      - examples: sequence of examples; the last element of each one is
        converted with int() and used as its result code

      - attrs: indices of the variables that may be used for splitting
        (copied, the caller's sequence is not modified)

      - nPossibleVals: per-column count of possible values; the last
        entry is the number of result codes

      - nBoundsPerVar: per-variable number of quantization bounds:
        > 0 -> bounds are found with Quantize.FindVarMultQuantBounds,
        0 -> the variable is split on each value in
        range(nPossibleVals[var]), -1 -> the variable is dropped from
        _attrs_

      - initialVar: (optional) variable to force as the root split

      - maxDepth: forwarded unchanged to BuildQuantTree

      - kwargs: forwarded to FindBest and BuildQuantTree; if
        'recycleVars' is true the root variable stays available to the
        children

    **Returns**

      the root QuantTreeNode of the new tree

    """
    attrs = list(attrs)
    for i, nBounds in enumerate(nBoundsPerVar):
        if nBounds == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    # population of each result code, used to label empty branches
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar,
                                           nPossibleRes, nPossibleVals, attrs,
                                           **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # must be a real list: on Python 3 map() returns a one-shot
            # iterator that has no len() and cannot be sorted/indexed
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBoundsPerVar[best], resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(branchExamples):
        """ grows a subtree for a non-empty branch, otherwise adds a
        terminal node labelled with the most populated result code """
        if branchExamples:
            tree.AddChildNode(
                BuildQuantTree(branchExamples,
                               best,
                               nextAttrs,
                               nPossibleVals,
                               nBoundsPerVar,
                               depth=1,
                               maxDepth=maxDepth,
                               **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # peel off, bound by bound, the examples lying below each bound;
        # a single pass per bound keeps the original ordering
        remaining = list(examples)
        for bound in qBounds:
            below = []
            notBelow = []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    notBelow.append(ex)
            _addBranch(below)
            remaining = notBelow
        # add the last points remaining
        _addBranch(remaining)
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([ex for ex in examples if ex[best] == val])
    return tree
Exemplo n.º 9
0
def FindBest(resCodes,
             examples,
             nBoundsPerVar,
             nPossibleRes,
             nPossibleVals,
             attrs,
             exIndices=None,
             **kwargs):
    """ Picks the variable (and quantization bounds) with the highest gain

    **Arguments**

      - resCodes: result codes, passed through to
        Quantize.FindVarMultQuantBounds

      - examples: the full sequence of examples

      - nBoundsPerVar: per-variable number of quantization bounds
        (> 0: quantize, 0: use the variable as is, < 0: never selected)

      - nPossibleRes: number of possible result codes

      - nPossibleVals: per-column count of possible values

      - attrs: indices of the candidate variables

      - exIndices: (optional) indices into _examples_ to consider;
        defaults to all of them

      - kwargs: if 'randomDescriptors' is a positive number smaller than
        len(attrs), only that many randomly chosen candidates are examined

    **Returns**

      a 3-tuple (best variable, its gain, its bounds); this is
      (-1, -1e6, []) when there are no examples or no usable candidate.
      On an exact tie in gain the candidate with fewer bounds is preferred.

    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if 0 < nToTake < len(attrs):
        ids = list(range(len(attrs)))
        # the `random=` argument of shuffle() was removed in Python 3.11;
        # the default already draws from the module-level generator
        random.shuffle(ids)
        attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBounds, resCodes, nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices),
                                     nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            # negative bound counts mark variables that must not win
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the split that needs fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds
    if best == -1:
        # nothing was selected; dump the inputs to help diagnose why
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
Exemplo n.º 10
0
def GenRandomExamples(nVars=10, randScale=0.3, bitProb=0.5, nExamples=500, seed=(0, 0),
                      addResults=1):
  """ Generates a reproducible set of random binary examples

  Each variable gets a random weight in [0, randScale).  Every example is a
  list of _nVars_ booleans (True when a random draw exceeds _bitProb_).
  If _addResults_ is true, a final boolean is appended that is True when
  the weights of the True variables sum to at least 1.

  **Arguments**

    - nVars: number of variables per example

    - randScale: scale factor applied to the random variable weights

    - bitProb: threshold a random draw must exceed to set a bit

    - nExamples: number of examples to generate

    - seed: only seed[0] is used, to seed the `random` module

    - addResults: if true, append the result value to each example

  **Returns**

    a 3-tuple (examples, attrs, nPossibleVals) where attrs is
    list(range(nVars)) and nPossibleVals holds a 2 for every variable
    plus one for the result

  """
  random.seed(seed[0])
  varWeights = numpy.array([random.random() for _ in range(nVars)]) * randScale
  examples = [None] * nExamples

  for i in range(nExamples):
    varVals = [random.random() > bitProb for _ in range(nVars)]
    temp = numpy.array(varVals) * varWeights
    res = sum(temp)
    if addResults:
      varVals.append(res >= 1.)
    examples[i] = varVals

  # one entry per variable plus the result column; this used to be sized by
  # the number of examples, which was unrelated to the number of columns
  nPossibleVals = [2] * (nVars + 1)
  attrs = list(range(nVars))

  return (examples, attrs, nPossibleVals)


if __name__ == '__main__':  # pragma: nocover
  # Script entry point: generate the default random data set, pickle its
  # three parts one after another into random.dat.pkl, then build an ID3
  # tree from it and save the tree to save.pkl.
  from rdkit.six.moves import cPickle
  examples, attrs, nPossibleVals = GenRandomExamples()
  # the context manager guarantees the file is flushed and closed, even if
  # one of the dumps fails
  with open('random.dat.pkl', 'wb+') as outF:
    cPickle.dump(examples, outF)
    cPickle.dump(attrs, outF)
    cPickle.dump(nPossibleVals, outF)

  tree = ID3.ID3Boot(examples, attrs, nPossibleVals)
  tree.Pickle('save.pkl')