예제 #1
0
    def testTreeGrow(self):
        """Grow a tree-based composite and compare it with the pickled reference.

        Reads self.varNames, self.qBounds, self.nPoss, self.examples and
        self.attrs; stores the reference composite in self.refCompos and the
        freshly grown one in self.treeComposite.  The two composites must have
        the same length and identical per-model errors.
        """
        # Load the reference inside a context manager so the file handle is
        # released even if unpickling raises.
        refPath = (RDConfig.RDCodeDir +
                   '/ML/Composite/test_data/composite_base.pkl')
        with open(refPath, 'rb') as pklF:
            self.refCompos = cPickle.load(pklF)

        composite = Composite.Composite()
        composite._varNames = self.varNames
        composite.SetQuantBounds(self.qBounds, self.nPoss)
        from rdkit.ML.DecTree import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        pruner = None
        composite.Grow(self.examples,
                       self.attrs, [],
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=100,
                       silent=1)
        composite.AverageErrors()
        composite.SortModels()
        self.treeComposite = composite
        assert len(composite) == len(
            self.refCompos), 'length mismatch %d!=%d' % (len(composite),
                                                         len(self.refCompos))
        # Only the error entry of each (model, count, error) triple is compared.
        for i in xrange(len(composite)):
            t1, c1, e1 = composite[i]
            t2, c2, e2 = self.refCompos[i]
            assert e1 == e2, 'error mismatch'
예제 #2
0
    def testTreeGrow(self):
        """Grow a tree composite and check its errors against the stored reference.

        Uses the fixture attributes varNames, qBounds, nPoss, examples and
        attrs; leaves the reference in self.refCompos and the new composite in
        self.treeComposite.
        """
        refPath = (RDConfig.RDCodeDir +
                   '/ML/Composite/test_data/composite_base.pkl')
        with open(refPath, 'rb') as refFile:
            self.refCompos = cPickle.load(refFile)

        grown = Composite.Composite()
        grown._varNames = self.varNames
        grown.SetQuantBounds(self.qBounds, self.nPoss)
        from rdkit.ML.DecTree import CrossValidate
        grown.Grow(self.examples,
                   self.attrs, [],
                   buildDriver=CrossValidate.CrossValidationDriver,
                   pruner=None,
                   nTries=100,
                   silent=1)
        grown.AverageErrors()
        grown.SortModels()

        # To regenerate the reference data, dump the composite back to the
        # same path:
        #with open(RDConfig.RDCodeDir+'/ML/Composite/test_data/composite_base.pkl','wb') as pklF:
        #  cPickle.dump(composite,pklF)

        self.treeComposite = grown
        self.assertEqual(len(grown), len(self.refCompos))
        # Each entry unpacks to three values; only the last (error) is compared.
        for idx in xrange(len(grown)):
            _model, _count, err = grown[idx]
            _refModel, _refCount, refErr = self.refCompos[idx]
            self.assertEqual(err, refErr)
예제 #3
0
 def testQuantize(self):
     """Quantizing each raw example must give the matching expected row."""
     bounds = [[], [1, 2, 3]]
     rawPoints = [['foo', 0], ['foo', 1.5], ['foo', 5.5], ['foo', 2.5]]
     expectedPoints = [['foo', 0], ['foo', 1], ['foo', 3], ['foo', 2]]
     possibleVals = [0, 4]
     composite = Composite.Composite()
     composite.SetQuantBounds(bounds, possibleVals)
     # Walk the inputs and their expected quantized forms in lockstep.
     for rawPt, expected in zip(rawPoints, expectedPoints):
         self.assertEqual(composite.QuantizeExample(rawPt), expected)
예제 #4
0
 def testQuantize(self):
     """Check that QuantizeExample maps each raw example to its expected row."""
     composite = Composite.Composite()
     composite.SetQuantBounds([[], [1, 2, 3]], [0, 4])
     cases = (
         (['foo', 0], ['foo', 0]),
         (['foo', 1.5], ['foo', 1]),
         (['foo', 5.5], ['foo', 3]),
         (['foo', 2.5], ['foo', 2]),
     )
     for rawPt, expected in cases:
         quantized = composite.QuantizeExample(rawPt)
         assert quantized == expected, 'quantization of %s failed' % (str(
             rawPt))
예제 #5
0
    def test_exceptions(self):
        """Check _mapOrder for several input orders and the ValueError case."""
        compos = Composite.Composite()
        compos.SetQuantBounds([(0.5, ), (0.5, ), (0.5, ), []], [2, 2, 2, 2])
        compos.SetDescriptorNames(('ID', 'D0', 'D1', 'D2', 'Act'))

        # (input order, expected compos._mapOrder after SetInputOrder)
        orderCases = (
            # every descriptor present
            (('ID', 'D2', 'D1', 'D0', 'Act'), [0, 3, 2, 1, 4]),
            # no 'ID' column (original note: probes caught exception for ID)
            (('D2', 'D1', 'D0', 'Act'), [0, 2, 1, 0, 3]),
            # no 'Act' column (original note: probes caught exception for Act)
            (('ID', 'D2', 'D1', 'D0'), [0, 3, 2, 1, -1]),
        )
        for inputOrder, expectedMap in orderCases:
            compos.SetInputOrder(inputOrder)
            self.assertEqual(compos._mapOrder, expectedMap)

        with self.assertRaises(ValueError):
            compos.SetInputOrder(('Unknown', 'D0'))
예제 #6
0
    def testTreeGrow(self):
        """Grow a tree-based composite, check model sorting, and compare to the reference.

        Reads self.varNames, self.qBounds, self.nPoss, self.examples and
        self.attrs; stores the unpickled reference in self.refCompos and the
        grown composite in self.treeComposite.
        """
        # The reference pickle is read as text so CRLF line endings can be
        # normalised to LF before the bytes are handed to the unpickler.
        # The `with` statement closes the file; no explicit close() is needed.
        pklPath = (RDConfig.RDCodeDir +
                   '/ML/Composite/test_data/composite_base.pkl')
        with open(pklPath, 'r') as pklTF:
            buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
        with io.BytesIO(buf) as pklF:
            self.refCompos = cPickle.load(pklF)

        composite = Composite.Composite()
        composite._varNames = self.varNames
        composite.SetQuantBounds(self.qBounds, self.nPoss)
        from rdkit.ML.DecTree import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        pruner = None
        composite.Grow(self.examples,
                       self.attrs, [],
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=100,
                       silent=1)
        composite.AverageErrors()

        # With sortOnError=False the counts come out ordered and the errors
        # do not ...
        composite.SortModels(sortOnError=False)
        self.assertEqual(composite.countList, sorted(composite.countList))
        self.assertNotEqual(composite.errList, sorted(composite.errList))
        # ... while the default sort orders the errors instead.
        composite.SortModels()
        self.assertNotEqual(composite.countList, sorted(composite.countList))
        self.assertEqual(composite.errList, sorted(composite.errList))

        # To regenerate the reference data:
        # with open(RDConfig.RDCodeDir+'/ML/Composite/test_data/composite_base.pkl','wb') as pklF:
        #   cPickle.dump(composite,pklF)

        self.treeComposite = composite
        self.assertEqual(len(composite), len(self.refCompos))
        for i in range(len(composite)):
            _t1, _c1, e1 = composite[i]
            _t2, _c2, e2 = self.refCompos[i]
            self.assertEqual(e1, e2)
            # we used to check for equality here, but since there are redundant errors,
            #  that's non-trivial.
            # assert t1 == t2, 'tree mismatch'
            # assert c1 == c2, 'count mismatch'

        # The string form should mention the composite, its models and errors.
        s = str(composite)
        self.assertIn('Composite', s)
        self.assertIn('Model', s)
        self.assertIn('error', s)
예제 #7
0
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
    """Build a composite model from `data` as configured by `details`, then screen it.

    Arguments:
      - details: run-configuration object.  Its attributes select the random
        seed handling, activity shuffling, train/test splitting, filtering,
        the model type (trees, sig trees, KNN, naive/sig Bayes, otherwise
        neural) and the output destinations.
      - data: data set object; GetNPts(), GetNamedData(), GetNVars(),
        GetNPossibleVals() and GetVarNames() are called on it.
      - progressCallback: forwarded to composite.Grow() by the tree and
        sig-tree builders.
      - saveIt: if true, the composite is pickled to details.outName.
      - setDescNames: if true, the data's variable names are used as the
        composite's input order and details._descNames as its descriptor
        names; otherwise the data's variable names are the descriptor names.

    Returns:
      the composite that was built (with its model examples cleared).

    Side effects:
      - details.model is set to a binary holder of the pickled composite, and
        details.internalHoldoutFrac is zeroed when details.nModels == 1.
      - error statistics are stored on the module-level _runDetails object.
      - depending on `details`, the train/test examples are pickled to
        details.pickleDataFileName, misclassified examples are written to
        details.badName and the results are stored in a database table.
    """
    nExamples = data.GetNPts()
    if details.lockRandom:
        seed = details.randomSeed
    else:
        import random
        # Integer bounds: randint() does not accept float arguments such as
        # 1e6 on recent Python versions.
        seed = (random.randint(0, 1000000), random.randint(0, 1000000))
    DataUtils.InitRandomNumbers(seed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    if details.splitRun == 1:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples),
                                                   details.splitFrac,
                                                   silent=not _verbose)

        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
    else:
        testExamples = []
        testIdx = []
        trainIdx = range(len(namedExamples))
        trainExamples = namedExamples

    if details.filterFrac != 0.0:
        # if we're doing quantization on the fly, we need to handle that here:
        if hasattr(details, 'activityBounds') and details.activityBounds:
            bounds = details.activityBounds
            tExamples = []
            for pt in trainExamples:
                pt = pt[:]  # work on a copy; the raw activity stays in trainExamples
                act = pt[-1]
                # The quantized activity is the index of the first bound the
                # raw value falls below, or len(bounds) when there is none.
                for bound, limit in enumerate(bounds):
                    if act < limit:
                        break
                else:
                    bound = len(bounds)
                pt[-1] = bound
                tExamples.append(pt)
        else:
            bounds = None
            tExamples = trainExamples
        trainIdx, temp = DataUtils.FilterData(tExamples,
                                              details.filterVal,
                                              details.filterFrac,
                                              -1,
                                              indicesOnly=1)
        # Points rejected by the filter are moved to the test set.
        tmp = [trainExamples[x] for x in trainIdx]
        testExamples += [trainExamples[x] for x in temp]
        trainExamples = tmp

        for header, examples in (('Result Counts in training set:',
                                  trainExamples),
                                 ('Result Counts in test set:',
                                  testExamples)):
            counts = DataUtils.CountResults(examples, bounds=bounds)
            # sorted() works whether keys() returns a list or a view
            ks = sorted(counts.keys())
            message(header)
            for k in ks:
                message(str((k, counts[k])))
    nExamples = len(trainExamples)
    message('Training with %d examples' % (nExamples))

    nVars = data.GetNVars()
    # Materialize the range so that entries can be removed below.
    attrs = list(range(1, nVars + 1))
    nPossibleVals = data.GetNPossibleVals()
    for i in range(1, len(nPossibleVals)):
        if nPossibleVals[i - 1] == -1:
            attrs.remove(i)

    if details.pickleDataFileName != '':
        # The context manager closes the file even if pickling fails.
        with open(details.pickleDataFileName, 'wb+') as pickleDataFile:
            cPickle.dump(trainExamples, pickleDataFile)
            cPickle.dump(testExamples, pickleDataFile)

    if details.bayesModel:
        composite = BayesComposite.BayesComposite()
    else:
        composite = Composite.Composite()

    composite._randomSeed = seed
    composite._splitFrac = details.splitFrac
    composite._shuffleActivities = details.shuffleActivities
    composite._randomizeActivities = details.randomActivities

    if hasattr(details, 'filterFrac'):
        composite._filterFrac = details.filterFrac
    if hasattr(details, 'filterVal'):
        composite._filterVal = details.filterVal

    composite.SetModelFilterData(details.modelFilterFrac,
                                 details.modelFilterVal)

    composite.SetActivityQuantBounds(details.activityBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
        nPossibleVals[-1] = len(details.activityBounds) + 1

    if setDescNames:
        composite.SetInputOrder(data.GetVarNames())
        composite.SetDescriptorNames(details._descNames)
    else:
        composite.SetDescriptorNames(data.GetVarNames())
    # NOTE(review): this repeats the SetActivityQuantBounds() call made above;
    # kept in case the name setters affect the bounds -- confirm and drop.
    composite.SetActivityQuantBounds(details.activityBounds)
    if details.nModels == 1:
        details.internalHoldoutFrac = 0.0
    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        composite.SetQuantBounds(details.qBounds)
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels,
                       pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy,
                       needsQuantization=0,
                       treeBuilder=builder,
                       nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       silent=not _verbose)

    elif details.useSigTrees:
        from rdkit.ML.DecTree import CrossValidate
        from rdkit.ML.DecTree import BuildSigTree
        builder = BuildSigTree.SigTreeBuilder
        driver = CrossValidate.CrossValidationDriver
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        # These settings are optional on the details object.
        biasList = getattr(details, 'sigTreeBiasList', None)
        useCMIM = getattr(details, 'useCMIM', 0)
        allowCollections = getattr(details, 'allowCollections', False)
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       treeBuilder=builder,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       biasList=biasList,
                       useCMIM=useCMIM,
                       allowCollection=allowCollections,
                       silent=not _verbose)

    elif details.useKNN:
        from rdkit.ML.KNN import CrossValidate
        from rdkit.ML.KNN import DistFunctions

        driver = CrossValidate.CrossValidationDriver
        dfunc = ''
        if (details.knnDistFunc == "Euclidean"):
            dfunc = DistFunctions.EuclideanDist
        elif (details.knnDistFunc == "Tanimoto"):
            dfunc = DistFunctions.TanimotoDist
        else:
            assert 0, "Bad KNN distance metric value"

        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       numNeigh=details.knnNeighs,
                       holdOutFrac=details.internalHoldoutFrac,
                       distFunc=dfunc)

    elif details.useNaiveBayes or details.useSigBayes:
        from rdkit.ML.NaiveBayes import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           mEstimateVal=details.mEstimateVal,
                           silent=not _verbose)
        else:
            useCMIM = getattr(details, 'useCMIM', 0)

            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           mEstimateVal=details.mEstimateVal,
                           useSigs=True,
                           useCMIM=useCMIM,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           silent=not _verbose)

    # Disabled SVM branch, kept for reference:
    ##   elif details.useSVM:
    ##     from rdkit.ML.SVM import CrossValidate
    ##     driver = CrossValidate.CrossValidationDriver
    ##     composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
    ##                    buildDriver=driver, nTries=details.nModels,
    ##                    needsQuantization=0,
    ##                    cost=details.svmCost,gamma=details.svmGamma,
    ##                    weights=details.svmWeights,degree=details.svmDegree,
    ##                    type=details.svmType,kernelType=details.svmKernel,
    ##                    coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
    ##                    cache_size=details.svmCache,shrinking=details.svmShrink,
    ##                    dataType=details.svmDataType,
    ##                    holdOutFrac=details.internalHoldoutFrac,
    ##                    replacementSelection=details.replacementSelection,
    ##                    silent=not _verbose)

    else:
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples,
                       attrs, [0] + nPossibleVals,
                       nTries=details.nModels,
                       buildDriver=driver,
                       needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    devs = numpy.sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # blow out the saved examples and then save the composite:
    composite.ClearModelExamples()
    if saveIt:
        composite.Pickle(details.outName)
    details.model = DbModule.binaryHolder(cPickle.dumps(composite))

    badExamples = []
    # Simple screening is skipped when detailed results are requested or when
    # details.noScreen is set.
    if not details.detailedRes and not getattr(details, 'noScreen', 0):
        if details.splitRun:
            message('Testing all hold-out examples')
            wrong = testall(composite, testExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(testExamples))))
            _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
        else:
            message('Testing all examples')
            wrong = testall(composite, namedExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(namedExamples))))
            _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data,
                                                 composite, nPossibleVals[-1],
                                                 details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # NOTE(review): nClass - nPts is positive only when more points were
        # classified than exist; presumably nPts - nClass was intended --
        # confirm before changing.
        nRej = nClass - nPts
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

        if details.splitRun:
            message('\nHold-out data:')
            resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)),
                                                     testExamples, composite,
                                                     nPossibleVals[-1],
                                                     details.threshold)
            nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
            nPts = len(testExamples)
            nClass = nGood + nBad
            _runDetails.holdout_error = float(nBad) / nClass
            _runDetails.holdout_correct_conf = avgGood
            _runDetails.holdout_incorrect_conf = avgBad
            _runDetails.holdout_result_matrix = repr(voteTab)
            # NOTE(review): same sign question as for the full data set above.
            nRej = nClass - nPts
            if nRej > 0:
                _runDetails.holdout_fraction_dropped = float(nRej) / nPts

    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' %
                (details.dbName, details.persistTblName))
        details.Store(db=details.dbName, table=details.persistTblName)

    if details.badName != '':
        # `wrong` is only bound when the simple screening above ran; it is
        # indexed lazily so an empty badExamples list never touches it.
        with open(details.badName, 'w+') as badFile:
            for i, ex in enumerate(badExamples):
                vote = wrong[i]
                outStr = '%s\t%s\n' % (ex, vote)
                badFile.write(outStr)

    composite.ClearModelExamples()
    return composite
예제 #8
0
    def testErrorEstimate(self):
        """Check composite voting and out-of-bag classification on hand-built trees.

        Three small two-level trees are assembled by hand, verified one by one
        against the four data rows, added to a composite, and then the
        composite is queried both with all models and with only the models
        whose _trainIndices do not contain the row being classified.
        """
        compos = Composite.Composite()
        compos.SetQuantBounds([(0.5, ), (0.5, ), (0.5, ), []], [2, 2, 2, 2])
        compos.SetDescriptorNames(('D0', 'D1', 'D2', 'Act'))
        compos.SetInputOrder(('ID', 'D0', 'D1', 'D2', 'Act'))
        data = [['A', 0, 0, 0, 0], ['B', 1, 0, 0, 1], ['C', 0, 1, 0, 0],
                ['D', 1, 1, 1, 1]]

        def _buildTree(rootSpec, branchSpecs):
            # Root node, then each branch with its terminal leaves, in order.
            root = Node(None, rootSpec[0], rootSpec[1])
            for name, label, leaves in branchSpecs:
                branch = Node(root, name, label)
                root.AddChildNode(branch)
                for leaf in leaves:
                    branch.AddChildNode(
                        Node(branch, str(leaf), leaf, isTerminal=1))
            return root

        #
        #  Build and validate three simple trees.  Each spec holds:
        #  root, branches (name, label, leaf labels), expected prediction per
        #  data row, training indices, and the error given to AddModel.
        #
        treeSpecs = (
            (('D0', 0), (('D1', 1, (0, 1)), ('D2', 2, (1, 0))),
             (0, 1, 1, 0), (0, 1), .5),
            (('D1', 1), (('D0', 0, (0, 1)), ('D2', 2, (0, 1))),
             (0, 1, 0, 1), (1, 2), 0.0),
            (('D0', 0), (('D2', 2, (0, 1)), ('D1', 1, (0, 1))),
             (0, 0, 0, 1), (2, 3), 0.25),
        )
        for rootSpec, branchSpecs, answers, trainIdx, err in treeSpecs:
            tree = _buildTree(rootSpec, branchSpecs)
            for row, answer in zip(data, answers):
                # the leading ID column is dropped for the bare tree
                assert tree.ClassifyExample(row[1:]) == answer
            tree._trainIndices = trainIdx
            compos.AddModel(tree, err)

        #
        #  validate the composite itself:
        #
        fullVotes = ((0, 1.0), (1, 2. / 3.), (0, 2. / 3.), (1, 2. / 3.))
        for row, (wantPred, wantConf) in zip(data, fullVotes):
            pred, conf = compos.ClassifyExample(row)
            assert pred == wantPred
            assert conf == wantConf

        #
        #  now test the out-of-bag calculation:
        #
        oobVotes = (((1, 2), 0, 1.0), ((2, ), 0, 1.0), ((0, ), 1, 1.0),
                    ((0, 1), 0, 0.5))
        for row, (models, wantPred, wantConf) in zip(data, oobVotes):
            pred, conf = compos.ClassifyExample(row, onlyModels=models)
            assert pred == wantPred
            assert conf == wantConf