Example No. 1
def ClusterFromDetails(details):
    """ Returns the cluster tree

  """
    data = MolSimilarity.GetFingerprints(details)
    if details.maxMols > 0:
        data = data[:details.maxMols]
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'wb+')
        except IOError:
            error("Error: could not open output file %s for writing\n" %
                  (details.outFileName))
            return None
    else:
        outF = None

    if not data:
        return None

    clustTree = ClusterPoints(data,
                              details.metric,
                              details.clusterAlgo,
                              haveLabels=0,
                              haveActs=1)
    if outF:
        cPickle.dump(clustTree, outF)
    return clustTree
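For reference, the tree written here can be read back with the matching load call; this is a minimal sketch, assuming the rdkit.six.moves cPickle module used throughout these examples and a hypothetical file name standing in for details.outFileName:

from rdkit.six.moves import cPickle

# 'clusters.pkl' is a placeholder for the outFileName passed in details
with open('clusters.pkl', 'rb') as inF:
    clustTree = cPickle.load(inF)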
Example No. 2
    def Pickle(self, fileName='foo.pkl'):
        """ Writes this forest off to a file so that it can be easily loaded later

     **Arguments**

       fileName is the name of the file to be written

    """
        pFile = open(fileName, 'wb+')
        cPickle.dump(self, pFile, 1)
        pFile.close()
Example No. 3
  def Pickle(self,fileName='foo.pkl'):
    """ Writes this forest off to a file so that it can be easily loaded later

     **Arguments**

       fileName is the name of the file to be written
       
    """
    pFile = open(fileName,'wb+')
    cPickle.dump(self,pFile,1)
    pFile.close()
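In both versions of Pickle above, the third argument to cPickle.dump is the pickle protocol (1, the old binary format), which is why the file is opened in binary mode. A hedged sketch of the corresponding load, assuming a forest was written to foo.pkl as above:

from rdkit.six.moves import cPickle

with open('foo.pkl', 'rb') as pFile:
    forest = cPickle.load(pFile)  # the protocol is detected automatically on load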
Example No. 4
 def _writeDetailFile(self, inF, outF):
     while 1:
         try:
             smi, refContribs = cPickle.load(inF)
         except EOFError:
             break
         else:
             mol = Chem.MolFromSmiles(smi)
             if mol:
                 mol = Chem.AddHs(mol, 1)
                 smi2 = Chem.MolToSmiles(mol)
                 contribs = Crippen._GetAtomContribs(mol)
                 cPickle.dump((smi, contribs), outF)
             else:
                 print('Problems with SMILES:', smi)
Example No. 5
 def testPkl(self):
     # Test pickling
     v1 = self.klass(10)
     v1[1] = 1
     v1[2] = 1
     v1[3] = 1
     pklName = 'foo.pkl'
     outF = open(pklName, 'wb+')
     cPickle.dump(v1, outF)
     outF.close()
     inF = open(pklName, 'rb')
     v2 = cPickle.load(inF)
     inF.close()
     os.unlink(pklName)
     assert tuple(v1.GetOnBits()) == tuple(v2.GetOnBits()), 'pkl failed'
Example No. 6
 def testPkl(self):
   # Test pickling
   v1 = self.klass(10)
   v1[1] = 1
   v1[2] = 1
   v1[3] = 1
   pklName = 'foo.pkl'
   outF = open(pklName, 'wb+')
   cPickle.dump(v1, outF)
   outF.close()
   inF = open(pklName, 'rb')
   v2 = cPickle.load(inF)
   inF.close()
   os.unlink(pklName)
   assert tuple(v1.GetOnBits()) == tuple(v2.GetOnBits()), 'pkl failed'
Example No. 7
  def SaveState(self, fileName):
    """ Writes this calculator off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written

    """
    try:
      f = open(fileName, 'wb+')
    except Exception:
      logger.error('cannot open output file %s for writing' % (fileName))
      return
    cPickle.dump(self, f)
    f.close()
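Restoring a saved calculator is the reverse operation; a minimal sketch, assuming plain unpickling is all that is needed and using a hypothetical file name:

from rdkit.six.moves import cPickle

with open('calc.state', 'rb') as inF:  # 'calc.state' stands in for the fileName used above
    calc = cPickle.load(inF)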
Example No. 8
 def _writeDetailFile(self,inF,outF):
   while 1:
     try:
       smi,refContribs = cPickle.load(inF)
     except EOFError:
       break
     else:
       mol = Chem.MolFromSmiles(smi)
       if mol:
         mol=Chem.AddHs(mol,1)
         smi2 = Chem.MolToSmiles(mol)
         contribs = Crippen._GetAtomContribs(mol)
         cPickle.dump((smi,contribs),outF)
       else:
         print('Problems with SMILES:',smi)
Example No. 9
    def SaveState(self, fileName):
        """ Writes this calculator off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written

    """
        try:
            f = open(fileName, 'wb+')
        except Exception:
            print('cannot open output file %s for writing' % (fileName))
            return
        cPickle.dump(self, f)
        f.close()
Example No. 10
 def testPkl(self):
     """ test pickling 
 """
     v1 = klass(10)
     v1[1] = 1
     v1[2] = 1
     v1[3] = 1
     pklName = "foo.pkl"
     outF = open(pklName, "wb+")
     cPickle.dump(v1, outF)
     outF.close()
     inF = open(pklName, "rb")
     v2 = cPickle.load(inF)
     inF.close()
     os.unlink(pklName)
     assert tuple(v1.GetOnBits()) == tuple(v2.GetOnBits()), "pkl failed"
Example No. 11
    def SaveState(self, fileName):
        """ Writes this calculator off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written
       
    """
        from rdkit.six.moves import cPickle
        try:
            f = open(fileName, 'wb+')
        except Exception:
            logger.error('cannot open output file %s for writing' % (fileName))
            return
        cPickle.dump(self, f)
        f.close()
Example No. 12
  def SaveState(self,fileName):
    """ Writes this calculator off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written
       
    """
    from rdkit.six.moves import cPickle
    try:
      f = open(fileName,'wb+')
    except Exception:
      print('cannot open output file %s for writing'%(fileName))
      return
    cPickle.dump(self,f)
    f.close()
Example No. 13
  def Pickle(self,fileName='foo.pkl',saveExamples=0):
    """ Writes this composite off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written

       - saveExamples: if this is zero, the individual models will have
         their stored examples cleared.
       
    """
    if not saveExamples:
      self.ClearModelExamples()
        
    pFile = open(fileName,'wb+')
    cPickle.dump(self,pFile,1)
    pFile.close()
Example No. 14
  def Pickle(self, fileName='foo.pkl', saveExamples=0):
    """ Writes this composite off to a file so that it can be easily loaded later

     **Arguments**

       - fileName: the name of the file to be written

       - saveExamples: if this is zero, the individual models will have
         their stored examples cleared.

    """
    if not saveExamples:
      self.ClearModelExamples()

    pFile = open(fileName, 'wb+')
    cPickle.dump(self, pFile, 1)
    pFile.close()
Example No. 15
def runIt(inFileName, outFileName, smiCol=0, maxMols=-1, delim=','):
  inF = gzip.open(inFileName, 'r')
  outF = open(outFileName, 'wb+')
  mols = []
  nDone = 0
  for line in inF.readlines():
    if line[0] != '#':
      splitL = line.strip().split(delim)
      smi = splitL[smiCol].strip()
      print(smi)
      mol = Chem.MolFromSmiles(smi)
      if mol:
        contribs = Crippen._GetAtomContribs(mol)
        cPickle.dump((smi, contribs), outF)
      nDone += 1
      if maxMols > 0 and nDone >= maxMols:
        break
  outF.close()
Example No. 16
def runIt(inFileName, outFileName, smiCol=0, maxMols=-1, delim=','):
    inF = gzip.open(inFileName, 'r')
    outF = open(outFileName, 'wb+')
    mols = []
    nDone = 0
    for line in inF.readlines():
        if line[0] != '#':
            splitL = line.strip().split(delim)
            smi = splitL[smiCol].strip()
            print(smi)
            mol = Chem.MolFromSmiles(smi)
            if mol:
                contribs = Crippen._GetAtomContribs(mol)
                cPickle.dump((smi, contribs), outF)
            nDone += 1
            if maxMols > 0 and nDone >= maxMols:
                break
    outF.close()
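runIt appends one pickled (smi, contribs) tuple per molecule to the same stream, so a reader has to call cPickle.load repeatedly until EOFError, mirroring the _writeDetailFile loop in Example No. 4. A sketch under that assumption, with a hypothetical file name:

from rdkit.six.moves import cPickle

contribData = []
with open('crippen_contribs.pkl', 'rb') as inF:  # placeholder for outFileName
    while True:
        try:
            smi, contribs = cPickle.load(inF)
        except EOFError:
            break
        contribData.append((smi, contribs))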
Example No. 17
 def testPkl(self):
   " testing molecule pickle "
   import tempfile
   f,self.fName = tempfile.mkstemp('.pkl')
   f=None
   self.m = Chem.MolFromSmiles('CC(=O)CC')
   outF = open(self.fName,'wb+')
   cPickle.dump(self.m,outF)
   outF.close()
   inF = open(self.fName,'rb')
   m2 = cPickle.load(inF)
   inF.close()
   try:
     os.unlink(self.fName)
    except Exception:
     pass
   oldSmi = Chem.MolToSmiles(self.m)
   newSmi = Chem.MolToSmiles(m2)
   assert oldSmi==newSmi,'string compare failed'
Example No. 18
 def testPkl(self):
   " testing molecule pickle "
   import tempfile
   f, self.fName = tempfile.mkstemp('.pkl')
   f = None
   self.m = Chem.MolFromSmiles('CC(=O)CC')
   outF = open(self.fName, 'wb+')
   cPickle.dump(self.m, outF)
   outF.close()
   inF = open(self.fName, 'rb')
   m2 = cPickle.load(inF)
   inF.close()
   try:
     os.unlink(self.fName)
   except Exception:
     pass
   oldSmi = Chem.MolToSmiles(self.m)
   newSmi = Chem.MolToSmiles(m2)
   assert oldSmi == newSmi, 'string compare failed'
Example No. 19
    def testPkl(self):
        " testing molecule pickle "
        import tempfile

        f, self.fName = tempfile.mkstemp(".pkl")
        f = None
        self.m = Chem.MolFromSmiles("CC(=O)CC")
        outF = open(self.fName, "wb+")
        cPickle.dump(self.m, outF)
        outF.close()
        inF = open(self.fName, "rb")
        m2 = cPickle.load(inF)
        inF.close()
        try:
            os.unlink(self.fName)
        except Exception:
            pass
        oldSmi = Chem.MolToSmiles(self.m)
        newSmi = Chem.MolToSmiles(m2)
        assert oldSmi == newSmi, "string compare failed"
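The same round trip can also be done without pickle, since RDKit molecules expose a binary serialization; a short sketch independent of the test above:

from rdkit import Chem

m = Chem.MolFromSmiles('CC(=O)CC')
data = m.ToBinary()      # bytes accepted by the Mol constructor
m2 = Chem.Mol(data)
assert Chem.MolToSmiles(m) == Chem.MolToSmiles(m2)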
Example No. 20
def ClusterFromDetails(details):
  """ Returns the cluster tree

  """
  data = MolSimilarity.GetFingerprints(details)
  if details.maxMols > 0:
    data = data[:details.maxMols]
  if details.outFileName:
    try:
      outF = open(details.outFileName, 'wb+')
    except IOError:
      error("Error: could not open output file %s for writing\n" % (details.outFileName))
      return None
  else:
    outF = None

  if not data:
    return None

  clustTree = ClusterPoints(data, details.metric, details.clusterAlgo, haveLabels=0, haveActs=1)
  if outF:
    cPickle.dump(clustTree, outF)
  return clustTree
Example No. 21
def WritePickledData(outName,data):
  """ writes either a .qdat.pkl or a .dat.pkl file

    **Arguments**

      - outName: the name of the file to be used

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

  """
  varNames = data.GetVarNames()
  qBounds = data.GetQuantBounds()
  ptNames = data.GetPtNames()
  examples = data.GetAllData()
  with open(outName,'wb+') as outFile:
    cPickle.dump(varNames,outFile)
    cPickle.dump(qBounds,outFile)  
    cPickle.dump(ptNames,outFile)  
    cPickle.dump(examples,outFile)
Example No. 22
def WritePickledData(outName, data):
    """ writes either a .qdat.pkl or a .dat.pkl file

    **Arguments**

      - outName: the name of the file to be used

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

  """
    varNames = data.GetVarNames()
    qBounds = data.GetQuantBounds()
    ptNames = data.GetPtNames()
    examples = data.GetAllData()
    with open(outName, 'wb+') as outFile:
        cPickle.dump(varNames, outFile)
        cPickle.dump(qBounds, outFile)
        cPickle.dump(ptNames, outFile)
        cPickle.dump(examples, outFile)
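Because the four objects are dumped sequentially into one file, they must be loaded back in the same order. A sketch of a matching reader (ReadPickledData is a hypothetical helper, not part of the original code):

from rdkit.six.moves import cPickle

def ReadPickledData(inName):
    """ reads a file written by WritePickledData and returns its four pieces """
    with open(inName, 'rb') as inFile:
        varNames = cPickle.load(inFile)
        qBounds = cPickle.load(inFile)
        ptNames = cPickle.load(inFile)
        examples = cPickle.load(inFile)
    return varNames, qBounds, ptNames, examples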
Example No. 23
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
    nExamples = data.GetNPts()
    if details.lockRandom:
        seed = details.randomSeed
    else:
        import random
        seed = (random.randint(0, 1e6), random.randint(0, 1e6))
    DataUtils.InitRandomNumbers(seed)
    testExamples = []
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    if details.splitRun == 1:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples),
                                                   details.splitFrac,
                                                   silent=not _verbose)

        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
    else:
        testExamples = []
        testIdx = []
        trainIdx = range(len(namedExamples))
        trainExamples = namedExamples

    if details.filterFrac != 0.0:
        # if we're doing quantization on the fly, we need to handle that here:
        if hasattr(details, 'activityBounds') and details.activityBounds:
            tExamples = []
            bounds = details.activityBounds
            for pt in trainExamples:
                pt = pt[:]
                act = pt[-1]
                placed = 0
                bound = 0
                while not placed and bound < len(bounds):
                    if act < bounds[bound]:
                        pt[-1] = bound
                        placed = 1
                    else:
                        bound += 1
                if not placed:
                    pt[-1] = bound
                tExamples.append(pt)
        else:
            bounds = None
            tExamples = trainExamples
        trainIdx, temp = DataUtils.FilterData(tExamples,
                                              details.filterVal,
                                              details.filterFrac,
                                              -1,
                                              indicesOnly=1)
        tmp = [trainExamples[x] for x in trainIdx]
        testExamples += [trainExamples[x] for x in temp]
        trainExamples = tmp

        counts = DataUtils.CountResults(trainExamples, bounds=bounds)
        ks = sorted(counts.keys())
        message('Result Counts in training set:')
        for k in ks:
            message(str((k, counts[k])))
        counts = DataUtils.CountResults(testExamples, bounds=bounds)
        ks = sorted(counts.keys())
        message('Result Counts in test set:')
        for k in ks:
            message(str((k, counts[k])))
    nExamples = len(trainExamples)
    message('Training with %d examples' % (nExamples))

    nVars = data.GetNVars()
    attrs = list(range(1, nVars + 1))
    nPossibleVals = data.GetNPossibleVals()
    for i in range(1, len(nPossibleVals)):
        if nPossibleVals[i - 1] == -1:
            attrs.remove(i)

    if details.pickleDataFileName != '':
        pickleDataFile = open(details.pickleDataFileName, 'wb+')
        cPickle.dump(trainExamples, pickleDataFile)
        cPickle.dump(testExamples, pickleDataFile)
        pickleDataFile.close()

    if details.bayesModel:
        composite = BayesComposite.BayesComposite()
    else:
        composite = Composite.Composite()

    composite._randomSeed = seed
    composite._splitFrac = details.splitFrac
    composite._shuffleActivities = details.shuffleActivities
    composite._randomizeActivities = details.randomActivities

    if hasattr(details, 'filterFrac'):
        composite._filterFrac = details.filterFrac
    if hasattr(details, 'filterVal'):
        composite._filterVal = details.filterVal

    composite.SetModelFilterData(details.modelFilterFrac,
                                 details.modelFilterVal)

    composite.SetActivityQuantBounds(details.activityBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
        nPossibleVals[-1] = len(details.activityBounds) + 1

    if setDescNames:
        composite.SetInputOrder(data.GetVarNames())
        composite.SetDescriptorNames(details._descNames)
    else:
        composite.SetDescriptorNames(data.GetVarNames())
    composite.SetActivityQuantBounds(details.activityBounds)
    if details.nModels == 1:
        details.internalHoldoutFrac = 0.0
    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        composite.SetQuantBounds(details.qBounds)
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels,
                       pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy,
                       needsQuantization=0,
                       treeBuilder=builder,
                       nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       silent=not _verbose)

    elif details.useSigTrees:
        from rdkit.ML.DecTree import CrossValidate
        from rdkit.ML.DecTree import BuildSigTree
        builder = BuildSigTree.SigTreeBuilder
        driver = CrossValidate.CrossValidationDriver
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        if hasattr(details, 'sigTreeBiasList'):
            biasList = details.sigTreeBiasList
        else:
            biasList = None
        if hasattr(details, 'useCMIM'):
            useCMIM = details.useCMIM
        else:
            useCMIM = 0
        if hasattr(details, 'allowCollections'):
            allowCollections = details.allowCollections
        else:
            allowCollections = False
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       treeBuilder=builder,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       biasList=biasList,
                       useCMIM=useCMIM,
                       allowCollection=allowCollections,
                       silent=not _verbose)

    elif details.useKNN:
        from rdkit.ML.KNN import CrossValidate
        from rdkit.ML.KNN import DistFunctions

        driver = CrossValidate.CrossValidationDriver
        dfunc = ''
        if (details.knnDistFunc == "Euclidean"):
            dfunc = DistFunctions.EuclideanDist
        elif (details.knnDistFunc == "Tanimoto"):
            dfunc = DistFunctions.TanimotoDist
        else:
            assert 0, "Bad KNN distance metric value"

        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       numNeigh=details.knnNeighs,
                       holdOutFrac=details.internalHoldoutFrac,
                       distFunc=dfunc)

    elif details.useNaiveBayes or details.useSigBayes:
        from rdkit.ML.NaiveBayes import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           mEstimateVal=details.mEstimateVal,
                           silent=not _verbose)
        else:
            if hasattr(details, 'useCMIM'):
                useCMIM = details.useCMIM
            else:
                useCMIM = 0

            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           mEstimateVal=details.mEstimateVal,
                           useSigs=True,
                           useCMIM=useCMIM,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           silent=not _verbose)


##   elif details.useSVM:
##     from rdkit.ML.SVM import CrossValidate
##     driver = CrossValidate.CrossValidationDriver
##     composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
##                    buildDriver=driver, nTries=details.nModels,
##                    needsQuantization=0,
##                    cost=details.svmCost,gamma=details.svmGamma,
##                    weights=details.svmWeights,degree=details.svmDegree,
##                    type=details.svmType,kernelType=details.svmKernel,
##                    coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
##                    cache_size=details.svmCache,shrinking=details.svmShrink,
##                    dataType=details.svmDataType,
##                    holdOutFrac=details.internalHoldoutFrac,
##                    replacementSelection=details.replacementSelection,
##                    silent=not _verbose)

    else:
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples,
                       attrs, [0] + nPossibleVals,
                       nTries=details.nModels,
                       buildDriver=driver,
                       needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for i in range(len(modelList)):
        modelList[i].NameModel(composite._varNames)

    # do final statistics
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    devs = numpy.sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # blow out the saved examples and then save the composite:
    composite.ClearModelExamples()
    if saveIt:
        composite.Pickle(details.outName)
    details.model = DbModule.binaryHolder(cPickle.dumps(composite))

    badExamples = []
    if not details.detailedRes and (not hasattr(details, 'noScreen')
                                    or not details.noScreen):
        if details.splitRun:
            message('Testing all hold-out examples')
            wrong = testall(composite, testExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(testExamples))))
            _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
        else:
            message('Testing all examples')
            wrong = testall(composite, namedExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(namedExamples))))
            _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data,
                                                 composite, nPossibleVals[-1],
                                                 details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        nRej = nClass - nPts
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

        if details.splitRun:
            message('\nHold-out data:')
            resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)),
                                                     testExamples, composite,
                                                     nPossibleVals[-1],
                                                     details.threshold)
            nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
            nPts = len(testExamples)
            nClass = nGood + nBad
            _runDetails.holdout_error = float(nBad) / nClass
            _runDetails.holdout_correct_conf = avgGood
            _runDetails.holdout_incorrect_conf = avgBad
            _runDetails.holdout_result_matrix = repr(voteTab)
            nRej = nClass - nPts
            if nRej > 0:
                _runDetails.holdout_fraction_dropped = float(nRej) / nPts

    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' %
                (details.dbName, details.persistTblName))
        details.Store(db=details.dbName, table=details.persistTblName)

    if details.badName != '':
        badFile = open(details.badName, 'w+')
        for i in range(len(badExamples)):
            ex = badExamples[i]
            vote = wrong[i]
            outStr = '%s\t%s\n' % (ex, vote)
            badFile.write(outStr)
        badFile.close()

    composite.ClearModelExamples()
    return composite
Example No. 24
from rdkit.Chem.PyMol import MolViewer
from rdkit.Chem.Subshape import SubshapeBuilder, SubshapeObjects, SubshapeAligner
from rdkit.six.moves import cPickle
import copy

m1 = Chem.MolFromMolFile('test_data/square1.mol')
m2 = Chem.MolFromMolFile('test_data/square2.mol')

b = SubshapeBuilder.SubshapeBuilder()
b.gridDims = (10., 10., 5)
b.gridSpacing = 0.4
b.winRad = 2.0
if 1:
  print('m1:')
  s1 = b.GenerateSubshapeShape(m1)
  cPickle.dump(s1, open('test_data/square1.shp.pkl', 'wb+'))
  print('m2:')
  s2 = b.GenerateSubshapeShape(m2)
  cPickle.dump(s2, open('test_data/square2.shp.pkl', 'wb+'))
  ns1 = b.CombineSubshapes(s1, s2)
  b.GenerateSubshapeSkeleton(ns1)
  cPickle.dump(ns1, open('test_data/combined.shp.pkl', 'wb+'))
else:
  s1 = cPickle.load(open('test_data/square1.shp.pkl', 'rb'))
  s2 = cPickle.load(open('test_data/square2.shp.pkl', 'rb'))
  #ns1 = cPickle.load(file('test_data/combined.shp.pkl','rb'))
  ns1 = cPickle.load(open('test_data/combined.shp.pkl', 'rb'))

v = MolViewer()
SubshapeObjects.DisplaySubshape(v, s1, 'shape1')
SubshapeObjects.DisplaySubshape(v, ns1, 'ns1')
Example No. 25
  def Pickle(self, fileName='foo.pkl'):
    """ Pickles the tree and writes it to disk

    """
    with open(fileName, 'wb+') as pFile:
      cPickle.dump(self, pFile)
Example No. 26
    def Pickle(self, fileName="foo.pkl"):
        """ Pickles the tree and writes it to disk

    """
        with open(fileName, "wb+") as pFile:
            cPickle.dump(self, pFile)
Example No. 27
def FingerprintsFromDetails(details, reportFreq=10):
  data = None
  if details.dbName and details.tableName:
    from rdkit.Dbase.DbConnection import DbConnect
    from rdkit.Dbase import DbInfo
    from rdkit.ML.Data import DataUtils
    try:
      conn = DbConnect(details.dbName, details.tableName)
    except Exception:
      import traceback
      error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                       details.tableName))
      traceback.print_exc()
    if not details.idName:
      details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
    dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                 what='%s,%s' % (details.idName, details.smilesName))
    idCol = 0
    smiCol = 1
  elif details.inFileName and details.useSmiles:
    from rdkit.ML.Data import DataUtils
    conn = None
    if not details.idName:
      details.idName = 'ID'
    try:
      dataSet = DataUtils.TextFileToData(details.inFileName,
                                         onlyCols=[details.idName, details.smilesName])
    except IOError:
      import traceback
      error('Problems reading from file %s\n' % (details.inFileName))
      traceback.print_exc()

    idCol = 0
    smiCol = 1
  elif details.inFileName and details.useSD:
    conn = None
    dataset = None
    if not details.idName:
      details.idName = 'ID'
    dataSet = []
    try:
      s = Chem.SDMolSupplier(details.inFileName)
    except Exception:
      import traceback
      error('Problems reading from file %s\n' % (details.inFileName))
      traceback.print_exc()
    else:
      while 1:
        try:
          m = next(s)
        except StopIteration:
          break
        if m:
          dataSet.append(m)
          if reportFreq > 0 and not len(dataSet) % reportFreq:
            message('Read %d molecules\n' % (len(dataSet)))
            if details.maxMols > 0 and len(dataSet) >= details.maxMols:
              break

    for i, mol in enumerate(dataSet):
      if mol.HasProp(details.idName):
        nm = mol.GetProp(details.idName)
      else:
        nm = mol.GetProp('_Name')
      dataSet[i] = (nm, mol)
  else:
    dataSet = None

  fps = None
  if dataSet and not details.useSD:
    data = dataSet.GetNamedData()
    if not details.molPklName:
      fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
    else:
      fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
  elif dataSet and details.useSD:
    fps = FingerprintsFromMols(dataSet, **details.__dict__)

  if fps:
    if details.outFileName:
      outF = open(details.outFileName, 'wb+')
      for i in range(len(fps)):
        cPickle.dump(fps[i], outF)
      outF.close()
    dbName = details.outDbName or details.dbName
    if details.outTableName and dbName:
      from rdkit.Dbase.DbConnection import DbConnect
      from rdkit.Dbase import DbUtils, DbModule
      conn = DbConnect(dbName)
      #
      #  We don't have a db open already, so we'll need to figure out
      #    the types of our columns...
      #
      colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
      typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                        keyCol=details.idName)
      cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

      # FIX: we should really check to see if the table
      #  is already there and, if so, add the appropriate
      #  column.

      #
      # create the new table
      #
      if details.replaceTable or \
         details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
        conn.AddTable(details.outTableName, cols)

      #
      # And add the data
      #
      for ID, fp in fps:
        tpl = ID, DbModule.binaryHolder(fp.ToBinary())
        conn.InsertData(details.outTableName, tpl)
      conn.Commit()
  return fps
Example No. 28
        for example in examples:
            res = net.ClassifyExample(example[:-1])
            print("%f -> %f" % (example[-1], res))

        return net

    def runProfile(command):
        import random

        random.seed(23)
        import profile, pstats

        datFile = "%s.prof.dat" % (command)
        profile.run("%s()" % command, datFile)
        stats = pstats.Stats(datFile)
        stats.strip_dirs()
        stats.sort_stats("time").print_stats()

    if 0:
        net = testXor()
        print("Xor:", net)
        from rdkit.six.moves import cPickle

        outF = open("xornet.pkl", "wb+")
        cPickle.dump(net, outF)
        outF.close()
    else:
        # runProfile('testLinear')
        net = testLinear()
        # net = testOr()
Example No. 29
  cat = None
  obls = None
  if details.doBuild:
    if not suppl:
      message("We require inData to generate a catalog\n")
      sys.exit(-2)
    message("Building catalog\n")
    t1 = time.time()
    cat = BuildCatalog(suppl, maxPts=details.numMols, minPath=details.minPath,
                       maxPath=details.maxPath)
    t2 = time.time()
    message("\tThat took %.2f seconds.\n" % (t2 - t1))
    if details.catalogName:
      message("Dumping catalog data\n")
      cPickle.dump(cat, open(details.catalogName, 'wb+'))
  elif details.catalogName:
    message("Loading catalog\n")
    cat = cPickle.load(open(details.catalogName, 'rb'))
    if details.onBitsName:
      try:
        obls = cPickle.load(open(details.onBitsName, 'rb'))
      except Exception:
        obls = None
      else:
        if len(obls) < (inD.count('\n') - 1):
          obls = None
  scores = None
  if details.doScore:
    if not suppl:
      message("We require inData to score molecules\n")
Example No. 30
 obls = None
 if details.doBuild:
     if not suppl:
         message("We require inData to generate a catalog\n")
         sys.exit(-2)
     message("Building catalog\n")
     t1 = time.time()
     cat = BuildCatalog(suppl,
                        maxPts=details.numMols,
                        minPath=details.minPath,
                        maxPath=details.maxPath)
     t2 = time.time()
     message("\tThat took %.2f seconds.\n" % (t2 - t1))
     if details.catalogName:
         message("Dumping catalog data\n")
         cPickle.dump(cat, open(details.catalogName, 'wb+'))
 elif details.catalogName:
     message("Loading catalog\n")
     cat = cPickle.load(open(details.catalogName, 'rb'))
     if details.onBitsName:
         try:
             obls = cPickle.load(open(details.onBitsName, 'rb'))
         except Exception:
             obls = None
         else:
             if len(obls) < (inD.count('\n') - 1):
                 obls = None
 scores = None
 if details.doScore:
     if not suppl:
         message("We require inData to score molecules\n")
Example No. 31
        cmpd = Chem.AddHs(cmpd)
        AllChem.EmbedMolecule(cmpd)
        AllChem.UFFOptimizeMolecule(cmpd)
        AllChem.CanonicalizeMol(cmpd)
        # print(Chem.MolToMolBlock(cmpd), file=file('testmol.mol', 'w+'))
    else:
        cmpd = Chem.MolFromMolFile('testmol.mol')
    builder = SubshapeBuilder()
    if 1:
        shape = builder.GenerateSubshapeShape(cmpd)
    v = MolViewer()
    if 1:
        tmpFile = tempfile.mktemp('.grd')
        v.server.deleteAll()
        Geometry.WriteGridToFile(shape.grid, tmpFile)
        time.sleep(1)
        v.ShowMol(cmpd, name='testMol', showOnly=True)
        v.server.loadSurface(tmpFile, 'testGrid', '', 2.5)
    v.server.resetCGO('*')

    with open('subshape.pkl', 'wb+') as f:
        cPickle.dump(shape, f)
    for i, pt in enumerate(shape.skelPts):
        v.server.sphere(tuple(pt.location), .5, (1, 0, 1), 'Pt-%d' % i)
        if not hasattr(pt, 'shapeDirs'):
            continue
        momBeg = pt.location - pt.shapeDirs[0]
        momEnd = pt.location + pt.shapeDirs[0]
        v.server.cylinder(tuple(momBeg), tuple(momEnd), .1, (1, 0, 1),
                          'v-%d' % i)
Example No. 32
c1.drawPolygon([(100, 100), (100, 200), (200, 200), (200, 100)],
               fillColor=pid.Color(0, 0, 1))
c1.drawLines([(100, 100, 200, 200), (100, 200, 200, 100)],
             color=pid.Color(0, 1, 0),
             width=2)

# because the log has been instantiated with clear() as the loggerFlushCommand,
# this will blow out the log as well as the contents of the canvas.
c1.clear()

# draw some more stuff
c1.drawPolygon([(100, 100), (100, 200), (200, 200), (200, 100)],
               fillColor=pid.Color(1, 0, 0))
c1.drawLines([(100, 100, 200, 200), (100, 200, 200, 100)],
             color=pid.Color(0, 0, 0),
             width=2)
# and write the resulting file.
c1.save()

# save the log by pickling it.
from rdkit.six.moves import cPickle
cPickle.dump(c1._LoggerGetLog(), open('foo.pkl', 'wb+'))

# create a new canvas
c2 = pidPIL.PILCanvas(sz, 'foo.png')
# read the pickled log back in
t = cPickle.load(open('foo.pkl', 'rb'))
# and play the log on the new canvas
Logger.replay(t, c2)
# there should now be a file 'foo.png' with the image
Example No. 33
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
  if details.lockRandom:
    seed = details.randomSeed
  else:
    import random
    seed = (random.randint(0, 1e6), random.randint(0, 1e6))
  DataUtils.InitRandomNumbers(seed)
  testExamples = []
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

  namedExamples = data.GetNamedData()
  if details.splitRun == 1:
    trainIdx, testIdx = SplitData.SplitIndices(
      len(namedExamples), details.splitFrac, silent=not _verbose)

    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
  else:
    testExamples = []
    testIdx = []
    trainIdx = list(range(len(namedExamples)))
    trainExamples = namedExamples

  if details.filterFrac != 0.0:
    # if we're doing quantization on the fly, we need to handle that here:
    if hasattr(details, 'activityBounds') and details.activityBounds:
      tExamples = []
      bounds = details.activityBounds
      for pt in trainExamples:
        pt = pt[:]
        act = pt[-1]
        placed = 0
        bound = 0
        while not placed and bound < len(bounds):
          if act < bounds[bound]:
            pt[-1] = bound
            placed = 1
          else:
            bound += 1
        if not placed:
          pt[-1] = bound
        tExamples.append(pt)
    else:
      bounds = None
      tExamples = trainExamples
    trainIdx, temp = DataUtils.FilterData(tExamples, details.filterVal, details.filterFrac, -1,
                                          indicesOnly=1)
    tmp = [trainExamples[x] for x in trainIdx]
    testExamples += [trainExamples[x] for x in temp]
    trainExamples = tmp

    counts = DataUtils.CountResults(trainExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in training set:')
    for k in ks:
      message(str((k, counts[k])))
    counts = DataUtils.CountResults(testExamples, bounds=bounds)
    ks = sorted(counts.keys())
    message('Result Counts in test set:')
    for k in ks:
      message(str((k, counts[k])))
  nExamples = len(trainExamples)
  message('Training with %d examples' % (nExamples))

  nVars = data.GetNVars()
  attrs = list(range(1, nVars + 1))
  nPossibleVals = data.GetNPossibleVals()
  for i in range(1, len(nPossibleVals)):
    if nPossibleVals[i - 1] == -1:
      attrs.remove(i)

  if details.pickleDataFileName != '':
    pickleDataFile = open(details.pickleDataFileName, 'wb+')
    cPickle.dump(trainExamples, pickleDataFile)
    cPickle.dump(testExamples, pickleDataFile)
    pickleDataFile.close()

  if details.bayesModel:
    composite = BayesComposite.BayesComposite()
  else:
    composite = Composite.Composite()

  composite._randomSeed = seed
  composite._splitFrac = details.splitFrac
  composite._shuffleActivities = details.shuffleActivities
  composite._randomizeActivities = details.randomActivities

  if hasattr(details, 'filterFrac'):
    composite._filterFrac = details.filterFrac
  if hasattr(details, 'filterVal'):
    composite._filterVal = details.filterVal

  composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

  composite.SetActivityQuantBounds(details.activityBounds)
  nPossibleVals = data.GetNPossibleVals()
  if details.activityBounds:
    nPossibleVals[-1] = len(details.activityBounds) + 1

  if setDescNames:
    composite.SetInputOrder(data.GetVarNames())
    composite.SetDescriptorNames(details._descNames)
  else:
    composite.SetDescriptorNames(data.GetVarNames())
  composite.SetActivityQuantBounds(details.activityBounds)
  if details.nModels == 1:
    details.internalHoldoutFrac = 0.0
  if details.useTrees:
    from rdkit.ML.DecTree import CrossValidate, PruneTree
    if details.qBounds != []:
      from rdkit.ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from rdkit.ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    composite.SetQuantBounds(details.qBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    composite.Grow(
      trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver, pruner=pruner,
      nTries=details.nModels, pruneIt=details.pruneIt, lessGreedy=details.lessGreedy,
      needsQuantization=0, treeBuilder=builder, nQuantBounds=details.qBounds,
      startAt=details.startAt, maxDepth=details.limitDepth, progressCallback=progressCallback,
      holdOutFrac=details.internalHoldoutFrac, replacementSelection=details.replacementSelection,
      recycleVars=details.recycleVars, randomDescriptors=details.randomDescriptors,
      silent=not _verbose)

  elif details.useSigTrees:
    from rdkit.ML.DecTree import CrossValidate
    from rdkit.ML.DecTree import BuildSigTree
    builder = BuildSigTree.SigTreeBuilder
    driver = CrossValidate.CrossValidationDriver
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds) + 1
    if hasattr(details, 'sigTreeBiasList'):
      biasList = details.sigTreeBiasList
    else:
      biasList = None
    if hasattr(details, 'useCMIM'):
      useCMIM = details.useCMIM
    else:
      useCMIM = 0
    if hasattr(details, 'allowCollections'):
      allowCollections = details.allowCollections
    else:
      allowCollections = False
    composite.Grow(
      trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
      nTries=details.nModels, needsQuantization=0, treeBuilder=builder, maxDepth=details.limitDepth,
      progressCallback=progressCallback, holdOutFrac=details.internalHoldoutFrac,
      replacementSelection=details.replacementSelection, recycleVars=details.recycleVars,
      randomDescriptors=details.randomDescriptors, biasList=biasList, useCMIM=useCMIM,
      allowCollection=allowCollections, silent=not _verbose)

  elif details.useKNN:
    from rdkit.ML.KNN import CrossValidate
    from rdkit.ML.KNN import DistFunctions

    driver = CrossValidate.CrossValidationDriver
    dfunc = ''
    if (details.knnDistFunc == "Euclidean"):
      dfunc = DistFunctions.EuclideanDist
    elif (details.knnDistFunc == "Tanimoto"):
      dfunc = DistFunctions.TanimotoDist
    else:
      assert 0, "Bad KNN distance metric value"

    composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                   nTries=details.nModels, needsQuantization=0, numNeigh=details.knnNeighs,
                   holdOutFrac=details.internalHoldoutFrac, distFunc=dfunc)

  elif details.useNaiveBayes or details.useSigBayes:
    from rdkit.ML.NaiveBayes import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    if not (hasattr(details, 'useSigBayes') and details.useSigBayes):
      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection,
                     mEstimateVal=details.mEstimateVal, silent=not _verbose)
    else:
      if hasattr(details, 'useCMIM'):
        useCMIM = details.useCMIM
      else:
        useCMIM = 0

      composite.Grow(trainExamples, attrs, nPossibleVals=[0] + nPossibleVals, buildDriver=driver,
                     nTries=details.nModels, needsQuantization=0, nQuantBounds=details.qBounds,
                     mEstimateVal=details.mEstimateVal, useSigs=True, useCMIM=useCMIM,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection, silent=not _verbose)

    # #   elif details.useSVM:
    # #     from rdkit.ML.SVM import CrossValidate
    # #     driver = CrossValidate.CrossValidationDriver
    # #     composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
    # #                    buildDriver=driver, nTries=details.nModels,
    # #                    needsQuantization=0,
    # #                    cost=details.svmCost,gamma=details.svmGamma,
    # #                    weights=details.svmWeights,degree=details.svmDegree,
    # #                    type=details.svmType,kernelType=details.svmKernel,
    # #                    coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
    # #                    cache_size=details.svmCache,shrinking=details.svmShrink,
    # #                    dataType=details.svmDataType,
    # #                    holdOutFrac=details.internalHoldoutFrac,
    # #                    replacementSelection=details.replacementSelection,
    # #                    silent=not _verbose)

  else:
    from rdkit.ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples, attrs, [0] + nPossibleVals, nTries=details.nModels,
                   buildDriver=driver, needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList, counts, avgErrs = composite.GetAllData()
  counts = numpy.array(counts)
  avgErrs = numpy.array(avgErrs)
  composite._varNames = data.GetVarNames()

  for i in range(len(modelList)):
    modelList[i].NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts * avgErrs
  averageErr = sum(weightedErrs) / sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = numpy.sqrt(devs * devs)
  avgDev = sum(devs) / sum(counts)
  message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
          (100. * averageErr, 100. * avgDev))

  if details.bayesModel:
    composite.Train(trainExamples, verbose=0)

  # blow out the saved examples and then save the composite:
  composite.ClearModelExamples()
  if saveIt:
    composite.Pickle(details.outName)
  details.model = DbModule.binaryHolder(cPickle.dumps(composite))

  badExamples = []
  if not details.detailedRes and (not hasattr(details, 'noScreen') or not details.noScreen):
    if details.splitRun:
      message('Testing all hold-out examples')
      wrong = testall(composite, testExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' % (len(wrong), 100. * float(len(wrong)) /
                                                             float(len(testExamples))))
      _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
    else:
      message('Testing all examples')
      wrong = testall(composite, namedExamples, badExamples)
      message('%d examples (%% %5.2f) were misclassified' % (len(wrong), 100. * float(len(wrong)) /
                                                             float(len(namedExamples))))
      _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

  if details.detailedRes:
    message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(
      range(data.GetNPts()), data, composite, nPossibleVals[-1], details.threshold)
    nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood + nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    nRej = nClass - nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej) / nPts

    if details.splitRun:
      message('\nHold-out data:')
      resTup = ScreenComposite.ShowVoteResults(
        range(len(testExamples)), testExamples, composite, nPossibleVals[-1], details.threshold)
      nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
      nPts = len(testExamples)
      nClass = nGood + nBad
      _runDetails.holdout_error = float(nBad) / nClass
      _runDetails.holdout_correct_conf = avgGood
      _runDetails.holdout_incorrect_conf = avgBad
      _runDetails.holdout_result_matrix = repr(voteTab)
      nRej = nClass - nPts
      if nRej > 0:
        _runDetails.holdout_fraction_dropped = float(nRej) / nPts

  if details.persistTblName and details.dbName:
    message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
    details.Store(db=details.dbName, table=details.persistTblName)

  if details.badName != '':
    badFile = open(details.badName, 'w+')
    for i in range(len(badExamples)):
      ex = badExamples[i]
      vote = wrong[i]
      outStr = '%s\t%s\n' % (ex, vote)
      badFile.write(outStr)
    badFile.close()

  composite.ClearModelExamples()
  return composite
Example No. 34
    t.TrainOnLine(examples, net, errTol=0.1, useAvgErr=0)
    print('classifications:')
    for example in examples:
      res = net.ClassifyExample(example[:-1])
      print('%f -> %f' % (example[-1], res))

    return net

  def runProfile(command):
    import random
    random.seed(23)
    import profile
    import pstats
    datFile = '%s.prof.dat' % (command)
    profile.run('%s()' % command, datFile)
    stats = pstats.Stats(datFile)
    stats.strip_dirs()
    stats.sort_stats('time').print_stats()

  if 0:
    net = testXor()
    print('Xor:', net)
    from rdkit.six.moves import cPickle
    outF = open('xornet.pkl', 'wb+')
    cPickle.dump(net, outF)
    outF.close()
  else:
    # runProfile('testLinear')
    net = testLinear()
    # net = testOr()
Example No. 35
        cmpd = Chem.MolFromSmiles('C1=CC=C1C#CC1=CC=C1')
        cmpd = Chem.AddHs(cmpd)
        AllChem.EmbedMolecule(cmpd)
        AllChem.UFFOptimizeMolecule(cmpd)
        AllChem.CanonicalizeMol(cmpd)
        print(Chem.MolToMolBlock(cmpd), file=open('testmol.mol', 'w+'))
    else:
        cmpd = Chem.MolFromMolFile('testmol.mol')
    builder = SubshapeBuilder()
    if 1:
        shape = builder.GenerateSubshapeShape(cmpd)
    v = MolViewer()
    if 1:
        import tempfile
        tmpFile = tempfile.mktemp('.grd')
        v.server.deleteAll()
        Geometry.WriteGridToFile(shape.grid, tmpFile)
        time.sleep(1)
        v.ShowMol(cmpd, name='testMol', showOnly=True)
        v.server.loadSurface(tmpFile, 'testGrid', '', 2.5)
    v.server.resetCGO('*')

    cPickle.dump(shape, open('subshape.pkl', 'wb+'))
    for i, pt in enumerate(shape.skelPts):
        v.server.sphere(tuple(pt.location), .5, (1, 0, 1), 'Pt-%d' % i)
        if not hasattr(pt, 'shapeDirs'): continue
        momBeg = pt.location - pt.shapeDirs[0]
        momEnd = pt.location + pt.shapeDirs[0]
        v.server.cylinder(tuple(momBeg), tuple(momEnd), .1, (1, 0, 1),
                          'v-%d' % i)
Example No. 36
def FingerprintsFromDetails(details, reportFreq=10):
    data = None
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' %
                  (details.dbName, details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName,
                                                   details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName,
                                     details.tableName,
                                     what='%s,%s' %
                                     (details.idName, details.smilesName))
        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        conn = None
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(
                details.inFileName,
                onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()

        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSD:
        conn = None
        dataset = None
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            s = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            while 1:
                try:
                    m = next(s)
                except StopIteration:
                    break
                if m:
                    dataSet.append(m)
                    if reportFreq > 0 and not len(dataSet) % reportFreq:
                        message('Read %d molecules\n' % (len(dataSet)))
                        if details.maxMols > 0 and len(
                                dataSet) >= details.maxMols:
                            break

        for i, mol in enumerate(dataSet):
            if mol.HasProp(details.idName):
                nm = mol.GetProp(details.idName)
            else:
                nm = mol.GetProp('_Name')
            dataSet[i] = (nm, mol)
    else:
        dataSet = None

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            outF = open(details.outFileName, 'wb+')
            for i in range(len(fps)):
                cPickle.dump(fps[i], outF)
            outF.close()
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings(
                [details.idName, details.smilesName],
                colTypes,
                keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName,
                                  DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for ID, fp in fps:
                tpl = ID, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
Example No. 37
def GenRandomExamples(nVars=10, randScale=0.3, bitProb=0.5, nExamples=500, seed=(0, 0),
                      addResults=1):
  random.seed(seed[0])
  varWeights = numpy.array([random.random() for _ in range(nVars)]) * randScale
  examples = [None] * nExamples

  for i in range(nExamples):
    varVals = [random.random() > bitProb for _ in range(nVars)]
    temp = numpy.array(varVals) * varWeights
    res = sum(temp)
    if addResults:
      varVals.append(res >= 1.)
    examples[i] = varVals

  nPossibleVals = [2] * (nExamples + 1)
  attrs = list(range(nVars))

  return (examples, attrs, nPossibleVals)


if __name__ == '__main__':  # pragma: nocover
  from rdkit.six.moves import cPickle
  examples, attrs, nPossibleVals = GenRandomExamples()
  outF = open('random.dat.pkl', 'wb+')
  cPickle.dump(examples, outF)
  cPickle.dump(attrs, outF)
  cPickle.dump(nPossibleVals, outF)

  tree = ID3.ID3Boot(examples, attrs, nPossibleVals)
  tree.Pickle('save.pkl')
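The three dumps to random.dat.pkl can be recovered by loading in the same order they were written; a minimal sketch:

from rdkit.six.moves import cPickle

with open('random.dat.pkl', 'rb') as inF:
    examples = cPickle.load(inF)
    attrs = cPickle.load(inF)
    nPossibleVals = cPickle.load(inF)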
Example No. 38
    cmpd = Chem.MolFromSmiles('C1=CC=C1C#CC1=CC=C1')
    cmpd = Chem.AddHs(cmpd)
    AllChem.EmbedMolecule(cmpd)
    AllChem.UFFOptimizeMolecule(cmpd)
    AllChem.CanonicalizeMol(cmpd)
    print(Chem.MolToMolBlock(cmpd), file=open('testmol.mol', 'w+'))
  else:
    cmpd = Chem.MolFromMolFile('testmol.mol')
  builder=SubshapeBuilder()
  if 1:
    shape=builder.GenerateSubshapeShape(cmpd)
  v = MolViewer()
  if 1:
    import tempfile
    tmpFile = tempfile.mktemp('.grd')
    v.server.deleteAll()
    Geometry.WriteGridToFile(shape.grid,tmpFile)
    time.sleep(1)
    v.ShowMol(cmpd,name='testMol',showOnly=True)
    v.server.loadSurface(tmpFile,'testGrid','',2.5)
  v.server.resetCGO('*')

  cPickle.dump(shape, open('subshape.pkl', 'wb+'))
  for i,pt in enumerate(shape.skelPts):
    v.server.sphere(tuple(pt.location),.5,(1,0,1),'Pt-%d'%i)
    if not hasattr(pt,'shapeDirs'): continue
    momBeg = pt.location-pt.shapeDirs[0]
    momEnd = pt.location+pt.shapeDirs[0]
    v.server.cylinder(tuple(momBeg),tuple(momEnd),.1,(1,0,1),'v-%d'%i)

Example No. 39
# create a logged canvas and draw on it
sz = (300,300)
c1 = Logger.Logger(pidSVG.SVGCanvas,sz,'foo.svg',loggerFlushCommand='clear')
c1.drawPolygon([(100,100),(100,200),(200,200),(200,100)],fillColor=pid.Color(0,0,1))
c1.drawLines([(100,100,200,200),(100,200,200,100)],color=pid.Color(0,1,0),width=2)

# because the log has been instantiated with clear() as the loggerFlushCommand, 
# this will blow out the log as well as the contents of the canvas.
c1.clear()

# draw some more stuff
c1.drawPolygon([(100,100),(100,200),(200,200),(200,100)],fillColor=pid.Color(1,0,0))
c1.drawLines([(100,100,200,200),(100,200,200,100)],color=pid.Color(0,0,0),width=2)
# and write the resulting file.
c1.save()

# save the log by pickling it.
from rdkit.six.moves import cPickle
cPickle.dump(c1._LoggerGetLog(),open('foo.pkl','wb+'))

# create a new canvas
c2 = pidPIL.PILCanvas(sz,'foo.png')
# read the pickled log back in 
t = cPickle.load(open('foo.pkl','rb'))
# and play the log on the new canvas
Logger.replay(t,c2)
# there should now be a file 'foo.png' with the image