示例#1
0
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom
RDRandom.seed(23)


# Picker object from rdkit's rdSimDivPickers module; it is only constructed
# in this excerpt, not used.
pkr = rdsimdiv.MaxMinPicker()

n = 1000  # number of random 2D points to generate
m = 80  # not used within this excerpt
dataPts = []
for _ in range(n):
    # each point is a 2-element double array with both coordinates
    # scaled by 10; x is drawn before y so the random sequence is unchanged
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# It is stored as a flat vector with one entry per pair (i, j), i < j.
# Floor division (//) keeps the length and the offsets integral: on
# Python 3 a plain / would yield a float, which numpy does not accept as
# an array size or as an index.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset chosen so that itab + j is the flat index of the pair (i, j)
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        diff = dataPts[j] - pt1
        # Euclidean distance between points i and j
        distMat[itab + j] = numpy.sqrt(numpy.dot(diff, diff))
        
示例#2
0
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom
RDRandom.seed(23)

# Picker object from rdkit's rdSimDivPickers module; used at the bottom of
# the script.
pkr = rdsimdiv.MaxMinPicker()

n = 1000  # number of random 2D points to generate
m = 80  # third argument to pkr.Pick() below
dataPts = []
for _ in range(n):
    # each point is a 2-element double array with both coordinates
    # scaled by 10; x is drawn before y so the random sequence is unchanged
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# It is stored as a flat vector with one entry per pair (i, j), i < j.
# Floor division (//) keeps the length and the offsets integral: on
# Python 3 a plain / would yield a float, which numpy does not accept as
# an array size or as an index.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset chosen so that itab + j is the flat index of the pair (i, j)
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        diff = dataPts[j] - pt1
        # Euclidean distance between points i and j
        distMat[itab + j] = numpy.sqrt(numpy.dot(diff, diff))

# now do the picking: the picker receives the flat distance vector, the
# number of points and m (presumably the number of points to pick --
# confirm against the MaxMinPicker documentation).
# NOTE(review): also confirm that the pair ordering used to fill distMat
# matches the layout Pick() expects.
res = pkr.Pick(distMat, n, m)
示例#3
0
def SplitIndices(nPts,frac,silent=1,legacy=0,replacement=0):
  """ splits the indices 0..nPts-1 of a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set;
       must lie between 0.0 and 1.0 (inclusive)

     - silent: (optional) if false, the sizes of the two sets are printed

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement; this takes
       precedence over _legacy_ when both are set

   **Returns**

     a 2-tuple containing the two lists of indices: the first (training)
     set followed by the second (hold-out) set.

   **Raises**

     - ValueError: if _frac_ is smaller than 0.0 or larger than 1.0

   **Notes**

     - the _legacy_ splitting approach uses randomly-generated floats
       and compares them to _frac_.  This is provided for
       backwards-compatibility reasons.

     - the default splitting approach uses a random permutation of
       indices which is split into two parts.

     - selection with replacement can generate duplicates.


  **Usage**:

  We'll start with a set of indices and pick from them using
  the three different approaches:
  >>> from rdkit.ML.Data import DataUtils

  The base approach always returns the same number of compounds in
  each set and has no duplicates:
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5)
  >>> train
  [1, 5, 6, 4, 2]
  >>> test
  [3, 0, 7, 8, 9]

  >>> train,test = SplitIndices(10,.5)
  >>> train
  [5, 2, 9, 8, 7]
  >>> test
  [6, 0, 3, 1, 4]


  The legacy approach can return varying numbers, but still has no
  duplicates.  Note the indices come back ordered:
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5,legacy=1)
  >>> train
  [0, 1, 2, 3, 4, 7, 9]
  >>> test
  [5, 6, 8]
  >>> train,test = SplitIndices(10,.5,legacy=1)
  >>> train
  [4, 5, 7, 8, 9]
  >>> test
  [0, 1, 2, 3, 6]

  The replacement approach returns a fixed number in the training set,
  a variable number in the test set and can contain duplicates in the
  training set.
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5,replacement=1)
  >>> train
  [1, 1, 3, 0, 1]
  >>> test
  [2, 4, 5, 6, 7, 8, 9]
  >>> train,test = SplitIndices(10,.5,replacement=1)
  >>> train
  [9, 5, 4, 8, 0]
  >>> test
  [1, 2, 3, 6, 7]

  """
  if frac<0. or frac > 1.:
    raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)'%(frac))

  if replacement:
    # draw nTrain indices independently, so the same index may repeat
    nTrain = int(nPts*frac)
    resData = []
    for _ in range(nTrain):
      val = int(RDRandom.random()*nPts)
      # clamp a draw that lands exactly on nPts back into the valid range
      if val==nPts: val = nPts-1
      resData.append(val)
    # everything never drawn goes to the hold-out set; the set gives an
    # O(1) membership test instead of rescanning the resData list
    picked = set(resData)
    resTest = [i for i in range(nPts) if i not in picked]
  elif legacy:
    # one random float per index decides which side that index lands on,
    # so the set sizes vary but the indices stay in ascending order
    resData = []
    resTest = []
    for i in range(nPts):
      if RDRandom.random() < frac:
        resData.append(i)
      else:
        resTest.append(i)
  else:
    # shuffle a real list: on Python 3 range() is not a mutable sequence
    # and cannot be shuffled in place
    perm = list(range(nPts))
    random.shuffle(perm)
    nTrain = int(nPts*frac)

    resData = perm[:nTrain]
    resTest = perm[nTrain:]

  if not silent:
    # single-argument parenthesised form works as a print statement on
    # Python 2 and as a function call on Python 3
    print('Training with %d (of %d) points.'%(len(resData),nPts))
    print('\t%d points are in the hold-out set.'%(len(resTest)))
  return resData,resTest
示例#4
0
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
    """ splits the indices 0..nPts-1 of a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set;
       must lie between 0.0 and 1.0 (inclusive)

     - silent: (optional) if false, the sizes of the two sets are printed

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement; this takes
       precedence over _legacy_ when both are set

    **Returns**

     a 2-tuple containing the two lists of indices: the first (training)
     set followed by the second (hold-out) set.

    **Raises**

     - ValueError: if _frac_ is smaller than 0.0 or larger than 1.0

    **Notes**

     - the _legacy_ splitting approach uses randomly-generated floats
       and compares them to _frac_.  This is provided for
       backwards-compatibility reasons.

     - the default splitting approach uses a random permutation of
       indices which is split into two parts.

     - selection with replacement can generate duplicates.


    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:
    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5)
    >>> train
    [1, 5, 6, 4, 2]
    >>> test
    [3, 0, 7, 8, 9]

    >>> train,test = SplitIndices(10,.5)
    >>> train
    [5, 2, 9, 8, 7]
    >>> test
    [6, 0, 3, 1, 4]


    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [0, 1, 2, 3, 4, 7, 9]
    >>> test
    [5, 6, 8]
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [4, 5, 7, 8, 9]
    >>> test
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [1, 1, 3, 0, 1]
    >>> test
    [2, 4, 5, 6, 7, 8, 9]
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [9, 5, 4, 8, 0]
    >>> test
    [1, 2, 3, 6, 7]

    """
    if frac < 0. or frac > 1.:
        raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

    if replacement:
        # draw nTrain indices independently, so the same index may repeat
        nTrain = int(nPts * frac)
        resData = []
        for _ in range(nTrain):
            val = int(RDRandom.random() * nPts)
            # clamp a draw that lands exactly on nPts back into range
            if val == nPts:
                val = nPts - 1
            resData.append(val)
        # everything never drawn goes to the hold-out set; the set gives
        # an O(1) membership test instead of rescanning the resData list
        picked = set(resData)
        resTest = [i for i in range(nPts) if i not in picked]
    elif legacy:
        # one random float per index decides which side that index lands
        # on, so the set sizes vary but the indices stay in ascending order
        resData = []
        resTest = []
        for i in range(nPts):
            if RDRandom.random() < frac:
                resData.append(i)
            else:
                resTest.append(i)
    else:
        # shuffle a real list: on Python 3 range() is not a mutable
        # sequence and cannot be shuffled in place
        perm = list(range(nPts))
        random.shuffle(perm)
        nTrain = int(nPts * frac)

        resData = perm[:nTrain]
        resTest = perm[nTrain:]

    if not silent:
        # single-argument parenthesised form works as a print statement on
        # Python 2 and as a function call on Python 3
        print('Training with %d (of %d) points.' % (len(resData), nPts))
        print('\t%d points are in the hold-out set.' % (len(resTest)))
    return resData, resTest