from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom

# Build a reproducible set of random 2D points and the condensed
# (one-dimensional) matrix of their pairwise Euclidean distances.

# Fixed seed so that the same points are generated on every run.
RDRandom.seed(23)
pkr = rdsimdiv.MaxMinPicker()
n = 1000  # number of random points to generate
m = 80  # not used in this part of the script
dataPts = []
for i in range(n):
    # each coordinate is the random value scaled by 10
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# Only pairs with i < j are stored, row after row, so n*(n-1)/2 entries are
# needed.  Floor division keeps the size an integer whether or not "/"
# performs true division.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # Offset such that (itab + j) is the flat index of the pair (i, j), j > i.
    # (i+1)*(i+2) is always even, so the floor division is exact.
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        idx = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[idx] = dist
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom

# Build a reproducible set of random 2D points, compute the condensed
# (one-dimensional) matrix of their pairwise Euclidean distances and hand it
# to a MaxMinPicker.

# Fixed seed so that the same points are generated on every run.
RDRandom.seed(23)
pkr = rdsimdiv.MaxMinPicker()
n = 1000  # number of random points to generate
m = 80  # third argument to pkr.Pick (presumably the number of picks -- confirm)
dataPts = []
for i in range(n):
    # each coordinate is the random value scaled by 10
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# Only pairs with i < j are stored, row after row, so n*(n-1)/2 entries are
# needed.  Floor division keeps the size an integer whether or not "/"
# performs true division.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # Offset such that (itab + j) is the flat index of the pair (i, j), j > i.
    # (i+1)*(i+2) is always even, so the floor division is exact.
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        idx = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[idx] = dist

# now do the picking
res = pkr.Pick(distMat, n, m)
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
  """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set
       (must lie between 0.0 and 1.0)

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement
       (takes precedence over _legacy_ when both are set)

    **Returns**

      a 2-tuple containing the two sets of indices: the first (training)
      set, followed by the second (hold-out) set.

    **Raises**

      ValueError if _frac_ is smaller than 0.0 or larger than 1.0

    **Notes**

      - the _legacy_ splitting approach uses randomly-generated floats
        and compares them to _frac_.  This is provided for
        backwards-compatibility reasons.

      - the default splitting approach uses a random permutation of
        indices which is split into two parts.

      - selection with replacement can generate duplicates.

    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:
    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5)
    >>> train
    [1, 5, 6, 4, 2]
    >>> test
    [3, 0, 7, 8, 9]

    >>> train,test = SplitIndices(10,.5)
    >>> train
    [5, 2, 9, 8, 7]
    >>> test
    [6, 0, 3, 1, 4]

    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [0, 1, 2, 3, 4, 7, 9]
    >>> test
    [5, 6, 8]
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [4, 5, 7, 8, 9]
    >>> test
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [1, 1, 3, 0, 1]
    >>> test
    [2, 4, 5, 6, 7, 8, 9]
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [9, 5, 4, 8, 0]
    >>> test
    [1, 2, 3, 6, 7]

  """
  if frac < 0. or frac > 1.:
    raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

  if replacement:
    nTrain = int(nPts * frac)
    # Draw nTrain indices independently; min() guards against the draw
    # landing exactly on nPts.
    resData = [min(int(RDRandom.random() * nPts), nPts - 1) for _ in range(nTrain)]
    # Everything that was never drawn goes to the hold-out set.  A set keeps
    # the membership test O(1) instead of rescanning the list for each index.
    picked = set(resData)
    resTest = [i for i in range(nPts) if i not in picked]
  elif legacy:
    resData = []
    resTest = []
    # One random draw per index decides which side it lands on, so the
    # sizes of the two sets vary from call to call.
    for i in range(nPts):
      val = RDRandom.random()
      if val < frac:
        resData.append(i)
      else:
        resTest.append(i)
  else:
    # shuffle() works in place and needs a mutable sequence, so materialise
    # the indices as a list (range() is not always one).
    perm = list(range(nPts))
    random.shuffle(perm)
    nTrain = int(nPts * frac)
    resData = perm[:nTrain]
    resTest = perm[nTrain:]

  if not silent:
    # Single-argument print(...) emits the same text whether print is a
    # statement or a function.
    print('Training with %d (of %d) points.' % (len(resData), nPts))
    print('\t%d points are in the hold-out set.' % (len(resTest)))
  return resData, resTest
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
  """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set
       (must lie between 0.0 and 1.0)

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement
       (takes precedence over _legacy_ when both are set)

    **Returns**

      a 2-tuple containing the two sets of indices: the first (training)
      set, followed by the second (hold-out) set.

    **Raises**

      ValueError if _frac_ is smaller than 0.0 or larger than 1.0

    **Notes**

      - the _legacy_ splitting approach uses randomly-generated floats
        and compares them to _frac_.  This is provided for
        backwards-compatibility reasons.

      - the default splitting approach uses a random permutation of
        indices which is split into two parts.

      - selection with replacement can generate duplicates.

    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:
    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5)
    >>> train
    [1, 5, 6, 4, 2]
    >>> test
    [3, 0, 7, 8, 9]

    >>> train,test = SplitIndices(10,.5)
    >>> train
    [5, 2, 9, 8, 7]
    >>> test
    [6, 0, 3, 1, 4]

    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [0, 1, 2, 3, 4, 7, 9]
    >>> test
    [5, 6, 8]
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [4, 5, 7, 8, 9]
    >>> test
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [1, 1, 3, 0, 1]
    >>> test
    [2, 4, 5, 6, 7, 8, 9]
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [9, 5, 4, 8, 0]
    >>> test
    [1, 2, 3, 6, 7]

  """
  if frac < 0. or frac > 1.:
    raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

  if replacement:
    nTrain = int(nPts * frac)
    # Draw nTrain indices independently; min() guards against the draw
    # landing exactly on nPts.
    resData = [min(int(RDRandom.random() * nPts), nPts - 1) for _ in range(nTrain)]
    # Everything that was never drawn goes to the hold-out set.  A set keeps
    # the membership test O(1) instead of rescanning the list for each index.
    picked = set(resData)
    resTest = [i for i in range(nPts) if i not in picked]
  elif legacy:
    resData = []
    resTest = []
    # One random draw per index decides which side it lands on, so the
    # sizes of the two sets vary from call to call.
    for i in range(nPts):
      val = RDRandom.random()
      if val < frac:
        resData.append(i)
      else:
        resTest.append(i)
  else:
    # shuffle() works in place and needs a mutable sequence, so materialise
    # the indices as a list (range() is not always one).
    perm = list(range(nPts))
    random.shuffle(perm)
    nTrain = int(nPts * frac)
    resData = perm[:nTrain]
    resTest = perm[nTrain:]

  if not silent:
    # Single-argument print(...) emits the same text whether print is a
    # statement or a function.
    print('Training with %d (of %d) points.' % (len(resData), nPts))
    print('\t%d points are in the hold-out set.' % (len(resTest)))
  return resData, resTest