def test1HierarchPick(self):
    """Check Ward hierarchical clustering and picking on the test points.

    Reads ``points.csv`` from the SimDivPickers test data directory below
    ``RDConfig.RDBaseDir``.  For every row, columns 1 and 2 are stored as
    the point coordinates and column 3 as an integer label.  The points
    are clustered into 2 clusters; all members of a cluster must share
    one label.  Picking 2 points must return the indices ``(1, 30)``.
    """
    fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap',
                         'test_data', 'points.csv')
    with open(fname) as infil:
        lines = infil.readlines()

    nPts = len(lines)
    self.dataPts = numpy.zeros((nPts, 2), 'd')
    labels = []
    for row, line in enumerate(lines):
        tlst = line.strip().split(',')
        self.dataPts[row, 0] = float(tlst[1])
        self.dataPts[row, 1] = float(tlst[2])
        labels.append(int(tlst[3]))

    self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)
    pkr = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    clusters = pkr.Cluster(self.dMat, nPts, 2)

    # check that each of the clusters has a single label; assertEqual is
    # used instead of a bare assert so the check survives ``python -O``
    # and reports both values on failure
    for cl in clusters:
        clbl = labels[cl[0]]
        for idx in cl:
            self.assertEqual(clbl, labels[idx])

    hierarch = pkr.Pick(self.dMat, nPts, 2)
    self.assertEqual(tuple(hierarch), (1, 30))
def ClusterBits(self, corrMat):
    """Cluster the bits described by ``corrMat`` and store the result.

    The clustering code needs distances, so ``1 / corrMat`` is passed to
    the hierarchical cluster picker (constructed with ``self._type``)
    together with the number of entries in ``self._bidList`` and
    ``self._nClusters``.  The clusters that come back hold indices; they
    are translated to the matching entries of ``self._bidList`` and
    saved in ``self._clusters``.
    """
    # the picker works on distances, not correlations: take 1/val
    distances = 1 / corrMat
    picker = rdsimdiv.HierarchicalClusterPicker(self._type)
    nBits = len(self._bidList)
    indexClusters = picker.Cluster(distances, nBits, self._nClusters)

    # map the cluster member indices to the actual bit ids
    self._clusters = [[self._bidList[idx] for idx in members]
                      for members in indexClusters]
def testNonUniqueCrash(self):
    """Make sure the pickers work when the input contains duplicates.

    Builds random bit vectors and adds each one to the pool twice.  Then:

    * ``MaxMinPicker.LazyPick`` with a Tanimoto-distance callback and
      ``MaxMinPicker.LazyBitVectorPick`` must both return ``N`` picks,
      and the two sets of picks must be identical;
    * ``HierarchicalClusterPicker.Pick`` is run on the pairwise distances
      and only has to complete without raising.

    A picker call that raises is reported through ``self.fail`` together
    with the exception, instead of being hidden behind a bare ``except``.
    """
    from rdkit import DataStructs
    sz = 10
    nbits = 20
    nBitsToSet = int(nbits * .3)
    N = 12
    vs = []
    for i in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for j in range(nBitsToSet):
            val = int(nbits * random.random())
            bv.SetBit(val)
        # every vector goes in twice so the pool has non-unique entries
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
        return d

    picker = rdSimDivPickers.MaxMinPicker()
    try:
        mm1 = picker.LazyPick(taniFunc, len(vs), N)
    except Exception as exc:
        # keep the failure a test failure, but do not lose the cause
        self.fail('LazyPick raised %r' % (exc, ))
    self.assertEqual(len(mm1), N)
    picker = None

    picker = rdSimDivPickers.MaxMinPicker()
    try:
        mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    except Exception as exc:
        self.fail('LazyBitVectorPick raised %r' % (exc, ))
    self.assertEqual(len(mm2), N)
    self.assertEqual(tuple(mm2), tuple(mm1))
    picker = None

    # distances for every pair i < j, in row order
    ds = []
    nvs = len(vs)
    for i in range(nvs):
        for j in range(i + 1, nvs):
            ds.append(taniFunc(i, j))
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    # the result is not inspected: the call just must not crash
    list(picker.Pick(m, nvs, N))
def testIssue208(self):
    """Two picks from the same random distances must give the same result.

    One random value is generated for every pair ``i < j`` of ``sz``
    items.  ``Pick`` is then called twice on the same picker with the
    same input, and the sorted picks of both calls are compared.
    """
    sz = 10
    N = 3
    dists = numpy.array(
        [random.random() for i in range(sz) for j in range(i + 1, sz)])
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    firstPicks = sorted(picker.Pick(dists, sz, N))
    secondPicks = sorted(picker.Pick(dists, sz, N))
    self.assertEqual(firstPicks, secondPicks)
def testInts(self):
    """Make sure integer distances are handled as well.

    One random integer (``int(100 * random.random())``) is generated for
    every pair ``i < j`` of ``sz`` items.  ``Pick`` is then called twice
    on the same picker with the same input, and the sorted picks of both
    calls are compared.
    """
    sz = 10
    N = 3
    dists = numpy.array([
        int(100 * random.random()) for i in range(sz) for j in range(i + 1, sz)
    ])
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    firstPicks = sorted(picker.Pick(dists, sz, N))
    secondPicks = sorted(picker.Pick(dists, sz, N))
    self.assertEqual(firstPicks, secondPicks)
def testNonUniqueCrash(self):
    """Make sure the pickers work when the input contains duplicates.

    Builds random bit vectors and adds each one to the pool twice.  Then:

    * ``MaxMinPicker.LazyPick`` with a Tanimoto-distance callback must
      return ``N`` picks;
    * two separate ``MaxMinPicker.LazyBitVectorPick`` runs must return
      ``N`` picks each, and at least one of them must differ from the
      ``LazyPick`` result;
    * ``HierarchicalClusterPicker.Pick`` is run on the pairwise distances
      and only has to complete without raising.
    """
    from rdkit import DataStructs
    sz = 300
    nbits = 40
    nBitsToSet = int(nbits * .3)
    N = 8
    vs = []
    for i in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for j in range(nBitsToSet):
            val = int(nbits * random.random())
            bv.SetBit(val)
        # every vector goes in twice so the pool has non-unique entries
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
        return d

    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyPick(taniFunc, len(vs), N)
    self.assertEqual(len(mm1), N)
    picker = None

    picker = rdSimDivPickers.MaxMinPicker()
    mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm2), N)

    picker = rdSimDivPickers.MaxMinPicker()
    mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm3), N)

    # we get the occasional dupe randomly,
    # make sure we don't get three dupes in a row.
    # Both comparisons have to sit inside the assertTrue() call; with the
    # ``or`` outside of it only the first comparison was ever checked.
    self.assertTrue(tuple(mm2) != tuple(mm1) or tuple(mm3) != tuple(mm1))
    picker = None

    # distances for every pair i < j, in row order
    ds = []
    nvs = len(vs)
    for i in range(nvs):
        for j in range(i + 1, nvs):
            ds.append(taniFunc(i, j))
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    # the result is not inspected: the call just must not crash
    list(picker.Pick(m, nvs, N))
def test1HierarchPick(self):
    """Check Ward hierarchical clustering and picking on the test points.

    Reads ``test_data/points.csv`` (relative to the current working
    directory).  For every row, columns 1 and 2 are stored as the point
    coordinates and column 3 as an integer label.  The points are
    clustered into 2 clusters; all members of a cluster must share one
    label.  Picking 2 points must return the indices ``(1, 30)``.
    """
    # the context manager closes the file even if reading fails
    with open("test_data/points.csv", 'r') as infil:
        lines = infil.readlines()

    nPts = len(lines)
    self.dataPts = numpy.zeros((nPts, 2), 'd')
    labels = []
    for row, line in enumerate(lines):
        tlst = line.strip().split(',')
        self.dataPts[row, 0] = float(tlst[1])
        self.dataPts[row, 1] = float(tlst[2])
        labels.append(int(tlst[3]))

    self.dMat = rdmmc.GetEuclideanDistMat(self.dataPts)
    pkr = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    clusters = pkr.Cluster(self.dMat, nPts, 2)

    # check that each of the clusters has a single label; the unittest
    # assertions are used instead of bare asserts so the checks survive
    # ``python -O`` and report both values on failure
    for cl in clusters:
        clbl = labels[cl[0]]
        for idx in cl:
            self.assertEqual(clbl, labels[idx])

    hierarch = pkr.Pick(self.dMat, nPts, 2)
    self.assertEqual(tuple(hierarch), (1, 30))