def test0SubstructLibrary(self):
    """Search a library of 100 benzene molecules with every holder combination.

    Each molecule holder is paired with either no fingerprint holder or a
    ``PatternHolder``. When ``SubstructLibraryCanSerialize()`` is true the same
    checks are repeated on a pickle round-trip copy and on a library rebuilt
    from ``Serialize()``.
    """
    for fpholderCls in [None, rdSubstructLibrary.PatternHolder]:
        for holder in [
                rdSubstructLibrary.MolHolder(),
                rdSubstructLibrary.CachedMolHolder(),
                rdSubstructLibrary.CachedSmilesMolHolder(),
        ]:
            fpholder = fpholderCls() if fpholderCls else None
            slib_ = rdSubstructLibrary.SubstructLibrary(holder, fpholder)
            for i in range(100):
                m = Chem.MolFromSmiles("c1ccccc1")
                # AddMol is expected to hand back the index of the new entry.
                self.assertEqual(slib_.AddMol(m), i)

            libs = [slib_]
            if rdSubstructLibrary.SubstructLibraryCanSerialize():
                serialized1 = pickle.loads(pickle.dumps(slib_))
                serialized2 = rdSubstructLibrary.SubstructLibrary(
                    slib_.Serialize())
                libs.append(serialized1)
                libs.append(serialized2)

            for slib in libs:
                res = slib.GetMatches(m)
                self.assertEqual(len(res), 100)

                # The search is issued a second time on the same library.
                res = slib.GetMatches(m)
                self.assertEqual(len(res), 100)
                self.assertEqual(set(res), set(range(100)))

                res = slib.GetMatches(m, maxResults=100)
                self.assertEqual(len(res), 100)

                self.assertEqual(
                    len(slib.GetMatches(m, startIdx=0, endIdx=100)), 100)

                self.assertTrue(slib.HasMatch(m))
                self.assertEqual(slib.CountMatches(m), 100)
def testOptions(self):
    """Compare library match counts with direct per-molecule matching.

    For every molecule holder type (and, when serialization is available,
    for the pickled and ``Serialize()``-rebuilt copies too) the number of hits
    returned by ``GetMatches`` must equal the number of molecules for which
    ``HasSubstructMatch`` succeeds with the corresponding chirality setting.
    """
    mols = makeStereoExamples() * 10

    # (SMARTS, tag query with a property?, GetMatches kwargs,
    #  useChirality value for the direct reference count)
    cases = [
        ("C-1-C-C-O-C(-*)(-*)1", False, {}, True),
        ("C-1-C-C-O-C(-[O])(-[N])1", True, {"useChirality": False}, False),
        ("C-1-C-C-O-[C@@](-[O])(-[N])1", False, {"useChirality": False},
         False),
        ("C-1-C-C-O-[C@@](-[O])(-[N])1", False, {}, True),
    ]

    holder_types = [
        rdSubstructLibrary.MolHolder,
        rdSubstructLibrary.CachedMolHolder,
        rdSubstructLibrary.CachedSmilesMolHolder,
        rdSubstructLibrary.CachedTrustedSmilesMolHolder,
    ]
    for holderCls in holder_types:
        holder = holderCls()
        slib_ = rdSubstructLibrary.SubstructLibrary(holder, None)
        for mol in mols:
            slib_.AddMol(mol)

        libs = [slib_]
        if rdSubstructLibrary.SubstructLibraryCanSerialize():
            from_pickle = pickle.loads(pickle.dumps(slib_))
            from_serialize = rdSubstructLibrary.SubstructLibrary(
                slib_.Serialize())
            libs.append(from_pickle)
            libs.append(from_serialize)

        for slib in libs:
            for smarts, tag_query, search_kwargs, chirality in cases:
                core = Chem.MolFromSmarts(smarts)
                if tag_query:
                    core.SetProp("core", "core")
                res = slib.GetMatches(core, **search_kwargs)
                direct_hits = [
                    x for x in mols
                    if x.HasSubstructMatch(core, useChirality=chirality)
                ]
                self.assertEqual(len(res), len(direct_hits))
def test_PatternHolder(self):
    """Check that an explicit 2048-bit PatternHolder matches the default one.

    Library 1 is filled by hand (SMILES plus fingerprints made by a
    ``PatternHolder(2048)``); library 2 is filled through ``AddMol`` with a
    default ``PatternHolder``. Both must return the same matches for the
    query ``N``.
    """
    fname = os.path.join(os.environ["RDBASE"], "Data", "NCI", "first_5K.smi")
    suppl = Chem.SmilesMolSupplier(fname, delimiter="\t", titleLine=False)

    mols1 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps1 = rdSubstructLibrary.PatternHolder(2048)
    ssslib1 = rdSubstructLibrary.SubstructLibrary(mols1, fps1)

    mols2 = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps2 = rdSubstructLibrary.PatternHolder()
    ssslib2 = rdSubstructLibrary.SubstructLibrary(mols2, fps2)

    RDLogger.DisableLog('rdApp.error')
    try:
        for i in range(0, 1000, 10):
            try:
                mol = suppl[i]
            except Exception:
                # Entries the supplier cannot deliver are skipped.
                continue
            if not mol:
                continue
            mols1.AddSmiles(Chem.MolToSmiles(mol))
            fps1.AddFingerprint(fps1.MakeFingerprint(mol))
            ssslib2.AddMol(mol)
    finally:
        # Always restore error logging, even if filling the libraries fails,
        # so a failure here does not silence logging for later tests.
        RDLogger.EnableLog('rdApp.error')

    query = Chem.MolFromSmarts("N")
    self.assertIsNotNone(query)

    matches1 = sorted(ssslib1.GetMatches(query))
    matches2 = sorted(ssslib2.GetMatches(query))
    # A direct list comparison checks both length and content and reports
    # the differing elements on failure.
    self.assertEqual(matches1, matches2)
def test_init_from_and_to_stream(self):
    """Round-trip a library through ``ToStream`` / ``InitFromStream``.

    The library is written to a temporary file and to an in-memory buffer;
    each copy read back must have the same length as the original. Nothing is
    checked when ``SubstructLibraryCanSerialize()`` is false.
    """
    import os
    from io import BytesIO, StringIO

    mols = makeStereoExamples() * 10
    holder = rdSubstructLibrary.CachedSmilesMolHolder()

    # one day I'll fix this, but we need to write text but read binary
    # grrr.... something about the python_streambuf handler.
    slib = rdSubstructLibrary.SubstructLibrary(holder, None)

    for mol in mols:
        holder.AddSmiles(Chem.MolToSmiles(mol, isomericSmiles=True))

    if rdSubstructLibrary.SubstructLibraryCanSerialize():
        fd, path = tempfile.mkstemp()
        # mkstemp returns an open descriptor; close it right away because the
        # file is reopened by name below.
        os.close(fd)
        try:
            with open(path, 'w') as file:
                slib.ToStream(file)

            with open(path, 'rb') as file:
                slib2 = rdSubstructLibrary.SubstructLibrary()
                slib2.InitFromStream(file)
                self.assertEqual(len(slib), len(slib2))
        finally:
            # Do not leave the temporary file behind.
            os.remove(path)

        s = StringIO()
        slib.ToStream(s)

        sb = BytesIO(s.getvalue().encode("ascii"))
        self.assertTrue(len(sb.getvalue()) > 0)
        slib3 = rdSubstructLibrary.SubstructLibrary()
        slib3.InitFromStream(sb)
        # Check the library read from the in-memory stream (slib3), not the
        # file-based copy that was already verified above.
        self.assertEqual(len(slib), len(slib3))
def test_addpatterns(self):
    """Compare a library built with a pattern holder against ``AddPatterns``.

    For both ``PatternHolder`` and ``TautomerPatternHolder`` a reference
    library is filled through ``AddMol``. A second library sharing the same
    molecule holder gets its fingerprints from ``AddPatterns`` (called with 1,
    2 and 0 threads) and must report the same non-zero match counts.
    """
    pdb_ligands = [
        "CCS(=O)(=O)c1ccc(OC)c(Nc2ncc(-c3cccc(-c4ccccn4)c3)o2)c1",
        "COc1ccc(S(=O)(=O)NCC2CC2)cc1Nc1ncc(-c2cccc(-c3cccnc3)c2)o1",
        "COc1ccc(-c2oc3ncnc(N)c3c2-c2ccc(NC(=O)Nc3cc(C(F)(F)F)ccc3F)cc2)cc1",
        "COC(=O)Nc1nc2ccc(Oc3ccc(NC(=O)Nc4cc(C(F)(F)F)ccc4F)cc3)cc2[nH]1",
        "COc1cc(Nc2ncnc(-c3cccnc3Nc3ccccc3)n2)cc(OC)c1OC",
        "O=C(Nc1ccc(Oc2ccccc2)cc1)c1cccnc1NCc1ccncc1",
        "O=C(Nc1ccc(Oc2ccccc2)cc1)c1cccnc1NCc1ccncc1",
        "CNC(=O)c1cc(Oc2ccc3[nH]c(Nc4ccc(Cl)c(C(F)(F)F)c4)nc3c2)ccn1",
        "CNC(=O)c1cc(Oc2ccc3oc(Nc4ccc(Cl)c(OCC5CCC[NH+]5C)c4)nc3c2)ccn1",
        "CNC(=O)c1cc(Oc2ccc3oc(Nc4ccc(Cl)c(OCC5CCC[NH+]5C)c4)nc3c2)ccn1",
        "COc1cc2nccc(Oc3ccc4c(c3)OCCN4C(=O)Nc3ccc(Cl)cc3)c2cc1OC",
        "CNC(=O)c1c(C)oc2cc(Oc3cc[nH+]c4cc(OCCN5CCOCC5)ccc34)ccc12",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)Nc5ccc(Cl)cc5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)Nc5ccc(Cl)cc5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)NC5CC5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)NC5CC5)cccc4c3)c2cc1OC",
        "Cc1ccc(C(=O)Nc2cc(CCC[NH+](C)C)cc(C(F)(F)F)c2)cc1Nc1ncccc1-c1ccncn1",
        "COc1cc(Nc2nccc(Nc3ccc4c(C)n[nH]c4c3)n2)cc(OC)c1OC",
        "COc1cc(Nc2nccc(N(C)c3ccc4c(C)n[nH]c4c3)n2)cc(OC)c1OC",
        "Cc1ccn(-c2ccc3c(c2)NCC3(C)C)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccn(-c2ccc3c(c2)NCC3(C)C)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccc(C(=O)NCCC2CCCC2)cc1C(=O)Nc1ccc(N)nc1",
        "Cc1ccc(C(=O)NCCC2CCCC2)cc1C(=O)Nc1ccc(N)nc1",
        "Cc1ccn(-c2cccc(C(F)(F)F)c2)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccn(-c2cccc(C(F)(F)F)c2)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "O=C(Nc1cncnc1)c1c(Cl)ccc2c(Nc3cccc(C(F)(F)F)c3)noc12",
        "O=C(Nc1cncnc1)c1c(Cl)ccc2c(Nc3cccc(C(F)(F)F)c3)noc12",
        "CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21",
        "CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21",
    ]

    pattern_holders = [
        rdSubstructLibrary.PatternHolder(),
        rdSubstructLibrary.TautomerPatternHolder(),
    ]
    for patterns in pattern_holders:
        mols = [Chem.MolFromSmiles(smiles) for smiles in pdb_ligands]
        holder = rdSubstructLibrary.CachedMolHolder()
        reference_lib = rdSubstructLibrary.SubstructLibrary(holder, patterns)
        for mol in mols:
            reference_lib.AddMol(mol)

        for nthreads in [1, 2, 0]:
            generated_lib = rdSubstructLibrary.SubstructLibrary(holder, None)
            rdSubstructLibrary.AddPatterns(generated_lib, nthreads)
            # check for seg fault: were the fingerprints really created?
            generated_lib.GetFpHolder().GetFingerprint(0)
            for mol in mols:
                reference_count = reference_lib.CountMatches(mol)
                generated_count = generated_lib.CountMatches(mol)
                self.assertTrue(reference_count)
                self.assertEqual(reference_count, generated_count)
def test0SubstructLibrary(self):
    """Search a library of 100 benzene molecules with every holder combination.

    Each molecule holder is paired with either no fingerprint holder or a
    ``PatternHolder``; all search entry points must report exactly the 100
    stored molecules.
    """
    for fpholderCls in [None, rdSubstructLibrary.PatternHolder]:
        for holder in [
                rdSubstructLibrary.MolHolder(),
                rdSubstructLibrary.CachedMolHolder(),
                rdSubstructLibrary.CachedSmilesMolHolder(),
        ]:
            fpholder = fpholderCls() if fpholderCls else None
            slib = rdSubstructLibrary.SubstructLibrary(holder, fpholder)
            for i in range(100):
                m = Chem.MolFromSmiles("c1ccccc1")
                # AddMol is expected to hand back the index of the new entry.
                self.assertEqual(slib.AddMol(m), i)

            res = slib.GetMatches(m)
            self.assertEqual(len(res), 100)

            # The search is issued a second time on the same library.
            res = slib.GetMatches(m)
            self.assertEqual(len(res), 100)
            self.assertEqual(set(res), set(range(100)))

            res = slib.GetMatches(m, maxResults=100)
            self.assertEqual(len(res), 100)

            self.assertEqual(
                len(slib.GetMatches(m, startIdx=0, endIdx=100)), 100)

            self.assertTrue(slib.HasMatch(m))
            self.assertEqual(slib.CountMatches(m), 100)
def testSearchOrder(self):
    """Check that a custom search order is stored and drives result order."""
    library = rdSubstructLibrary.SubstructLibrary()
    ethers = ("CCCOC", "CCCCOCC", "CCOC", "COC", "CCCCCOC")
    for smiles in ethers:
        library.AddMol(Chem.MolFromSmiles(smiles))

    order = (3, 2, 0, 1, 4)
    library.SetSearchOrder(order)
    self.assertEqual(library.GetSearchOrder(), order)

    # With maxResults=2 only the first two entries of the order are returned.
    query = Chem.MolFromSmiles('COC')
    first_hits = list(library.GetMatches(query, maxResults=2))
    self.assertEqual(first_hits, [3, 2])
def testMolBundles(self):
    """Search a library with a ``MolBundle`` query before and after growing it."""
    library = rdSubstructLibrary.SubstructLibrary()
    for smiles in ('CCOC', 'CCNC', 'COOCOO', 'CCNC', 'CCCC'):
        library.AddMol(Chem.MolFromSmiles(smiles))

    bundle = Chem.MolBundle()
    for smiles in ('COC', 'CCC'):
        bundle.AddMol(Chem.MolFromSmiles(smiles))
    hits = list(library.GetMatches(bundle))
    self.assertEqual(hits, [0, 4])

    # Adding another query to the bundle widens the set of hits.
    bundle.AddMol(Chem.MolFromSmiles('CN'))
    hits = sorted(library.GetMatches(bundle))
    self.assertEqual(hits, [0, 1, 3, 4])
def testSubstructParameters(self):
    """Check that ``SubstructMatchParameters.useChirality`` filters the hits."""
    library = rdSubstructLibrary.SubstructLibrary()
    for smiles in ('C[C@H](F)Cl', 'C[C@@H](F)Cl', 'CC(F)Cl'):
        library.AddMol(Chem.MolFromSmiles(smiles))

    bundle = Chem.MolBundle()
    for smiles in ('C[C@H](F)Cl', ):
        bundle.AddMol(Chem.MolFromSmiles(smiles))

    # Default parameters: all three stored molecules match.
    params = Chem.SubstructMatchParameters()
    hits = sorted(library.GetMatches(bundle, params))
    self.assertEqual(hits, [0, 1, 2])

    # With chirality enabled only the first entry matches.
    params.useChirality = True
    hits = sorted(library.GetMatches(bundle, params))
    self.assertEqual(hits, [0])
def test_basic_addpatterns(self):
    """Check ``AddPatterns`` on SMILES-backed holders for each pattern holder.

    For both SMILES molecule holders, a library is built and fingerprints are
    generated with ``AddPatterns`` using all cores (``numThreads=-1``): once
    with the default pattern holder, once with an explicit ``PatternHolder``
    and once with a ``TautomerPatternHolder``. The fingerprint holder must end
    up the same size as the molecule holder, and every ligand must match
    itself.
    """
    # add mols
    pdb_ligands = [
        "CCS(=O)(=O)c1ccc(OC)c(Nc2ncc(-c3cccc(-c4ccccn4)c3)o2)c1",
        "COc1ccc(S(=O)(=O)NCC2CC2)cc1Nc1ncc(-c2cccc(-c3cccnc3)c2)o1",
        "COc1ccc(-c2oc3ncnc(N)c3c2-c2ccc(NC(=O)Nc3cc(C(F)(F)F)ccc3F)cc2)cc1",
        "COC(=O)Nc1nc2ccc(Oc3ccc(NC(=O)Nc4cc(C(F)(F)F)ccc4F)cc3)cc2[nH]1",
        "COc1cc(Nc2ncnc(-c3cccnc3Nc3ccccc3)n2)cc(OC)c1OC",
        "O=C(Nc1ccc(Oc2ccccc2)cc1)c1cccnc1NCc1ccncc1",
        "O=C(Nc1ccc(Oc2ccccc2)cc1)c1cccnc1NCc1ccncc1",
        "CNC(=O)c1cc(Oc2ccc3[nH]c(Nc4ccc(Cl)c(C(F)(F)F)c4)nc3c2)ccn1",
        "CNC(=O)c1cc(Oc2ccc3oc(Nc4ccc(Cl)c(OCC5CCC[NH+]5C)c4)nc3c2)ccn1",
        "CNC(=O)c1cc(Oc2ccc3oc(Nc4ccc(Cl)c(OCC5CCC[NH+]5C)c4)nc3c2)ccn1",
        "COc1cc2nccc(Oc3ccc4c(c3)OCCN4C(=O)Nc3ccc(Cl)cc3)c2cc1OC",
        "CNC(=O)c1c(C)oc2cc(Oc3cc[nH+]c4cc(OCCN5CCOCC5)ccc34)ccc12",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)Nc5ccc(Cl)cc5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)Nc5ccc(Cl)cc5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)NC5CC5)cccc4c3)c2cc1OC",
        "COc1cc2[nH+]ccc(Oc3ccc4c(C(=O)NC5CC5)cccc4c3)c2cc1OC",
        "Cc1ccc(C(=O)Nc2cc(CCC[NH+](C)C)cc(C(F)(F)F)c2)cc1Nc1ncccc1-c1ccncn1",
        "COc1cc(Nc2nccc(Nc3ccc4c(C)n[nH]c4c3)n2)cc(OC)c1OC",
        "COc1cc(Nc2nccc(N(C)c3ccc4c(C)n[nH]c4c3)n2)cc(OC)c1OC",
        "Cc1ccn(-c2ccc3c(c2)NCC3(C)C)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccn(-c2ccc3c(c2)NCC3(C)C)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccc(C(=O)NCCC2CCCC2)cc1C(=O)Nc1ccc(N)nc1",
        "Cc1ccc(C(=O)NCCC2CCCC2)cc1C(=O)Nc1ccc(N)nc1",
        "Cc1ccn(-c2cccc(C(F)(F)F)c2)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "Cc1ccn(-c2cccc(C(F)(F)F)c2)c(=O)c1-c1ccc2nc(N)ncc2c1",
        "O=C(Nc1cncnc1)c1c(Cl)ccc2c(Nc3cccc(C(F)(F)F)c3)noc12",
        "O=C(Nc1cncnc1)c1c(Cl)ccc2c(Nc3cccc(C(F)(F)F)c3)noc12",
        "CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21",
        "CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21",
    ]

    for holder in [
            rdSubstructLibrary.CachedSmilesMolHolder(),
            rdSubstructLibrary.CachedTrustedSmilesMolHolder(),
    ]:
        for smi in pdb_ligands:
            holder.AddSmiles(smi)

        for pattern in [
                None,
                rdSubstructLibrary.PatternHolder(),
                rdSubstructLibrary.TautomerPatternHolder(),
        ]:
            lib = rdSubstructLibrary.SubstructLibrary(holder)
            # The pattern holder used to be ignored, so the same default
            # configuration was tested three times. Hand it to AddPatterns so
            # each fingerprint type is actually exercised.
            if pattern is None:
                rdSubstructLibrary.AddPatterns(lib, numThreads=-1)
            else:
                rdSubstructLibrary.AddPatterns(lib, pattern, numThreads=-1)
            self.assertEqual(len(lib.GetMolHolder()), len(lib.GetFpHolder()))
            for smi in pdb_ligands:
                self.assertTrue(lib.CountMatches(Chem.MolFromSmiles(smi)))
def testOptions(self):
    """Compare library match counts with direct per-molecule matching.

    For every molecule holder type the number of hits returned by
    ``GetMatches`` must equal the number of molecules for which
    ``HasSubstructMatch`` succeeds with the corresponding chirality setting.
    """
    mols = makeStereoExamples() * 10

    def direct_count(query, chirality):
        # Reference value: match every molecule individually.
        return len([
            x for x in mols
            if x.HasSubstructMatch(query, useChirality=chirality)
        ])

    for holderCls in [
            rdSubstructLibrary.MolHolder,
            rdSubstructLibrary.CachedMolHolder,
            rdSubstructLibrary.CachedSmilesMolHolder,
            rdSubstructLibrary.CachedTrustedSmilesMolHolder,
    ]:
        holder = holderCls()
        slib = rdSubstructLibrary.SubstructLibrary(holder, None)
        for mol in mols:
            slib.AddMol(mol)

        # Wildcard substituents, default search options.
        core = Chem.MolFromSmarts("C-1-C-C-O-C(-*)(-*)1")
        hits = slib.GetMatches(core)
        self.assertEqual(len(hits), direct_count(core, True))

        # Achiral query carrying a property, chirality switched off.
        core = Chem.MolFromSmarts("C-1-C-C-O-C(-[O])(-[N])1")
        core.SetProp("core", "core")
        hits = slib.GetMatches(core, useChirality=False)
        self.assertEqual(len(hits), direct_count(core, False))

        # Chiral query, chirality switched off.
        core = Chem.MolFromSmarts("C-1-C-C-O-[C@@](-[O])(-[N])1")
        hits = slib.GetMatches(core, useChirality=False)
        self.assertEqual(len(hits), direct_count(core, False))

        # Chiral query, default search options.
        core = Chem.MolFromSmarts("C-1-C-C-O-[C@@](-[O])(-[N])1")
        hits = slib.GetMatches(core)
        self.assertEqual(len(hits), direct_count(core, True))
def testSearchOrder2(self):
    """Order the search by atom count and check that hits follow that order."""
    ssl = rdSubstructLibrary.SubstructLibrary()
    for smiles in ("CCCOC", "CCCCOCC", "CCOC", "COC", "CCCCCOC"):
        ssl.AddMol(Chem.MolFromSmiles(smiles))

    def setSearchSmallestFirst(sslib):
        # Sort the library indices by the atom count of the stored molecule.
        indices = list(range(len(sslib)))
        mol_holder = sslib.GetMolHolder()

        def atom_count(index, mol_holder=mol_holder):
            return mol_holder.GetMol(index).GetNumAtoms()

        indices.sort(key=atom_count)
        sslib.SetSearchOrder(indices)

    setSearchSmallestFirst(ssl)
    query = Chem.MolFromSmiles('COC')
    hits = list(ssl.GetMatches(query))
    self.assertEqual(hits, [3, 2, 0, 1, 4])
def main(directory: str, chebml_version: str):
    """Download the ChEMBL data and build a pickled substructure library.

    Files that already exist in *directory* are not downloaded or rebuilt.

    :param directory: Output directory; created if missing.
    :param chebml_version: ChEMBL release identifier used in the download URL
        and in the output file names.
    """
    os.makedirs(directory, exist_ok=True)

    bradley_path = os.path.join(directory, 'jm020472j_s2.xls')
    if not os.path.exists(bradley_path):
        try:
            wget.download(bradley_url, out=directory)
        except Exception:
            # Best-effort download: report and carry on. A bare ``except``
            # would also swallow KeyboardInterrupt / SystemExit.
            click.echo('There goes ACS stopping science')

    chembl_url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{chebml_version}/chembl_{chebml_version}.sdf.gz')
    sdf_path = os.path.join(directory, f'chembl_{chebml_version}.sdf.gz')
    if not os.path.exists(sdf_path):
        wget.download(chembl_url, out=directory)

    sss_path = os.path.join(directory, f'chembl{chebml_version}_sssdata.pkl')
    if not os.path.exists(sss_path):
        click.echo(f'RDKit Version: {rdBase.rdkitVersion}')
        # Fill the holders while reading instead of collecting an intermediate
        # list of (smiles, fingerprint) pairs first.
        mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
        fps = rdSubstructLibrary.PatternHolder()
        with gzip.GzipFile(sdf_path) as gz:
            suppl = Chem.ForwardSDMolSupplier(gz)
            for mol in tqdm(suppl,
                            desc=f'Processing ChEBML {chebml_version}',
                            unit_scale=True):
                # Skip unreadable records and molecules above 50 atoms.
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol)
                smi = Chem.MolToSmiles(mol)
                mols.AddSmiles(smi)
                fps.AddFingerprint(fp)

        click.echo(f'Outputting to {sss_path}')
        library = rdSubstructLibrary.SubstructLibrary(mols, fps)
        with open(sss_path, 'wb') as file:
            pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)

    click.echo('Done ;)')
def read_in_lib(input_smi):
    """Build a SubstructLibrary from a SMILES file and its fingerprint pickle.

    The first row of *input_smi* is discarded; the second whitespace-separated
    field of every remaining row is used as the SMILES. One fingerprint per
    row is unpickled, in order, from the file whose name is *input_smi* with
    ``.smi`` replaced by ``.pkl``.

    Prints the elapsed time and library size, and returns the library.
    """
    started = time.time()
    mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    fps = rdSubstructLibrary.PatternHolder()

    with open(input_smi, 'r') as smiles_file:
        rows = [line.split() for line in smiles_file]
    del rows[0]  # discard the first row

    pickle_path = input_smi.replace(".smi", ".pkl")
    with open(pickle_path, 'rb') as pklf:
        for fields in tqdm.tqdm(rows):
            mols.AddSmiles(fields[1])
            fps.AddFingerprint(pickle.load(pklf))

    library = rdSubstructLibrary.SubstructLibrary(mols, fps)
    finished = time.time()
    print("That took %.2f seconds. The library has %d molecules." %
          (finished - started, len(library)))
    return library
def testBinaryCache(self):
    """Check searches on a ``CachedMolHolder`` filled through ``AddBinary``.

    Library match counts must equal the number of molecules for which
    ``HasSubstructMatch`` succeeds with the corresponding chirality setting.
    """
    mols = makeStereoExamples() * 10
    holder = rdSubstructLibrary.CachedMolHolder()
    slib = rdSubstructLibrary.SubstructLibrary(holder, None)
    for mol in mols:
        holder.AddBinary(mol.ToBinary())

    # (SMARTS, tag query with a property?, GetMatches kwargs,
    #  useChirality value for the direct reference count)
    cases = (
        ("C-1-C-C-O-C(-*)(-*)1", False, {}, True),
        ("C-1-C-C-O-C(-[O])(-[N])1", True, {"useChirality": False}, False),
        ("C-1-C-C-O-[C@@](-[O])(-[N])1", False, {"useChirality": False},
         False),
        ("C-1-C-C-O-[C@@](-[O])(-[N])1", False, {}, True),
    )
    for smarts, tag_query, search_kwargs, chirality in cases:
        core = Chem.MolFromSmarts(smarts)
        if tag_query:
            core.SetProp("core", "core")
        res = slib.GetMatches(core, **search_kwargs)
        reference = [
            x for x in mols
            if x.HasSubstructMatch(core, useChirality=chirality)
        ]
        self.assertEqual(len(res), len(reference))
def test1SubstructLibrary(self):
    """Search a library that alternates benzene and butane entries.

    Benzene is stored at the even indices and butane at the odd ones (200
    molecules in total); each query must find only its own 100 entries, for
    every holder / fingerprint-holder combination.
    """
    even_indices = set(range(0, 200, 2))
    odd_indices = set(range(1, 200, 2))

    for fpholderCls in [None, rdSubstructLibrary.PatternHolder]:
        holders = [
            rdSubstructLibrary.MolHolder(),
            rdSubstructLibrary.CachedMolHolder(),
            rdSubstructLibrary.CachedSmilesMolHolder(),
        ]
        for holder in holders:
            fpholder = fpholderCls() if fpholderCls else None
            slib = rdSubstructLibrary.SubstructLibrary(holder, fpholder)

            mols = []
            for i in range(100):
                m = Chem.MolFromSmiles("c1ccccc1")
                self.assertEqual(slib.AddMol(m), i * 2)
                mols.append(m)
                m2 = Chem.MolFromSmiles("CCCC")
                self.assertEqual(slib.AddMol(m2), i * 2 + 1)
                mols.append(m2)

            hits = slib.GetMatches(m)
            self.assertEqual(len(hits), 100)
            self.assertEqual(set(hits), even_indices)

            hits = slib.GetMatches(m2)
            self.assertEqual(len(hits), 100)
            self.assertTrue(set(hits) == odd_indices)

            hits = slib.GetMatches(m)
            self.assertEqual(len(hits), 100)

            hits = slib.GetMatches(m, maxResults=100)
            self.assertEqual(len(hits), 100)

            # Restricting the index window to 100 entries halves the hits.
            windowed = slib.GetMatches(m, startIdx=0, endIdx=50 * 2)
            self.assertEqual(len(windowed), 50)
            windowed = slib.GetMatches(m2, startIdx=1, endIdx=50 * 2 + 1)
            self.assertEqual(len(windowed), 50)

            self.assertTrue(slib.HasMatch(m))
            self.assertTrue(slib.HasMatch(m2))
            self.assertEqual(slib.CountMatches(m), 100)
            self.assertEqual(slib.CountMatches(m2), 100)
def testSearchOrder(self):
    """Check custom search order with and without a key holder.

    When a ``KeyFromPropHolder`` is attached, the keys looked up for the
    matches must be the ``_Name`` values of the matched molecules.
    """
    smiles_list = ("CCCOC", "CCCCOCC", "CCOC", "COC", "CCCCCOC")
    order = (3, 2, 0, 1, 4)

    for keyholder in [None, rdSubstructLibrary.KeyFromPropHolder()]:
        library = rdSubstructLibrary.SubstructLibrary(
            rdSubstructLibrary.MolHolder(), keyholder)
        for position, smiles in enumerate(smiles_list):
            mol = Chem.MolFromSmiles(smiles)
            mol.SetProp("_Name", str(position))
            library.AddMol(mol)

        library.SetSearchOrder(order)
        self.assertEqual(library.GetSearchOrder(), order)

        query = Chem.MolFromSmiles('COC')
        # The same limited search is run twice and must agree both times.
        for _ in range(2):
            hits = list(library.GetMatches(query, maxResults=2))
            self.assertEqual(hits, [3, 2])

        if keyholder:
            self.assertEqual(keyholder.GetPropName(), "_Name")
            matched = library.GetMatches(query, maxResults=2)
            keys = list(library.GetKeyHolder().GetKeys(matched))
            self.assertEqual(keys, ['3', '2'])
def __init__(self, proj, datapath, dbpath, chembldb, flimit=1e-3, MinClusterSize=20, clustering='UPGMA', calcDists=True, calcScores=False):
    """Store the settings, load the project data and set up the search database.

    If the module-level ``arthor`` is not ``None`` an Arthor database is
    built under *dbpath* with the external ``smi2atdb`` / ``atdb2fp`` tools.
    Otherwise *dbpath* may be an existing ``SubstructLibrary`` (used as is) or
    the path of a pickled library, which is created from the project
    structures when the file does not exist yet.

    NOTE: a pickle at *dbpath* is loaded with ``pickle.load``; only point this
    at files from a trusted source.
    """
    self.proj = proj
    self.datapath = datapath
    self.dbpath = dbpath
    self.chembldb = chembldb
    self.flimit = flimit
    self.MinClusterSize = MinClusterSize
    self.clustering = clustering
    self.calcScores = calcScores
    self.calcDists = calcDists

    # load data
    self.moldata_proj, self.distdata_proj = utilsDataPrep.PrepareData(
        self.proj, self.datapath, distMeasure='Tanimoto', FP='Morgan2',
        calcDists=self.calcDists)

    if arthor is not None:
        if not os.path.isdir(dbpath):
            os.mkdir(dbpath)
        # set up project database for arthor substructure matching
        df = self.moldata_proj[['Structure', 'ID']]
        # NOTE(review): the SMILES file is written under './arthor/' while the
        # commands below read '{dbpath}{proj}.smi' - confirm that dbpath is
        # expected to be './arthor/'.
        df.to_csv('./arthor/{0}.smi'.format(self.proj), header=None, index=None, sep=' ')
        # NOTE(review): proj/dbpath are interpolated into a shell command;
        # they must not come from untrusted input.
        os.system('smi2atdb -j 0 -t {0}{1}.smi {0}{1}.atdb'.format(self.dbpath, self.proj))
        os.system('atdb2fp -j 0 {0}{1}.atdb'.format(self.dbpath, self.proj))
        self.proj_db = arthor.SubDb('{0}{1}.atdb'.format(self.dbpath, self.proj))
    elif isinstance(dbpath, rdSubstructLibrary.SubstructLibrary):
        # A ready-made library was passed in place of a path.
        self.proj_db = dbpath
        self.db_size = len(self.proj_db)
    elif not os.path.exists(dbpath):
        print("creating database")
        mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
        fps = rdSubstructLibrary.PatternHolder()
        for smi in self.moldata_proj['Structure']:
            m = Chem.MolFromSmiles(smi)
            mols.AddSmiles(Chem.MolToSmiles(m))
            fps.AddFingerprint(Chem.PatternFingerprint(m))
        self.proj_db = rdSubstructLibrary.SubstructLibrary(mols, fps)
        self.db_size = len(mols)
        # Use a context manager so the file is flushed and closed promptly.
        with open(dbpath, 'wb+') as db_file:
            pickle.dump(self.proj_db, db_file)
    else:
        with open(dbpath, 'rb') as db_file:
            self.proj_db = pickle.load(db_file)
        self.db_size = len(self.proj_db)
def testRingSmartsWithTrustedSmiles(self):
    """Check ring queries against a ``CachedTrustedSmilesMolHolder`` library.

    Matching directly on the molecule returned by the holder is expected to
    raise ``RuntimeError``; the same queries run through the library must
    succeed.
    """
    atom_ring_query = Chem.MolFromSmarts("[C&R1]")
    bond_ring_query = Chem.MolFromSmarts("C@C")  # ring bond

    holder = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
    lib = rdSubstructLibrary.SubstructLibrary(holder)
    lib.AddMol(Chem.MolFromSmiles("C1CC1"))

    # make sure we can get an unsanitized molecule that fails (no ring info)
    print("Testing atom rings")
    with self.assertRaises(RuntimeError):
        holder.GetMol(0).HasSubstructMatch(atom_ring_query)
    print("testing bond rings")
    with self.assertRaises(RuntimeError):
        holder.GetMol(0).HasSubstructMatch(bond_ring_query)

    # shouldn't throw
    print("searching atom rings")
    atom_hits = lib.GetMatches(atom_ring_query)
    self.assertEqual(len(atom_hits), 1)
    self.assertEqual(lib.CountMatches(atom_ring_query), 1)

    print("searching bond rings")
    bond_hits = lib.GetMatches(bond_ring_query)
    self.assertEqual(len(bond_hits), 1)
    self.assertEqual(lib.CountMatches(bond_ring_query), 1)
    print("done")
def testPropHolder(self):
    """Check ``KeyFromPropHolder`` with its default and a custom property name.

    The holder must report ``_Name`` when constructed without arguments and
    the supplied name otherwise, and the key returned for a match must be the
    value stored under that property.
    """
    for propname in [None, 'foo']:
        if propname is None:
            keyholder = rdSubstructLibrary.KeyFromPropHolder()
            expected_name = "_Name"
        else:
            keyholder = rdSubstructLibrary.KeyFromPropHolder(propname)
            expected_name = propname

        library = rdSubstructLibrary.SubstructLibrary(
            rdSubstructLibrary.MolHolder(), keyholder)
        mol = Chem.MolFromSmiles('CCC')
        self.assertEqual(keyholder.GetPropName(), expected_name)

        # Store the key under whichever property the holder reads.
        mol.SetProp(expected_name, 'Z11234')
        library.AddMol(mol)

        indices = library.GetMatches(mol)
        keys = list(library.GetKeyHolder().GetKeys(indices))
        self.assertEqual(['Z11234'], keys)
def test1SubstructLibrary(self):
    """Search an alternating benzene/butane library for all holder combinations.

    Every combination of key holder, fingerprint holder and molecule holder
    is tested. Benzene is stored at the even indices and butane at the odd
    ones; each molecule's ``_Name`` is its index. When serialization is
    available the checks are repeated on the pickled and
    ``Serialize()``-rebuilt copies.
    """
    even_indices = list(range(0, 200, 2))
    odd_indices = list(range(1, 200, 2))

    for keyholderCls in [None, rdSubstructLibrary.KeyFromPropHolder]:
        for fpholderCls in [None, rdSubstructLibrary.PatternHolder]:
            holders = [
                rdSubstructLibrary.MolHolder(),
                rdSubstructLibrary.CachedMolHolder(),
                rdSubstructLibrary.CachedSmilesMolHolder(),
            ]
            for holder in holders:
                fpholder = fpholderCls() if fpholderCls else None

                keyholder = None
                if keyholderCls:
                    keyholder = keyholderCls()
                    self.assertEqual(keyholder.GetPropName(), "_Name")

                slib_ = rdSubstructLibrary.SubstructLibrary(
                    holder, fpholder, keyholder)

                mols = []
                for i in range(100):
                    m = Chem.MolFromSmiles("c1ccccc1")
                    m.SetProp("_Name", str(i * 2))
                    self.assertEqual(slib_.AddMol(m), i * 2)
                    mols.append(m)
                    m2 = Chem.MolFromSmiles("CCCC")
                    m2.SetProp("_Name", str(i * 2 + 1))
                    self.assertEqual(slib_.AddMol(m2), i * 2 + 1)
                    mols.append(m2)

                libs = [slib_]
                if rdSubstructLibrary.SubstructLibraryCanSerialize():
                    from_pickle = pickle.loads(pickle.dumps(slib_))
                    from_serialize = rdSubstructLibrary.SubstructLibrary(
                        slib_.Serialize())
                    libs.append(from_pickle)
                    libs.append(from_serialize)

                for slib in libs:
                    hits = slib.GetMatches(m)
                    self.assertEqual(len(hits), 100)
                    self.assertEqual(set(hits), set(even_indices))
                    if keyholderCls:
                        # NOTE(review): this compares stringified match
                        # indices; the key holder itself is not consulted.
                        self.assertEqual([str(idx) for idx in hits],
                                         [str(idx) for idx in even_indices])

                    hits = slib.GetMatches(m2)
                    self.assertEqual(len(hits), 100)
                    self.assertTrue(set(hits) == set(odd_indices))
                    if keyholderCls:
                        self.assertEqual([str(idx) for idx in hits],
                                         [str(idx) for idx in odd_indices])

                    hits = slib.GetMatches(m)
                    self.assertEqual(len(hits), 100)

                    hits = slib.GetMatches(m, maxResults=100)
                    self.assertEqual(len(hits), 100)

                    # A 100-entry index window contains 50 of each molecule.
                    windowed = slib.GetMatches(m, startIdx=0, endIdx=50 * 2)
                    self.assertEqual(len(windowed), 50)
                    windowed = slib.GetMatches(
                        m2, startIdx=1, endIdx=50 * 2 + 1)
                    self.assertEqual(len(windowed), 50)

                    self.assertTrue(slib.HasMatch(m))
                    self.assertTrue(slib.HasMatch(m2))
                    self.assertEqual(slib.CountMatches(m), 100)
                    self.assertEqual(slib.CountMatches(m2), 100)