def testAppend(self):
    """Build a store from one SMILES file, append a second file, check both rows.

    The store is first created from ``one_smiles`` and row 0 is verified.
    ``two_smiles`` is then appended through ``append_store`` and the test
    checks that the store holds two rows that share the same InChIKey and
    the same descriptor values.
    """
    inchikey = "UHOVQNZJYSORNB-UHFFFAOYSA-N"
    expected_row = (True, 78.046950192, 0.0, 1.0, 0.0, 1.0)
    try:
        first_smi = tempfile.mktemp() + ".smi"
        second_smi = tempfile.mktemp() + "-2.smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(first_smi, 'w') as handle:
            handle.write(one_smiles)

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=first_smi,
            hasHeader=False,
            batchsize=1,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=True)
        make_store.make_store(opts)

        # Initial store: a single row, reachable by name and by InChIKey.
        with contextlib.closing(DescriptaStore(store_dir)) as store:
            self.assertEqual(store.lookupName("0"), 0)
            self.assertEqual(store.lookupInchiKey(inchikey), [0])
            self.assertEqual(store.descriptors().get(0), expected_row)

        # now append some junk
        with open(second_smi, 'w') as handle:
            handle.write(two_smiles)
        opts.smilesfile = second_smi
        append_store.append_store(opts)

        # After the append both rows must resolve to the same key and values.
        with contextlib.closing(DescriptaStore(store_dir)) as store:
            self.assertEqual(len(store), 2)
            for row in (0, 1):
                self.assertEqual(store.lookupName(str(row)), row)
                self.assertEqual(store.lookupInchiKey(inchikey), [0, 1])
                self.assertEqual(store.descriptors().get(row), expected_row)
    finally:
        # Remove whatever temporary artefacts were actually created.
        if os.path.exists(first_smi):
            os.unlink(first_smi)
        if os.path.exists(second_smi):
            os.unlink(second_smi)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
def testCanonicalSmiles2(self):
    """Store the "Canonicalize" descriptor and check the first column of rows 0-9.

    The first element of each of the ten stored rows is collected; once
    sorted, the values must equal 8..17.
    """
    try:
        smiles_path = tempfile.mktemp() + ".smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(smiles_path, 'w') as handle:
            handle.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=smiles_path,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="Canonicalize",
            index_inchikey=False)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(store_dir)) as store:
            # Row order is not asserted, only the sorted set of values.
            counts = sorted(
                store.descriptors().get(row)[0] for row in range(10))
            self.assertEqual(counts, list(range(8, 18)))
    finally:
        if os.path.exists(smiles_path):
            os.unlink(smiles_path)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
def testChiralMorgans(self):
    """Store "MorganChiral3Counts" descriptors and compare rows 0-9 to the fixture.

    A store is built from ``many_smiles`` and each of the first ten stored
    rows must equal the matching entry of ``expected_chiral_data``.
    """
    try:
        fname = tempfile.mktemp() + ".smi"
        storefname = tempfile.mktemp() + ".store"
        with open(fname, 'w') as f:
            f.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=storefname,
            smilesfile=fname,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="MorganChiral3Counts",
            index_inchikey=True)
        make_store.make_store(opts)

        # The generator object is not needed below; the lookup is kept only
        # as a check that the descriptor is present in the registry.
        self.assertIsNotNone(
            DescriptorGenerator.REGISTRY["MorganChiral3Counts".lower()])

        with contextlib.closing(DescriptaStore(storefname)) as store:
            for i in range(10):
                r = store.descriptors().get(i)
                self.assertEqual(r, expected_chiral_data[i])
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
        if os.path.exists(storefname):
            shutil.rmtree(storefname)
def testNonesWithCalcFlags(self):
    """Check that "NANDescriptorsWithCalcFlags" stores a falsy first column.

    A one-molecule store is built and the first element of stored row 0
    must evaluate as false.
    """
    try:
        smiles_path = tempfile.mktemp() + ".smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(smiles_path, 'w') as handle:
            handle.write(one_smiles)

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=smiles_path,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="NANDescriptorsWithCalcFlags",
            index_inchikey=True)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(store_dir)) as store:
            first_row = store.descriptors().get(0)
            self.assertFalse(first_row[0])
    finally:
        if os.path.exists(smiles_path):
            os.unlink(smiles_path)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
def testRDKitFPBits(self):
    """Store "RDKitFPBits" descriptors and compare rows 0-3 to the fixture.

    The input file is generated from ``testSmiles``: one line per item,
    written as "<value> <key>". Each of the first four stored rows must
    equal the matching entry of ``expected_RDKFP_data``.
    """
    try:
        smiles_path = tempfile.mktemp() + ".smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(smiles_path, 'w') as handle:
            lines = ['{0} {1}'.format(value, key)
                     for key, value in testSmiles.items()]
            handle.write("\n".join(lines))

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=smiles_path,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKitFPBits",
            index_inchikey=False)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(store_dir)) as store:
            for row in range(4):
                stored = store.descriptors().get(row)
                self.assertEqual(stored, expected_RDKFP_data[row])
    finally:
        if os.path.exists(smiles_path):
            os.unlink(smiles_path)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
def testNormalized(self):
    """Store "RDKit2DNormalized" descriptors and compare rows 0-9 to the fixture.

    A store is built from ``many_smiles`` and each of the first ten stored
    rows is checked against ``expected`` via the ``compare_results`` helper.
    """
    try:
        fname = tempfile.mktemp() + ".smi"
        storefname = tempfile.mktemp() + ".store"
        with open(fname, 'w') as f:
            f.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=storefname,
            smilesfile=fname,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DNormalized",
            index_inchikey=True)
        make_store.make_store(opts)

        # The generator object is not needed below; the lookup is kept only
        # as a check that the descriptor is present in the registry.
        self.assertIsNotNone(
            DescriptorGenerator.REGISTRY["RDKit2DNormalized".lower()])

        with contextlib.closing(DescriptaStore(storefname)) as store:
            for i in range(10):
                r = store.descriptors().get(i)
                compare_results(self, r, expected[i])
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
        if os.path.exists(storefname):
            shutil.rmtree(storefname)
def testManyNoInchi(self):
    """Build a store without an InChIKey index and verify names, rows and SMILES.

    Checks, in order: name lookup for rows 0-9, the stored descriptor rows
    0-10, and that the molecule index returns the original SMILES and name
    for rows 0-9.
    """
    # Value expected at position 1 of stored rows 0..9.
    col1_values = (
        78.046950192, 92.062600256, 106.07825032, 120.093900384,
        134.109550448, 148.125200512, 162.140850576, 176.15650064,
        190.172150704, 204.187800768,
    )
    try:
        smiles_path = tempfile.mktemp() + ".smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(smiles_path, 'w') as handle:
            handle.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=smiles_path,
            hasHeader=False,
            batchsize=1,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=False)
        make_store.make_store(opts)

        input_lines = many_smiles.split("\n")

        with contextlib.closing(DescriptaStore(store_dir)) as store:
            for row in range(10):
                self.assertEqual(store.lookupName(str(row)), row)

            for row, value in enumerate(col1_values):
                self.assertEqual(store.descriptors().get(row),
                                 (True, value, 0.0, 1.0, 0.0, 1.0))
            # Row 10 is expected to come back as an all-zero, flag-False row.
            self.assertEqual(store.descriptors().get(10),
                             (False, 0.0, 0.0, 0.0, 0.0, 0.0))

            for row in range(10):
                # The molecule itself is not inspected; the call is kept so
                # that a failure to build it still surfaces here.
                store.molIndex().getRDMol(row)
                smiles, name = store.molIndex().get(row)
                self.assertEqual(name, str(row))
                self.assertEqual(smiles, input_lines[row].split()[0])
    finally:
        if os.path.exists(smiles_path):
            os.unlink(smiles_path)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
def testColCache(self):
    """Verify the column cache: files are written, data matches, corruption is detected.

    Steps:
      1. Read every column through ``store.db.getColByIdx`` before caching.
      2. Call ``store.db.cacheColumns()`` and check that one file per column
         exists in ``store.db.colCacheDir``.
      3. Re-read every column and compare with the uncached data.
      4. Swap the cache files of columns 0 and 1 and expect reading column 0
         to raise ``struct.error``.
    """
    try:
        fname = tempfile.mktemp() + ".smi"
        storefname = tempfile.mktemp() + ".store"
        with open(fname, 'w') as f:
            f.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=storefname,
            smilesfile=fname,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=True)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(storefname)) as store:
            # get normal data
            cols = [list(store.db.getColByIdx(idx))
                    for idx, _ in enumerate(store.db.colnames)]

            # cache the columns
            store.db.cacheColumns()

            # make sure the datafiles are written
            for idx, _ in enumerate(store.db.colnames):
                fn = os.path.join(store.db.colCacheDir, str(idx))
                self.assertTrue(os.path.exists(fn), fn)

            for idx, _ in enumerate(store.db.colnames):
                col = list(store.db.getColByIdx(idx))
                self.assertEqual(col, cols[idx])

            # swap the data files of columns 0 and 1
            fn0 = os.path.join(store.db.colCacheDir, str(0))
            fn1 = os.path.join(store.db.colCacheDir, str(1))
            shutil.move(fn0, fn0 + ".bak")
            shutil.move(fn1, fn0)
            shutil.move(fn0 + ".bak", fn1)

            # The flag must be a real boolean on both paths so that a
            # missing exception produces the assertion message below
            # rather than an unrelated error.
            try:
                list(store.db.getColByIdx(0))
            except struct.error:
                caught = True
            else:
                caught = False
            self.assertTrue(
                caught, "moving cache file should have broken the cache")
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
        if os.path.exists(storefname):
            shutil.rmtree(storefname)
def _nan_to_none(values):
    """Return ``values`` as a list with every NaN entry replaced by ``None``.

    NaN never compares equal to itself, so two rows containing NaN could not
    be compared with ``==`` without this normalisation.
    """
    return [None if math.isnan(x) else x for x in values]


def main():
    """Validate stored descriptor rows against freshly computed ones.

    Reads the command line via the module-level ``parser``, opens the store
    at ``opts.storage`` and, for ``opts.samples`` rows, recomputes the
    descriptors from the stored molecule and asserts that they equal the
    stored row. ``opts.samples == -1`` means "check every row in order";
    otherwise rows are drawn at random (with replacement).

    Raises:
        AssertionError: if a stored row differs from the recomputed one, or
            if a molecule that cannot be processed has a non-zero stored row.
    """
    opts = parser.parse_args()
    store = DescriptaStore(opts.storage)
    try:
        N = len(store)
        gen = store.getDescriptorCalculator()

        randomize = True
        if opts.samples == -1:
            randomize = False
            opts.samples = len(store)

        # Progress is reported every time another 5% of the samples is done.
        next_report = .05
        for i in range(opts.samples):
            if i and float(i) / opts.samples > next_report:
                print("Validated %2.2f%%" % (next_report * 100))
                next_report += .05

            idx = random.randint(0, N - 1) if randomize else i

            v = store.descriptors().get(idx)
            smiles = store.molIndex().getMol(idx)

            # The name is only used in the failure message, so a failed
            # lookup is tolerated; ordinary errors only, so Ctrl-C and
            # SystemExit still propagate.
            name = None
            try:
                name = store.molIndex().getName(idx)
            except Exception:
                pass

            res = gen.process(smiles)
            if res is None:
                # Unprocessable molecule: the stored row must be all zeros.
                assert v == tuple([0] * len(v))
                continue

            # Reuse the result computed above instead of processing twice.
            data = _nan_to_none(res)
            v2 = _nan_to_none(v)
            assert v2 == data, "idx:%s smiles:%s name:%s \n%r\n\t%r" % (
                idx, smiles, name, v, data)
    finally:
        store.close()
def testOffByOne(self):
    """Check a one-molecule store and that an unknown InChIKey raises KeyError.

    Row 0 must be reachable by name and by InChIKey and hold the expected
    descriptor values; looking up a key that is not in the store must raise
    ``KeyError``.
    """
    try:
        fname = tempfile.mktemp() + ".smi"
        storefname = tempfile.mktemp() + ".store"
        with open(fname, 'w') as f:
            f.write(one_smiles)

        opts = make_store.MakeStorageOptions(
            storage=storefname,
            smilesfile=fname,
            hasHeader=False,
            batchsize=1,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=True)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(storefname)) as store:
            self.assertEqual(store.lookupName("0"), 0)
            self.assertEqual(
                store.lookupInchiKey("UHOVQNZJYSORNB-UHFFFAOYSA-N"), [0])
            self.assertEqual(store.descriptors().get(0),
                             (True, 78.046950192, 0.0, 1.0, 0.0, 1.0))

            # An unknown key must be reported, not silently resolved.
            with self.assertRaises(KeyError):
                store.lookupInchiKey("MY DOG HAS FLEAS")
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
        if os.path.exists(storefname):
            shutil.rmtree(storefname)
# NOTE(review): with action="store_true" and default=True this option is
# always True, so the "no name column" branch below cannot be selected from
# the command line -- confirm whether that is intended.
parser.add_argument("--output-name", action="store_true", default=True,
                    help="outputs the molecule name (if available)")

# to do?
#parser.add_argument("--output-smiles", action="store_true",
#                    help="outputs the smiles column")
#parser.add_argument("--output-inchi", action="store_true",
#                    help="outputs an inchi column (if available)")

parser.add_argument("--verbose", action="store_true",
                    help="Verbose logging")

opts = parser.parse_args()

store = DescriptaStore(opts.storage)

# Pick the CSV flavour from the requested separator.
# NOTE(review): no writer is created here for any other separator value --
# confirm that the option is restricted to "," and "\t" where it is declared.
if opts.seperator == ",":
    writer = csv.writer(sys.stdout)
elif opts.seperator == "\t":
    writer = csv.writer(sys.stdout, dialect=csv.excel_tab)

# Keeping missing descriptors implies keeping the calculated flags.
if opts.keep_missing_descriptors:
    opts.keep_calculatedflags = True

# Header row, optionally prefixed with the name column.
if opts.output_name:
    writer.writerow(['Name'] +
                    store.getDescriptorNames(opts.keep_calculatedflags))
else:
    writer.writerow(store.getDescriptorNames(opts.keep_calculatedflags))

# Without a name file every row of the store is selected.
if not opts.namefile:
    indices = range(len(store))
def testAppend(self):
    """Build a store from ``many_smiles``, append ``many_smiles2`` and verify both halves.

    Before the append, rows 0-10 are checked by name, InChIKey and
    descriptor values. After the append, row ``i + 11`` must have the same
    InChIKey as row ``i`` (for i in 0..9), the key lookup must return both
    indices, and rows 11-21 must hold the same descriptor values as
    rows 0-10.
    """
    # Value expected at position 1 of stored rows 0..9.
    col1_values = (
        78.046950192, 92.062600256, 106.07825032, 120.093900384,
        134.109550448, 148.125200512, 162.140850576, 176.15650064,
        190.172150704, 204.187800768,
    )
    expected_rows = [(True, value, 0.0, 1.0, 0.0, 1.0)
                     for value in col1_values]
    # Row 10 is expected to come back as an all-zero, flag-False row.
    expected_rows.append((False, 0.0, 0.0, 0.0, 0.0, 0.0))
    # Index at which the appended rows start.
    offset = len(expected_rows)

    # Each input file gets its own name so that the cleanup below can remove
    # both; reusing a single variable would leave the first file behind.
    fname = tempfile.mktemp() + ".smi"
    fname2 = tempfile.mktemp() + ".smi"
    storefname = tempfile.mktemp() + ".store"
    try:
        with open(fname, 'w') as f:
            f.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=storefname,
            smilesfile=fname,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=True)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(storefname)) as store:
            for i in range(10):
                self.assertEqual(store.lookupName(str(i)), i)

            for i in range(10):
                m = store.molIndex().getRDMol(i)
                inchi = AllChem.InchiToInchiKey(AllChem.MolToInchi(m))
                self.assertEqual(store.lookupInchiKey(inchi), [i])

            for i, row in enumerate(expected_rows):
                self.assertEqual(store.descriptors().get(i), row)

        with open(fname2, 'w') as f:
            f.write(many_smiles2)
        opts.smilesfile = fname2
        append_store.append_store(opts)

        with contextlib.closing(DescriptaStore(storefname)) as store:
            for i in range(10):
                self.assertEqual(store.lookupName(str(i)), i)

            for i in range(10):
                m = store.molIndex().getRDMol(i)
                inchi = AllChem.InchiToInchiKey(AllChem.MolToInchi(m))
                m = store.molIndex().getRDMol(i + offset)
                self.assertIsNotNone(m)
                inchi2 = AllChem.InchiToInchiKey(AllChem.MolToInchi(m))
                self.assertEqual(inchi, inchi2)
                self.assertEqual(store.lookupInchiKey(inchi),
                                 [i, i + offset])

            # The appended rows are read twice, as the original test did;
            # presumably to confirm that repeated reads are stable.
            for _ in range(2):
                for i, row in enumerate(expected_rows):
                    self.assertEqual(store.descriptors().get(offset + i),
                                     row)
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
        if os.path.exists(fname2):
            os.unlink(fname2)
        if os.path.exists(storefname):
            shutil.rmtree(storefname)
def testMany(self):
    """Build an InChIKey-indexed store from ``many_smiles`` and verify its contents.

    Checks, in order: name lookup for rows 0-9, stored descriptor rows 0-10,
    the dictionary form of row 7, and for rows 0-9 that the InChIKey lookup
    returns the row and that recomputing the descriptors from the molecule's
    SMILES reproduces the stored row.
    """
    # Value expected at position 1 of stored rows 0..9.
    col1_values = (
        78.046950192, 92.062600256, 106.07825032, 120.093900384,
        134.109550448, 148.125200512, 162.140850576, 176.15650064,
        190.172150704, 204.187800768,
    )
    try:
        smiles_path = tempfile.mktemp() + ".smi"
        store_dir = tempfile.mktemp() + ".store"

        with open(smiles_path, 'w') as handle:
            handle.write(many_smiles)

        opts = make_store.MakeStorageOptions(
            storage=store_dir,
            smilesfile=smiles_path,
            hasHeader=False,
            smilesColumn=0,
            nameColumn=1,
            seperator=" ",
            descriptors="RDKit2DSubset",
            index_inchikey=True)
        make_store.make_store(opts)

        with contextlib.closing(DescriptaStore(store_dir)) as store:
            for row in range(10):
                self.assertEqual(store.lookupName(str(row)), row)

            for row, value in enumerate(col1_values):
                self.assertEqual(store.descriptors().get(row),
                                 (True, value, 0.0, 1.0, 0.0, 1.0))
            # Row 10 is expected to come back as an all-zero, flag-False row.
            self.assertEqual(store.descriptors().get(10),
                             (False, 0.0, 0.0, 0.0, 0.0, 0.0))

            self.assertEqual(
                store.descriptors().getDict(7),
                toDict((True, 176.15650064, 0.0, 1.0, 0.0, 1.0)))

            calc = store.getDescriptorCalculator()

            for row in range(10):
                mol = store.molIndex().getRDMol(row)
                smiles = AllChem.MolToSmiles(mol)
                key = AllChem.InchiToInchiKey(AllChem.MolToInchi(mol))
                self.assertEqual(store.lookupInchiKey(key), [row])
                stored = store.descriptors().get(row)
                recomputed = tuple(calc.process(smiles))
                self.assertEqual(stored, recomputed)
    finally:
        if os.path.exists(smiles_path):
            os.unlink(smiles_path)
        if os.path.exists(store_dir):
            shutil.rmtree(store_dir)
from descriptastorus import DescriptaStore import argparse, os, logging, shutil, time, random import sys from rdkit import rdBase rdBase.DisableLog("rdApp.*") parser = argparse.ArgumentParser() parser.add_argument("storage", help="directory in which to store the descriptors") parser.add_argument("storage2", help="directory in which to store the descriptors") opts = parser.parse_args() store1 = DescriptaStore(opts.storage) store2 = DescriptaStore(opts.storage2) if store1.getDescriptorNames() != store2.getDescriptorNames(): logging.warning("Storages are not compatible, columns are different") s1 = set(store1.getDescriptorNames()) s2 = set(store2.getDescriptorNames()) if s1 == s2: logging.warning("Columns are the same but reordered") else: logging.warning("Extra columns in store1:\n\t%s", "\n\t".join(s1-s2)) logging.warning("Extra columns in store2:\n\t%s", "\n\t".join(s2-s1)) for i,(a,b) in enumerate(zip(store1.getDescriptorNames(), store2.getDescriptorNames())):