def testRingEncoding(self):
    """Check ring encoding with the rings-only and rings+branches converters.

    Each case is ``(smiles, rings_only)`` or
    ``(smiles, rings_only, rings_and_branches)``. When the third item is
    absent, the combined converter must produce the rings-only string.
    """
    rings_only = ds.Converter(rings=True)
    rings_and_branches = ds.Converter(rings=True, branches=True)

    cases = [
        # smi, DeepSmiles/RC, DeepSMILES/RC+PN (if different)
        ("C1CCCC1", "CCCCC5"),
        ("C%10CCC%10", "CCCC4"),
        ("C1CCCCCCCCC1", "CCCCCCCCCC%10"),
        ("C1CC(OC)CC1", "CCC(OC)CC5", "CCCOC))CC5"),
        (r"N1CC=C/1\Br", r"NCC=C/4\Br"),
        (r"N\1CC=C1\Br", r"NCC=C/4\Br"),
        (r"C1=C/CCCCCC/1", "C=C/CCCCCC/8"),
        (r"C\1=C/CCCCCC1", "C=C/CCCCCC/8"),
        (r"C\1=C/CCCCCC/1", "C=C/CCCCCC/8"),
        ("C1N[C@@]12CO2", "CN[C@@]3CO3"),
        ("[C@@]12(NC1)CO2", "[C@@](NC3)CO3", "[C@@]NC3))CO3"),
        ("CC1CCCO[C@]21CCCCO2", "CCCCCO[C@@]6CCCCO6"),
        ("CC1CCCO[C@@]12CCCCO2", "CCCCCO[C@@]6CCCCO6"),
        ("NC[C@]12CCCC1C3CC2CC3", "NC[C@]CCCC5CCC8CC5"),
        ("NC[C@]12CCCC2C3CC1CC3", "NC[C@@]CCCC5CCC8CC5"),
        ("C2C1=C/CCCCCC/12", "CC=C/CCCCCC/89"),
        ("C1C2=C/CCCCCC1/2", "CC=C/CCCCCC9/8"),
    ]

    for case in cases:
        smi, expected_rings = case[0], case[1]
        self.assertEqual(expected_rings, rings_only.encode(smi))
        # Fall back to the rings-only expectation when no combined one is given.
        expected_both = expected_rings if len(case) == 2 else case[2]
        self.assertEqual(expected_both, rings_and_branches.encode(smi))
def testRingDecoding(self):
    """Check ring decoding with the rings-only and rings+branches converters.

    Each case is ``(smiles, deepsmiles)`` with an optional third item giving
    the input for the combined converter. An empty third item means the
    combined converter is not exercised for that case.
    """
    rings_only = ds.Converter(rings=True)
    rings_and_branches = ds.Converter(rings=True, branches=True)

    cases = [
        # smi, DeepSmiles/RC
        ("C1CCC1", "CCCC4"),
        ("C2CC1CCC1C2", "CCCCCC4C7"),
        ("c1c[nH]cc1", "cc[nH]cc5"),
        ("C1CCCCCCCCC1", "CCCCCCCCCC%10"),
        ("C1CCCCCCCCC1", "CCCCCCCCCC%(10)"),
        ("CCCCCCC1CCC1", "CCCCCCCCCC%(4)"),
        ("C1=C/CCCCCC/1", "C=C/CCCCCC/8"),
        ("C1C1C2C2C3C3C4C4C5C5C6C6C7C7C8C8C9C9C%10C%10", "CC2CC2CC2CC2CC2CC2CC2CC2CC2CC2"),
        ("C(CS)N", "C(CS)N", ""),
        ("C[C@@H]1CCCO[C@]12CCCCO2", "C[C@@H]CCCO[C@]6CCCCO6"),
        ("C2C1=C/CCCCCC/12", "CC=C/CCCCCC/89"),
        ("C1C2=C/CCCCCC1/2", "CC=C/CCCCCC9/8"),
    ]

    for case in cases:
        smi, deep = case[0], case[1]
        self.assertEqual(smi, rings_only.decode(deep))

        combined_input = case[2] if len(case) == 3 else deep
        if not combined_input:
            # Explicitly empty input: skip the combined converter.
            continue
        self.assertEqual(smi, rings_and_branches.decode(combined_input))
def testStringRep(self):
    """Check ``str(Converter)`` for every combination of rings/branches."""
    cases = [
        ({}, "Converter(rings=False, branches=False)"),
        ({"rings": True}, "Converter(rings=True, branches=False)"),
        ({"branches": True}, "Converter(rings=False, branches=True)"),
        ({"rings": True, "branches": True}, "Converter(rings=True, branches=True)"),
    ]
    for kwargs, expected in cases:
        self.assertEqual(str(ds.Converter(**kwargs)), expected)
def testDecodingExceptions(self):
    """Check that malformed DeepSMILES strings raise ``ds.DecodeError``."""
    invalid_for_both = [
        "C8", "C))I", "%10C", "9C", "CCCCCC%(3CC", "C%(100)",
        "C[C@@CCl", "C%CC", "-5cc[nH]9",
    ]
    both = ds.Converter(rings=True, branches=True)
    for dsmi in invalid_for_both:
        with self.assertRaises(ds.DecodeError):
            both.decode(dsmi)

    # Test just the ring decoder
    invalid_for_rings = ["C8", "%10C", "9C", "C%(100)", "C[C@@CCl", "C%CC", "-5cc[nH]9"]
    rings_only = ds.Converter(rings=True)
    for dsmi in invalid_for_rings:
        with self.assertRaises(ds.DecodeError):
            rings_only.decode(dsmi)
def main():
    """Store a DeepSMILES string on every document of ``uniqueNaturalProduct``.

    Connects to MongoDB at ``localhost:27018``, reads ``coconut_id`` and
    ``unique_smiles`` from the ``uniqueNaturalProduct`` collection of the
    ``COCONUT2020-07`` database, encodes each SMILES with ring and branch
    conversion enabled, and writes the result to the ``deep_smiles`` field of
    the document with the same ``coconut_id``. Prints ``done`` at the end.
    """
    converter = deepsmiles.Converter(rings=True, branches=True)
    projection = [{'$project': {'_id': 0, 'coconut_id': 1, 'unique_smiles': 1}}]

    # The context manager closes the client even if an encode or update fails.
    with MongoClient("localhost:27018") as client:
        products = client['COCONUT2020-07'].uniqueNaturalProduct

        # Read the whole projection before updating, so the collection is not
        # being iterated while it is modified. Plain dicts are enough here;
        # no DataFrame is needed and values keep their native BSON types.
        documents = list(products.aggregate(projection))

        # for each row - get unique_smiles -> convert to deep smiles and save at deep_smiles
        for document in documents:
            deep_smiles = converter.encode(document['unique_smiles'])
            products.update_one(
                {'coconut_id': document['coconut_id']},
                {"$set": {"deep_smiles": deep_smiles}},
            )

    print("done")
def testRoundTripRingClosures(self):
    """Round-trip a SMILES with 100 nested ring closures.

    The input opens ring closures ``%(1)`` .. ``%(100)`` on consecutive
    carbons and then closes them in reverse order. After encode + decode the
    result must still contain ``%(100)``, with and without branch conversion.
    """
    ascending = list(range(1, 101))
    # "C%(1)C%(2)...C%(100)" followed by "C%(100)C%(99)...C%(1)".
    smi = "".join("C%({})".format(num) for num in ascending + ascending[::-1])

    for branches in (True, False):
        converter = ds.Converter(rings=True, branches=branches)
        round_tripped = converter.decode(converter.encode(smi))
        self.assertIn("%(100)", round_tripped)
def testBranchEncoding(self):
    """Check branch encoding with the branches-only and rings+branches converters.

    Each case is ``(smiles, branches_only)`` or
    ``(smiles, branches_only, rings_and_branches)``. When the third item is
    absent, the combined converter must produce the branches-only string.
    """
    branches_only = ds.Converter(branches=True)
    rings_and_branches = ds.Converter(rings=True, branches=True)

    cases = [
        # smi, DeepSmiles/PN, DeepSMILES/RC+PN (if different)
        ("C(O)C", "CO)C"),
        ("C([O])C", "C[O])C"),
        ("C(OF)C", "COF))C"),
        ("C(F)(F)C", "CF)F)C"),
        ("C(Cl)(Cl)C", "CCl)Cl)C"),
        ("C(=O)Cl", "C=O)Cl"),
        ("C(OC(=O)Cl)I", "COC=O)Cl)))I"),
        ("[C@@H](Br)(Cl)I", "[C@@H]Br)Cl)I"),
        ("B(c1ccccc1)(O)O", "Bc1ccccc1))))))O)O", "Bcccccc6))))))O)O"),
        ("Cn1cccc-2nccc12", "Cn1cccc-2nccc12", "Cnccccnccc9-5"),
    ]

    for case in cases:
        smi, expected_branches = case[0], case[1]
        self.assertEqual(expected_branches, branches_only.encode(smi))
        # Fall back to the branches-only expectation when no combined one is given.
        expected_both = expected_branches if len(case) == 2 else case[2]
        self.assertEqual(expected_both, rings_and_branches.encode(smi))
def sm2ds(line):
    """Convert a line of SMILES into DeepSMILES, molecule by molecule.

    All spaces are removed from ``line``, the result is split on ``"."``,
    each piece is encoded with ring and branch conversion enabled, and the
    encoded pieces are joined back with ``"."``.
    """
    # Takes schwaller's preprocessed SMILES and turns them into deepSMILES
    converter = ds.Converter(rings=True, branches=True)
    compact = line.replace(" ", "")
    return ".".join(converter.encode(part) for part in compact.split("."))
def ds2sm(line):
    """Convert a line of DeepSMILES back into SMILES, molecule by molecule.

    All spaces are removed from ``line``, the result is split on ``"."``,
    each piece is decoded with ring and branch conversion enabled, and the
    decoded pieces are joined back with ``"."``.
    """
    # Takes a deepSMILES line and turns it into SMILES
    converter = ds.Converter(rings=True, branches=True)
    compact = line.replace(" ", "")
    return ".".join(converter.decode(part) for part in compact.split("."))
def testBranchDecoding(self):
    """Check that the branches-only converter decodes each case to its SMILES."""
    branches_only = ds.Converter(branches=True)

    cases = [
        # smi, DeepSmiles/PN
        ("COC", "COC"),
        ("C(O)C", "CO)C"),
        ("C(=O)C", "C=O)C"),
        ("C[O]C", "C[O]C"),
        ("C(OC(=O)Cl)I", "COC=O)Cl)))I"),
        ("C(F)(F)C", "CF)F)C"),
        ("Cn1ccnc1", "Cn1ccnc1"),
        ("c1ccn(cc1)O", "c1ccncc1))O"),
        ("Cn1cccc-2nccc12", "Cn1cccc-2nccc12"),
    ]

    for smi, deep in cases:
        self.assertEqual(smi, branches_only.decode(deep))
def convert_smiles(smiles=False, deep=False):
    """Convert between SMILES and DeepSMILES.

    Pass a string for exactly one of the two arguments; it is converted to
    the other notation using ring and branch conversion.

    Args:
        smiles: SMILES string to encode, or a falsy value.
        deep: DeepSMILES string to decode, or a falsy value.

    Returns:
        The converted string. If both arguments are provided, a message is
        printed and an empty tuple is returned. If neither is provided, the
        result is ``None``.
    """
    converter = deepsmiles.Converter(rings=True, branches=True)

    if smiles:
        if deep:
            # Ambiguous request: refuse to pick a direction.
            print('Only provide a string for one of smiles or deep')
            return ()
        # Convert from SMILES to DeepSMILES
        return converter.encode(smiles)

    if deep:
        # Convert from DeepSMILES to SMILES
        return converter.decode(deep)

    return None
def main():
    """Predict a SMILES string for one input and write it to ``Predictions.txt``.

    Parses the required ``--input`` argument, loads the tokenizer from
    ``tokenizer.pkl``, builds the encoder, decoder and optimizer, runs
    ``predictor`` on the input, strips the ``<start>``/``<end>`` tokens from
    the joined result, decodes it from DeepSMILES and writes it, preceded by
    a timestamp header, to ``Predictions.txt`` in the working directory.
    """
    parser = argparse.ArgumentParser(description="Predicting test images")
    # Input Arguments
    parser.add_argument('--input', help='Enter the input filename', required=True)
    args = parser.parse_args()

    # Close the file handle deterministically instead of leaking it.
    # NOTE(review): pickle can execute arbitrary code on load; only use a
    # tokenizer.pkl from a trusted source.
    with open("tokenizer.pkl", "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    # Prediction model parameters
    embedding_dim = 600
    units = 1024
    vocab_size = len(tokenizer.word_index) + 1

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
    encoder = I2S_Model.CNN_Enc(embedding_dim)
    decoder = I2S_Model.RNN_Dec(embedding_dim, units, vocab_size)

    # Initialize DeepSMILES
    converter = deepsmiles.Converter(rings=True, branches=True)

    with open('Predictions.txt', 'w') as f:
        print(datetime.now().strftime('%Y/%m/%d %H:%M:%S'), "Predictions\n\n", flush=True, file=f)

        result = predictor(args.input, encoder=encoder, decoder=decoder, optimizer=optimizer, tokenizer=tokenizer)
        # Drop the sequence delimiters before decoding the DeepSMILES string.
        predicted_deep = ''.join(result).replace("<start>", "").replace("<end>", "")
        print(
            converter.decode(predicted_deep),
            '\tPredictedSmiles',
            flush=True,
            file=f,
        )
def convert_to_deepsmile(smiles_list):
    """Convert a SMILES list to DeepSMILES, keeping only entries that round-trip.

    Each SMILES is encoded to DeepSMILES, decoded back, and re-canonicalised
    with ``Chem.MolToSmiles``. The DeepSMILES string is kept only when that
    canonical form equals the input string, so the inputs must already be in
    the canonical form ``Chem.MolToSmiles`` produces.

    Entries that fail to decode are reported on stdout and skipped, as are
    entries whose decoded SMILES cannot be parsed. The result can therefore
    be shorter than the input.

    Args:
        smiles_list: Iterable of canonical SMILES strings.

    Returns:
        List of DeepSMILES strings for the inputs that round-trip, in input
        order.
    """
    converter = deepsmiles.Converter(rings=True, branches=True)
    final_deep_lst = []

    for smi in smiles_list:
        deep = converter.encode(smi)
        try:
            decoded = converter.decode(deep)
        except deepsmiles.DecodeError as e:
            print("DecodeError! Error message was {}".format(e.message))
            # Nothing to compare against; previously the resulting None was
            # handed to Chem.MolFromSmiles, which raises on a non-string.
            continue

        # Parse once and reuse the molecule for canonicalisation.
        mol = Chem.MolFromSmiles(decoded)
        if mol is None:
            continue
        if Chem.MolToSmiles(mol) == smi:
            final_deep_lst.append(deep)

    return final_deep_lst
import os, sys sys.path.append('VAE_dependencies') sys.path.append('../../VAE_dependencies') print(sys.argv) import numpy as np import yaml import torch from torch import nn from random import shuffle from data_loader import multiple_smile_to_hot, grammar_one_hot_to_smile import pandas as pd from GPlus2S import GrammarPlusToSMILES, IncludeRingsForSMILES import deepsmiles converter = deepsmiles.Converter( rings=True, branches=True) # Coverter object, described by authors from rdkit.Chem import MolFromSmiles from rdkit import rdBase rdBase.DisableLog('rdApp.error') import time def _make_dir(directory): os.makedirs(directory) def save_models(encoder, decoder, epoch): out_dir = './saved_models/{}'.format(epoch) _make_dir(out_dir)
def decode(generated, start='<M>', end='</M>'):
    """Extract the DeepSMILES payload from ``generated`` and decode it.

    ``DeepSMILESLanguageModelUtils.extract`` is called with ``generated``,
    ``start`` and ``end``; its result is decoded with ring and branch
    conversion enabled and returned.
    """
    payload = DeepSMILESLanguageModelUtils.extract(generated, start, end)
    return deepsmiles.Converter(rings=True, branches=True).decode(payload)
def deepsml(x):
    """Encode every value of a mapping as DeepSMILES.

    Args:
        x: Mapping whose values are passed to ``Converter.encode``
            (presumably SMILES strings; verify against the caller).

    Returns:
        A new dict with the same keys and the encoded values.
    """
    # The converter does not depend on the item, so build it once rather than
    # once per iteration.
    converter = deepsmiles.Converter(rings=True, branches=True)
    return {key: converter.encode(smi) for key, smi in x.items()}
def testDecodingBasic(self):
    """A single carbon decodes to itself with the default converter."""
    default_converter = ds.Converter()
    self.assertEqual("C", default_converter.decode("C"))
from rdkit import Chem from rdkit.Chem import AllChem import re import deepsmiles f = open('allspec_90') ccc = 0 for line in f: if re.search('pred', line): try: smi = deepsmiles.Converter(rings=True, branches=True).decode( line.split()[1].split('<')[0].strip()) except: smi = None if smi: try: csmi = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) except: pass if csmi: l = [] mm = Chem.MolFromSmiles(csmi) mm = AllChem.AddHs(mm) for at in mm.GetAtoms(): l.append(at.GetSymbol()) cs = l.count('C') hs = l.count('H') os = l.count('O') print('pred', csmi, cs, hs, os) if re.search('real', line): try:
tokenizer = pickle.load(open("tokenizer.pkl","rb")) #Prediction model parameters embedding_dim = 600 units = 1024 vocab_size = len(tokenizer.word_index) + 1 optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005) encoder = I2S_Model.CNN_Encoder(embedding_dim) decoder = I2S_Model.RNN_Decoder(embedding_dim, units, vocab_size) #Initialize DeepSMILES converter = deepsmiles.Converter(rings=True, branches=True) #Evaluator def evaluate(image): maxlength = 74 #should be determined from running the training script hidden = decoder.reset_state(batch_size=1) temp_input = tf.expand_dims(I2S_evalData.load_image(image)[0], 0) img_tensor_val = I2S_evalData.image_features_extract_model(temp_input) img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3])) features = encoder(img_tensor_val) dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
""" if not mol: return None if random_type == "unrestricted": return rkc.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=False) if random_type == "restricted": new_atom_order = list(range(mol.GetNumAtoms())) random.shuffle(new_atom_order) random_mol = rkc.RenumberAtoms(mol, newOrder=new_atom_order) return rkc.MolToSmiles(random_mol, canonical=False, isomericSmiles=False) raise ValueError("Type '{}' is not valid".format(random_type)) DEEPSMI_CONVERTERS = { "rings": ds.Converter(rings=True), "branches": ds.Converter(branches=True), "both": ds.Converter(rings=True, branches=True) } def to_deepsmiles(smi, converter="both"): """ Converts a SMILES strings to the DeepSMILES alternative. :param smi: SMILES string. :return : A DeepSMILES string. """ return DEEPSMI_CONVERTERS[converter].encode(smi) def from_deepsmiles(deepsmi, converter="both"):
output = open(args.output_fn, 'w') rings = args.rings branches = args.branches do_encode = args.do_encode do_decode = args.do_decode if do_decode: do_encode = False assert(not (do_encode and do_decode)) if not (rings or branches): print("use at least --no-rings or --no-branches", file=sys.stderr) sys.exit(1) count = 0 # work ---------------------------------------------- smi_supplier = RobustSmilesSupplier(input_fn) converter = deepsmiles.Converter(rings, branches) if do_encode: for smi, name in smi_supplier: deep_smi = encode(converter, smi) print("%s\t%s" % (deep_smi, name), file=output) count += 1 else: # decode for deep_smi, name in smi_supplier: smi = decode(converter, deep_smi) if smi != None: print("%s\t%s" % (smi, name), file=output) count += 1 after = time.time() dt = after - before print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)