def main():
    """Run every descriptor calculator on the macrolide SMILES set.

    Reads the ``smiles`` column of ``data/macrolides_smiles.csv``,
    standardizes each entry, and passes the resulting list to the ECFP6,
    MACCS, RDKit 2D and macrocycle/mordred calculators, in that order.
    Each calculator is given the output name ``desc/macrolides_smiles.csv``
    (the input path with its leading ``data`` replaced by ``desc``).
    The elapsed time, as formatted by ``convert_time``, is printed last.
    """
    source_path = 'data/macrolides_smiles.csv'
    create_directory()

    table = pd.read_csv(source_path)
    smiles = []
    for raw in table['smiles'].values:
        smiles.append(standardize_smiles(raw))

    # The clock starts after loading and standardization, so only the
    # descriptor computations are timed.
    started = time.time()

    # Drop the 4-character 'data' prefix and put 'desc' in its place.
    target_name = 'desc' + source_path[4:]

    # ECFP6 fingerprints.
    ECFP6(smiles).compute_ECFP6(target_name)

    # MACCS fingerprints.
    MACCS(smiles).compute_MACCS(target_name)

    # RDKit 2D descriptors.
    RDKit_2D(smiles).compute_2Drdkit(target_name)

    # Mordred and macrocycle descriptors share one calculator instance.
    macrocycle = Macrocycle_Descriptors(smiles)
    macrocycle.mordred_compute(target_name)
    macrocycle.compute_mordred_macrocycle(target_name)

    print(convert_time(time.time() - started))
def testStandardizeSmLong(self):
    """Compare MolVS standardization with the 100k-record PCS data set.

    The test is skipped unless the module-level ``doLong`` flag is truthy.
    Any exception raised by ``molvs.standardize_smiles`` is reported as an
    assertion failure that names the offending line and SMILES; otherwise
    the standardized string must equal the record's expected value.
    """
    if not doLong:
        raise unittest.SkipTest('long test')

    records = self.readPCSdata(self.dataPCS_standardize_smiles100k)
    for data in records:
        try:
            standardized = molvs.standardize_smiles(data.smiles)
        except Exception:
            # Turn a crash into a failure that points at the input record.
            raise AssertionError(f'Line {data.lineNo}: MolVS standardization failed for SMILES {data.smiles}')
        else:
            self.assertEqual(standardized, data.expected)
def process(smi):
    """Standardize each '.'-separated component of a SMILES string.

    Returns ``None`` when the stripped input contains any token that is
    not a key of ``model.vocab2id``; otherwise returns a list with one
    ``molvs.standardize_smiles`` result per component.
    """
    cleaned = smi.strip()

    # Reject compounds that use tokens the model cannot generate.
    if not all(tok in model.vocab2id for tok in tokenize(cleaned)):
        return None

    standardized = []
    for fragment in cleaned.split('.'):
        standardized.append(molvs.standardize_smiles(fragment))
    return standardized
def testStandardizeSmLong(self):
    """Compare MolVS standardization with the 100k-record PCS data set.

    The test is skipped unless the module-level ``doLong`` flag is truthy.
    Any exception raised by ``molvs.standardize_smiles`` is reported as an
    assertion failure that names the offending line and SMILES; otherwise
    the standardized string must equal the record's expected value.
    """
    if not doLong:
        raise unittest.SkipTest('long test')

    for record in self.readPCSdata(self.dataPCS_standardize_smiles100k):
        try:
            standardized = molvs.standardize_smiles(record.smiles)
        except Exception:
            # Turn a crash into a failure that points at the input record.
            message = 'Line {0.lineNo}: MolVS standardization failed for SMILES {0.smiles}'.format(record)
            raise AssertionError(message)
        self.assertEqual(standardized, record.expected)
def main():
    """Compute ECFP6 fingerprints for the macrolide SMILES set.

    Reads the ``smiles`` column of ``data/macrolides_smiles.csv``,
    standardizes each entry, and runs the ECFP6 calculator with the
    output name ``desc/macrolides_smiles.csv`` (the input path with its
    leading ``data`` replaced by ``desc``).
    """
    source_path = 'data/macrolides_smiles.csv'
    table = pd.read_csv(source_path)

    smiles = []
    for raw in table['smiles'].values:
        smiles.append(standardize_smiles(raw))

    # Drop the 4-character 'data' prefix and put 'desc' in its place.
    target_name = 'desc' + source_path[4:]

    # ECFP6 fingerprints.
    ECFP6(smiles).compute_ECFP6(target_name)
def process(fname):
    """Filter, annotate and re-save the compounds stored in ``fname``.

    ``fname`` is a JSON file whose base name (minus ``.json``) parses as an
    integer label and whose content is a list of dicts with a ``'smiles'``
    key.  Entries are dropped when the SMILES is ``None``, when
    ``molvs.validate_smiles`` reports anything, or when the standardized
    SMILES is already in ``pubchem``.  For each survivor an ATC code is
    predicted, the molecular formula is computed, and a PNG named after the
    MD5 of the SMILES is written to ``images_dir``.  The resulting records
    are written back to ``fname``, replacing its previous content.
    """
    # The numeric file stem is the label attached to every record.
    label = int(os.path.basename(fname).replace('.json', ''))

    with open(fname, 'r') as f:
        entries = json.load(f)

    kept = []
    for entry in entries:
        raw = entry['smiles']
        # Skip missing SMILES and anything with validation errors.
        if raw is None or molvs.validate_smiles(raw):
            continue
        standardized = molvs.standardize_smiles(raw)
        # Skip compounds that are already known.
        if standardized in pubchem:
            continue
        kept.append(standardized)

    # One batched prediction, mapped through the lookup table.
    predictions = atc_model.predict(kept)
    atc_codes = [atc_lookup[i] for i in predictions]

    results = []
    for smi, atc_code in zip(kept, atc_codes):
        mol = Chem.MolFromSmiles(smi)
        formula = CalcMolFormula(mol)

        # The image file name is the MD5 hex digest of the SMILES.
        digest = md5(smi.encode('utf8')).hexdigest()
        im_path = os.path.join(images_dir, '{}.png'.format(digest))
        Draw.MolToImage(mol).save(im_path)

        record = {
            'label': label,
            'smiles': smi,
            'formula': formula,
            'image': im_path,
            'atc_code': atc_code,
            'created_at': datetime.utcnow().isoformat()
        }
        results.append(record)

    # Save generated compounds (overwrites the input file).
    with open(fname, 'w') as f:
        json.dump(results, f)
def get_cmpd_information(molec):
    """Fill in ``molec.InChi``, ``molec.mol`` and ``molec.SMILES`` from SABIO.

    When ``molec.InChi`` is None, the SABIO compound-details endpoint is
    queried by ``molec.cID`` and the InChI field of the reply is stored on
    ``molec``.  The InChI is then converted to an RDKit molecule and a
    standardized SMILES.  Whenever a step fails, the affected attributes
    are set to None instead of raising.

    Args:
        molec: object exposing ``cID`` and ``InChi``; its ``InChi``, ``mol``
            and ``SMILES`` attributes are assigned in place.

    Returns:
        None.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status`` when the
            web service answers with an error status.
    """
    # NOTE(review): this block was re-indented from a flattened source line;
    # the nesting of the if/else branches below is a reconstruction and
    # should be confirmed against version control.
    QUERY_URL = ('http://sabiork.h-its.org/sabioRestWebServices/'
                 'searchCompoundDetails')
    # input: SabioCompoundID
    # valid output fields: "fields[]":["Name","ChebiID",
    #                                  "PubChemID","InChI",
    #                                  "SabioCompoundID","KeggCompoundID"]
    params = {
        "SabioCompoundID": molec.cID,
        "fields[]": ["Name", "ChebiID", "PubChemID", "InChI"]
    }
    # Only hit the web service when no InChI is known yet.
    if molec.InChi is None:
        request = requests.post(QUERY_URL, params=params)
        request.raise_for_status()
        if request.text == 'No results found for query':
            molec.mol = None
        else:
            # results: the second line of the reply is split on tabs and
            # must hold exactly four fields; only the last one (InChI) is
            # kept.
            txt = request.text.split('\n')[1].split('\t')
            _, _, _, molec.InChi = txt
    # The literal string 'null' is treated as "no InChI available".
    if molec.InChi != 'null':
        print('collect SMILES from SABIO InChi')
        molec.mol = get_rdkit_mol_from_InChi(molec.InChi)
        if molec.mol is not None:
            # SMILES is generated from the molecule with hydrogens removed.
            smiles = Chem.MolToSmiles(Chem.RemoveHs(molec.mol))
            molec.SMILES = smiles
            try:
                molec.SMILES = standardize_smiles(molec.SMILES)
            except ValueError:
                # A standardization failure is taken to mean the SMILES was
                # invalid, so the compound is blanked out rather than kept.
                print('standardization failed - therefore assume')
                print('SMILES were invalid - skip')
                molec.SMILES = None
                molec.mol = None
                # import sys
                # sys.exit()
        else:
            molec.SMILES = None
    else:
        molec.mol = None
        molec.SMILES = None
import json
import molvs
import random
import policies
from tqdm import tqdm
from mcts import Node, mcts
import tensorflow as tf
from rdkit import Chem
from rdkit.Chem import AllChem

# Load base compounds: one SMILES per line, standardized with MolVS and
# collected in a set so duplicates collapse.
starting_mols = set()
with open('data/emolecules.smi', 'r') as f:
    for line in tqdm(f, desc='Loading base compounds'):
        smi = line.strip()
        smi = molvs.standardize_smiles(smi)
        starting_mols.add(smi)
print('Base compounds:', len(starting_mols))

# Load policy networks. The rules file supplies separate rule lists for the
# rollout and expansion policies; each network is sized by its list length.
with open('model/rules.json', 'r') as f:
    rules = json.load(f)
    rollout_rules = rules['rollout']
    expansion_rules = rules['expansion']
rollout_net = policies.RolloutPolicyNet(n_rules=len(rollout_rules))
expansion_net = policies.ExpansionPolicyNet(n_rules=len(expansion_rules))
filter_net = policies.InScopeFilterNet()

# tf.Session / tf.global_variables_initializer are TensorFlow 1.x graph-mode
# APIs. `init` is only created here; it is not run in the visible portion.
sess = tf.Session()
init = tf.global_variables_initializer()
def __init__(self, dcompound, logfile, writecheck=1, kSMILES="CANONICAL_SMILES", kID="CMPD_CHEMBLID"):
    """Standardize and desalt one compound, then load it as a molecule.

    The outcome is recorded in ``self.log``: "OK" on success, "ERROR" when
    any step fails.  On failure the method returns early and ``self.mol``
    is not set, so callers should check ``self.log`` first.

    Args:
        dcompound: dict describing the compound.  It is stored as
            ``self.compound`` (same object, not a copy) and its ``kSMILES``
            entry is overwritten with the cleaned SMILES.  When ``kSMILES``
            is missing, the SMILES is derived from ``dcompound["sdf"]``.
        logfile: writable file-like object that receives one tab-separated
            line per rejected compound.
        writecheck: when equal to 1, the cleaned SMILES (and the SDF, when
            present) are written under ``pathFolder.PR_COMPOUNDS``.
        kSMILES: key of the SMILES entry in ``dcompound``.
        kID: key of the compound identifier in ``dcompound``.
    """
    self.compound = dcompound
    loader = pydrug.PyDrug()
    # If no SMILES is provided, derive it from the SDF record.
    if not kSMILES in dcompound.keys():
        try:
            smile = runExternalSoft.babelConvertSDFtoSMILE(
                dcompound["sdf"])
            self.compound[kSMILES] = smile
        except:
            print "ERROR INPUT SDF - l33"
            self.log = "ERROR"
            # Logging is best effort here: the ID itself may be missing.
            try:
                logfile.write(self.compound[kID] + "\t---\tERROR-SDF ORIGINAL INPUT\n")
            except:
                pass
            return
    # Standardize the SMILES string; any failure rejects the compound.
    try:
        smilestandadized = standardize_smiles(self.compound[kSMILES])
    except:
        logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tERROR-SMILES INPUT" "\n")
        self.log = "ERROR"
        return
    # Standardize using molvs (http://molvs.readthedocs.io/en/latest/api.html#molvs-fragment)
    s = Standardizer()
    mol = Chem.MolFromSmiles(smilestandadized)
    molstandardized = s.standardize(mol)
    smilestandadized = Chem.MolToSmiles(molstandardized)
    # Remove salts
    # 1. Default salt definitions
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(smilestandadized)
    molcleandefault = remover(mol)
    # 2. Custom salt definitions (LSALT)
    homeremover = SaltRemover(defnData=LSALT)
    molclean = homeremover(molcleandefault)
    smilesclean = Chem.MolToSmiles(molclean)
    # 3. Remove remaining manually listed salts at the SMILES level; when
    #    several fragments are the exact same compound, keep only one.
    lelem = smilesclean.split(".")
    if len(lelem) > 1:
        # Collapse duplicates (several copies of a salt may be present) - 255
        lelem = list(set(lelem))
        for smilesdel in LSMILESREMOVE:
            if smilesdel in lelem:
                lelem.remove(smilesdel)
        try:
            lelem.remove("")  # empty component left by a bad SMILES
        except:
            pass
        if len(lelem) == 1:
            smilesclean = str(lelem[0])
        else:
            # 4. Fragments
            # Anything other than exactly one remaining component is
            # rejected and recorded in the log file for a later check.
            logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tFRAGMENT IN INPUT" "\n")
            print ".".join(lelem), " - FRAGMENTS - l66"
            self.log = "ERROR"
            return
    else:
        pass
    print self.compound[kSMILES], "SMILES IN - l25 liganddescriptors"
    print smilesclean, "SMILES without salt and standardized"
    # Case where the input contained only salts
    if smilesclean == "":
        logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tEMPTY SMILES AFTER " "STANDARDIZATION\n")
        print "EMPTY SMILES AFTER STANDARDIZATION - l84"
        self.log = "ERROR"
        return
    self.compound[kSMILES] = smilesclean
    self.log = "OK"
    if writecheck == 1:
        # Cleaned SMILES
        pfileSMILES = pathFolder.PR_COMPOUNDS + str(
            dcompound[kID]) + ".smi"
        fileSMILES = open(pfileSMILES, "w")
        fileSMILES.write(self.compound[kSMILES])
        fileSMILES.close()
        # Original SDF input, when one was supplied
        if "sdf" in self.compound.keys():
            pfileSDF = pathFolder.PR_COMPOUNDS + str(
                dcompound[kID]) + ".sdf"
            fileSDF = open(pfileSDF, "w")
            fileSDF.write(self.compound["sdf"])
            fileSDF.close()
    # Read the molecule from the cleaned SMILES
    self.mol = loader.ReadMolFromSmile(self.compound[kSMILES])
def testStandardizeSmShort(self):
    """Compare MolVS standardization with the 1k-record PCS data set.

    For every record, the result of ``molvs.standardize_smiles`` must equal
    the record's expected value.
    """
    records = self.readPCSdata(self.dataPCS_standardize_smiles1k)
    for record in records:
        self.assertEqual(molvs.standardize_smiles(record.smiles), record.expected)