def __call__(self, smiles, radius=3, bit_len=4096, scaffold=0):
    """Compute Morgan bit-vector fingerprints for a sequence of SMILES.

    Args:
        smiles: Sized iterable of SMILES strings.
        radius: Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
        bit_len: Number of bits per fingerprint (passed as ``nBits``).
        scaffold: ``0`` fingerprints the parsed molecule, ``1`` the result of
            ``MurckoScaffold.GetScaffoldForMol``, ``2`` the result of
            ``MurckoScaffold.MakeScaffoldGeneric`` applied to the molecule.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(smiles), bit_len)``. Rows whose
        fingerprint could not be computed stay all-zero, and the exception is
        appended to ``self.builder.errors``.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smile)
            # Validate *before* touching the scaffold helpers: they raise an
            # opaque error on a None molecule, which used to hide the
            # descriptive message below whenever scaffold was 1 or 2.
            if not mol:
                raise Exception(
                    f'Failed to calculate Morgan fingerprint (creating RDKit instance from smiles failed: {smile})'
                )
            if scaffold == 1:
                mol = MurckoScaffold.GetScaffoldForMol(mol)
            elif scaffold == 2:
                mol = MurckoScaffold.MakeScaffoldGeneric(mol)
            arr = np.zeros((1, ))
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception as exp:
            # TODO: use a more specific exception related to descriptor errors
            self.builder.errors.append(exp)
            fps[i, :] = 0
    return pd.DataFrame(fps)
def get_fragments(self):
    """Collect Murcko-scaffold SMILES of the RECAP fragments and of the molecule.

    Returns:
        ``None`` when any of ``is_mol()``, ``is_small()`` or ``has_2_rings()``
        returns ``False``. Otherwise a list of scaffold SMILES: first those of
        the direct RECAP children that have at least two rings, then the
        scaffold of the molecule itself. Duplicates are not removed.
    """
    # All three predicates are evaluated, as before; any explicit False aborts.
    if False in (self.is_mol(), self.is_small(), self.has_2_rings()):
        return None

    fragments = []

    # 1st: scaffolds of the RECAP fragments
    hierarch = Recap.RecapDecompose(self.get_mol())
    for node in hierarch.children.values():
        frag_mol = node.mol
        if frag_mol is None:
            continue
        # Count rings via GetSymmSSSR, which returns the rings themselves.
        # Chem.GetSSSR returns an int in older RDKit but a ring sequence in
        # newer releases, so comparing its result with 2 is not portable.
        # For a "< 2" threshold both ring sets give the same answer.
        if len(Chem.GetSymmSSSR(frag_mol)) < 2:
            continue
        try:
            core = MurckoScaffold.GetScaffoldForMol(frag_mol)
        except ValueError:
            # scaffold calculation not possible
            continue
        smiles_scf = Chem.MolToSmiles(core)
        # Keep only scaffolds whose SMILES parses back into a molecule.
        if Chem.MolFromSmiles(smiles_scf) is None:
            continue
        fragments.append(smiles_scf)

    # 2nd: scaffold of the molecule itself
    try:
        core = MurckoScaffold.GetScaffoldForMol(self.get_mol())
        smiles_scf = Chem.MolToSmiles(core)
        if Chem.MolFromSmiles(smiles_scf) is not None:
            fragments.append(smiles_scf)
    except ValueError:
        # scaffold calculation not possible
        pass

    return fragments
def GetScaffold(mol, generic_framework=False):
    """Return the SMILES of a molecule's Murcko scaffold.

    Usage:
        result = GetScaffold(mol)

    Args:
        mol: RDKit molecule object.
        generic_framework: When truthy, the scaffold is additionally passed
            through ``MurckoScaffold.MakeScaffoldGeneric`` and the SMILES of
            that generic framework is returned instead.

    Returns:
        SMILES string of the scaffold (or of its generic framework).
    """
    core = MurckoScaffold.GetScaffoldForMol(mol)
    # Plain truthiness instead of "== True": the flag is documented as a bool.
    if generic_framework:
        core = MurckoScaffold.MakeScaffoldGeneric(core)
    return Chem.MolToSmiles(core)
def extract_murcko_scaffolds(mols, verbose=True):
    """Extract Bemis-Murcko scaffolds and generic frameworks from molecules.

    :param mols: iterable of RDKit molecules; ``None`` entries are skipped.
    :param verbose: when true, print how many scaffolds were extracted and the
        elapsed wall-clock time.
    :return: tuple ``(scaf, generic_scaf)`` of two equally long lists holding
        the scaffold SMILES and the generic-framework SMILES. A molecule that
        raises ``ValueError`` contributes the placeholder ``['error']`` (a
        one-element list, kept for backward compatibility) to both lists.
    """
    scaf = []
    generic_scaf = []
    start = time.time()
    for mol in mols:
        if mol is None:
            continue
        try:
            core = MurckoScaffold.GetScaffoldForMol(mol)
            fw = MurckoScaffold.MakeScaffoldGeneric(core)
            # Compute both strings before appending anything, so a failure on
            # the second one cannot leave the two lists out of step.
            core_smiles = Chem.MolToSmiles(core, isomericSmiles=True)
            fw_smiles = Chem.MolToSmiles(fw, isomericSmiles=True)
        except ValueError as e:
            print(e)
            scaf.append(['error'])
            generic_scaf.append(['error'])
        else:
            scaf.append(core_smiles)
            generic_scaf.append(fw_smiles)
    if verbose:
        print('Extracted', len(scaf), 'scaffolds in', time.time() - start, 'seconds.')
    return scaf, generic_scaf
def AddMurckoToFrame(frame, molCol='ROMol', MurckoCol='Murcko_SMILES', Generic=False):
    '''
    Adds a column with SMILES of Murcko scaffolds to a pandas DataFrame.

    frame     -- DataFrame to modify in place
    molCol    -- name of the column holding the molecules
    MurckoCol -- name of the column that receives the scaffold SMILES
    Generic   -- when true, the SMILES of the generic framework
                 (MakeScaffoldGeneric of the scaffold) is stored instead

    Returns None; the result is written to frame[MurckoCol].
    '''
    def _scaffold_smiles(mol):
        core = MurckoScaffold.GetScaffoldForMol(mol)
        if Generic:
            core = MurckoScaffold.MakeScaffoldGeneric(core)
        return MolToSmiles(core)

    # Map over the molecule column only: no per-row Series is built, and an
    # empty frame simply gets an empty column.
    frame[MurckoCol] = frame[molCol].map(_scaffold_smiles)
def getGenericScaffold(self, smile):
    """Return the non-isomeric SMILES of the generic Murcko scaffold of `smile`.

    An empty string is returned when `smile` cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return ''
    murcko = MurckoScaffold.GetScaffoldForMol(mol)
    generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
    return Chem.MolToSmiles(generic, isomericSmiles=False)
def get_scaffolds(compounds):
    """Annotate each compound record with its Murcko scaffold SMILES.

    Every record is read through its ``"canonical_smiles"`` key and receives
    two new keys: ``"scaffold"`` (Murcko scaffold SMILES) and
    ``"generic_scaffold"`` (SMILES of ``MakeScaffoldGeneric`` of that scaffold).
    The records are modified in place and `compounds` itself is returned.
    """
    for position, record in enumerate(compounds):
        parsed = Chem.MolFromSmiles(record["canonical_smiles"])
        murcko = MurckoScaffold.GetScaffoldForMol(parsed)
        target = compounds[position]
        target["scaffold"] = Chem.MolToSmiles(murcko)
        generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
        compounds[position]["generic_scaffold"] = Chem.MolToSmiles(generic)
    return compounds
def _getscaffold(mol, stype='Murcko'):
    """
    *Internal used only*

    Return the canonical, non-isomeric SMILES of the scaffold of `mol`.
    `stype` selects the Murcko scaffold ('Murcko') or its generic form
    produced by ``MakeScaffoldGeneric`` ('Carbon').
    """
    allowed = ['Murcko', 'Carbon']
    assert stype in allowed, 'scaffold type must be a member of "Murcko" or "Carbon"'
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    if stype != 'Murcko':
        scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold)
    return Chem.MolToSmiles(scaffold, isomericSmiles=False, canonical=True)
def _calculate_scaffold(self, smile):
    """Return the non-isomeric SMILES of the generic Murcko scaffold of `smile`.

    An empty string is returned when `smile` cannot be parsed or when the
    scaffold computation raises ``ValueError``.
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return ''
    try:
        murcko = MurckoScaffold.GetScaffoldForMol(mol)
        generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
        return Chem.MolToSmiles(generic, isomericSmiles=False)
    except ValueError:
        return ''
def calc_scaffold_similarity(s1: str, s2: str) -> float:
    """Return the Tanimoto similarity between the Murcko scaffolds of two SMILES.

    The scaffolds are compared through Morgan fingerprints of radius 3.
    ``-1.0`` is returned when either SMILES cannot be parsed or when any step
    of the computation raises.
    """
    parsed = [Chem.MolFromSmiles(s1), Chem.MolFromSmiles(s2)]
    if any(mol is None for mol in parsed):
        return -1.0
    try:
        scaffolds = [MurckoScaffold.GetScaffoldForMol(mol) for mol in parsed]
        first_fp, second_fp = [
            AllChem.GetMorganFingerprint(scaffold, 3) for scaffold in scaffolds
        ]
        return DataStructs.TanimotoSimilarity(first_fp, second_fp)
    except Exception:
        return -1.0
def _pipe_sim_murcko_fp(mol):
    """Return the fingerprint of the Murcko scaffold of `mol`, of the kind selected by `USE_FP`."""
    murcko_mol = MurckoScaffold.GetScaffoldForMol(mol)
    if USE_FP == "morgan":
        return Desc.rdMolDescriptors.GetMorganFingerprintAsBitVect(murcko_mol, 2)
    if USE_FP == "avalon":
        return pyAv.GetAvalonFP(murcko_mol, 1024)
    return FingerprintMols.FingerprintMol(murcko_mol)


def pipe_sim_filter(stream, query, cutoff=80, summary=None, comp_id="pipe_sim_filter"):
    """Filter for compounds that have a similarity greater or equal
    than `cutoff` (in percent) to the `query` Smiles.
    If the field `FP_b64` (e.g. pre-calculated) is present, this will be used,
    otherwise the fingerprint of the Murcko scaffold will be generated
    on-the-fly (much slower).

    Records without a ``"mol"`` key are skipped. Each yielded record gets a
    ``"Sim"`` field holding the similarity in percent, rounded to two decimals.
    When `summary` is given, ``summary[comp_id]`` is set to the running count
    of yielded records. If `query` cannot be parsed, an error is printed and
    the generator ends without yielding anything."""
    rec_counter = 0
    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(comp_id))
        return

    query_fp = _pipe_sim_murcko_fp(query_mol)

    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" in rec:
            # use the pre-defined fingerprint if it is present in the stream
            # SECURITY NOTE(review): pickle.loads executes arbitrary code on
            # crafted input; only feed this component trusted streams.
            mol_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))
        else:
            mol_fp = _pipe_sim_murcko_fp(rec["mol"])

        sim = DataStructs.FingerprintSimilarity(query_fp, mol_fp)
        if sim * 100 >= cutoff:
            rec_counter += 1
            rec["Sim"] = np.round(sim * 100, 2)

            if summary is not None:
                summary[comp_id] = rec_counter

            yield rec
def __init__(self, smi):
    """Parse `smi` and precompute scaffold-related attributes.

    The SMILES is parsed with ``Chem.MolFromSmiles``; the molecule is first
    made generic (``MakeScaffoldGeneric``) and the Murcko scaffold of that
    generic molecule is stored. Ring-system data is then derived through
    ``GetRingSystemsofscaf()`` and ``count_ring_systems()``.

    NOTE(review): no check is made that `smi` parses; presumably callers pass
    valid SMILES -- confirm.
    """
    self._smi = smi
    self._mol = Chem.MolFromSmiles(smi)
    # Order matters here: the molecule is made generic *before* the scaffold
    # is extracted (not the other way round).
    self._scaf = MurckoScaffold.GetScaffoldForMol(
        MurckoScaffold.MakeScaffoldGeneric(self._mol))
    self._scaf_atoms = self._scaf.GetAtoms()
    self._scaf_bonds = self._scaf.GetBonds()
    self._scaf_smi = Chem.MolToSmiles(self._scaf)
    # These two calls run before the attributes below are assigned.
    self._ring_system = self.GetRingSystemsofscaf()
    self._ring_system_count = self.count_ring_systems()
    # presumably bin boundaries used together with _chain_binning -- TODO confirm
    self._bin_values = [1, 2, 3, 4, 7]
    # Linkers: [direct bond between rings, linear chain between rings, branched chain between rings]
    self._linkers = [0, 0, 0]
    # One counter per entry of _bin_values, all starting at zero.
    self._chain_binning = [0, 0, 0, 0, 0]
def computeFramwork(df):
    """Add Murcko and carbon-skeleton scaffold SMILES columns to `df`.

    Reads the SMILES in ``df['can']`` and writes two new columns:
    ``'murckos'`` (Murcko scaffold SMILES) and ``'carbons'`` (SMILES of
    ``MakeScaffoldGeneric`` of that scaffold). `df` is modified in place and
    returned.
    """
    murcko_smiles = []
    carbon_smiles = []
    for smi in df['can']:
        parsed = Chem.MolFromSmiles(smi)
        murcko_mol = MurckoScaffold.GetScaffoldForMol(parsed)
        carbon_mol = MurckoScaffold.MakeScaffoldGeneric(murcko_mol)
        # Convert the Murcko scaffold and the carbon skeleton to SMILES
        murcko_text = Chem.MolToSmiles(murcko_mol)
        carbon_text = Chem.MolToSmiles(carbon_mol)
        murcko_smiles.append(murcko_text)
        carbon_smiles.append(carbon_text)
    df['murckos'] = murcko_smiles
    df['carbons'] = carbon_smiles
    return df
def get_hieriarchical_frags(mol_or_smi):
    """Hierarchically (recursively) split a molecule into fragments.
    Only-non-ring bonds are split and only fragments with at least one ring
    are considered.
    Takes a mol object or a Smiles string as input.
    Returns a list of fragments as Smiles, longest Smiles first; the Murcko
    scaffold of the input itself is included. An empty list is returned when
    the initial scaffold computation raises ValueError."""

    def _recursive_split(s, n=0):
        # `n` is the recursion depth; it is only passed along.
        m = Chem.MolFromSmiles(s)
        if m is None:
            return

        acyclic_bond_ids = [b.GetIdx() for b in m.GetBonds() if not b.IsInRing()]

        ring_pieces = []
        for bond_id in acyclic_bond_ids:
            cut = Chem.FragmentOnBonds(m, [bond_id], addDummies=False)
            try:
                pieces = Chem.GetMolFrags(cut, asMols=True)
            except ValueError:
                continue
            # verify the split occurred between two rings
            if len(pieces) != 2:
                continue
            if Chem.CalcNumRings(pieces[0]) > 0 and Chem.CalcNumRings(pieces[1]) > 0:
                ring_pieces.extend(pieces)

        for piece in ring_pieces:
            try:
                murcko = MurckoScaffold.MurckoScaffoldSmiles(mol=piece)
            except ValueError:
                continue
            if murcko in result:
                continue
            result[murcko] = True
            if "[CH]" in murcko:
                print(f"{murcko} ({Chem.MolToSmiles(piece)})")
            _recursive_split(murcko, n + 1)

    # Pick the keyword matching the input type; both paths share one handler.
    keyword = "smiles" if isinstance(mol_or_smi, str) else "mol"
    try:
        murcko = MurckoScaffold.MurckoScaffoldSmiles(**{keyword: mol_or_smi})
    except ValueError:
        return []

    result = {murcko: True}
    _recursive_split(murcko)
    return sorted(result, key=len, reverse=True)
def _recurse(scaf):
    """Recursively collect multi-ring sub-scaffolds of the scaffold SMILES `scaf`.

    Each non-ring bond is removed in turn. When the removal separates at least
    two ring-containing fragments, the Murcko scaffold SMILES of every
    fragment with more than one ring is appended to the enclosing
    ``scaf_list`` (if not already present) and processed recursively.
    Scaffolds with fewer than three rings, or SMILES that do not parse, are
    not split.
    """
    orig_mol = Chem.MolFromSmiles(scaf)
    if orig_mol is None:
        return
    if orig_mol.GetRingInfo().NumRings() < 3:
        return

    # Record the atom indices up front so every cut starts from a fresh copy.
    acyclic_bonds = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
                     for bond in orig_mol.GetBonds() if not bond.IsInRing()]

    for begin_idx, end_idx in acyclic_bonds:
        rwmol = Chem.RWMol(orig_mol)
        rwmol.RemoveBond(begin_idx, end_idx)
        frag_list = Chem.MolToSmiles(rwmol.GetMol()).split(".")

        # Keep exactly one entry per fragment so the ring counts stay aligned
        # with frag_list; short or unparsable fragments count as ring-free.
        rings_per_frag = []
        for frag in frag_list:
            num_rings = 0
            if len(frag) > 2:
                mol = Chem.MolFromSmiles(frag)
                if mol is not None:
                    num_rings = mol.GetRingInfo().NumRings()
            rings_per_frag.append(num_rings)

        # have we split between two rings?
        ring_split = sum(1 for num_rings in rings_per_frag if num_rings > 0)
        if ring_split < 2:
            continue

        for frag, num_rings in zip(frag_list, rings_per_frag):
            if num_rings > 1:
                murcko_frag = MurckoScaffold.MurckoScaffoldSmiles(frag)
                if murcko_frag not in scaf_list:
                    scaf_list.append(murcko_frag)
                    _recurse(murcko_frag)
def generate_molecule_image(path, limit=25):
    """Render Murcko-scaffold images for the first hits listed in a file.

    Each of the first `limit` lines of the file at `path` is expected to hold
    a SMILES and an identifier separated by whitespace. For every such hit the
    Murcko scaffold is drawn, the identifier is written in the top-left
    corner, and the image is saved as ``GUI/images/molecules/<SMILES>.png``.

    Lines with fewer than two fields and SMILES that cannot be parsed are
    skipped. Nothing happens (and ``None`` is returned) when `path` does not
    exist.
    """
    if not os.path.exists(path):
        return

    # Imported only once there is work to do.
    from rdkit.Chem.Scaffolds import MurckoScaffold
    from rdkit.Chem import MolFromSmiles
    from rdkit.Chem.Draw import MolToImage
    from PIL import ImageDraw

    # Read the hits file
    hits = []
    with open(path, 'r') as top_hits:
        for line_number, line in enumerate(top_hits):
            if line_number >= limit:
                break
            # split() also drops the trailing newline, which previously ended
            # up inside the identifier drawn on the image.
            fields = line.split()
            if len(fields) < 2:
                continue
            hits.append((fields[0], fields[1]))

    # Generate scaffold
    for smile, mid in hits:
        parsed = MolFromSmiles(smile)
        if parsed is None:
            continue
        mol = MurckoScaffold.GetScaffoldForMol(parsed)
        image = MolToImage(mol)

        # Add text to the image
        draw = ImageDraw.Draw(image)
        draw.text((5, 5), mid, fill="black", align="right")
        image.save("GUI/images/molecules/{}.png".format(smile))
def main(name, argv):
    """Write the scaffold of every sulfonyl fluoride listed in an input file.

    `argv` must hold the input and output paths, otherwise the usage text is
    printed. Each input line provides a SMILES and an identifier. For every
    molecule matching the sulfonyl fluoride pattern, the fluorine is swapped
    for a tetrazole ring, the Murcko scaffold of that product is taken, and
    the ring is swapped back to fluorine. If the scaffold no longer contains
    the sulfonyl tetrazole, ``S(=O)(=O)F`` is used. The output file receives
    tab-separated ``scaffold SMILES, input SMILES, identifier`` lines.
    """
    if len(argv) != 2:
        print_usage(name)
        return

    rxn = rdChemReactions.ReactionFromSmarts(
        '[S:1](=[O:2])(=[O:3])F>>[S:1](=[O:2])(=[O:3])n1nnnn1')
    reactents_smarts = rxn.GetReactants()
    back = rdChemReactions.ReactionFromSmarts('[S:1]n1nnnn1>>[S:1]F')
    back_smarts = back.GetReactants()

    with open(argv[0], 'r') as f:
        smiles = [line.split() for line in f]

    with open(argv[1], 'w') as f:
        for line in smiles:
            # Blank or single-field lines carry no usable record.
            if len(line) < 2:
                continue
            mol = Chem.MolFromSmiles(line[0])
            # The old check tested a list wrapping the molecule, which is
            # never None, so unparsable SMILES crashed on the next line.
            if mol is None:
                continue
            if not mol.HasSubstructMatch(reactents_smarts[0]):
                continue
            fake_ring = rxn.RunReactants((mol, ))[0][0]
            # Round-trip through SMILES to obtain a sanitized molecule.
            fake_ring = Chem.MolFromSmiles(Chem.MolToSmiles(fake_ring))
            core = MurckoScaffold.GetScaffoldForMol(fake_ring)
            if core.HasSubstructMatch(back_smarts[0]):
                scaffold = back.RunReactants((core, ))[0][0]
            else:
                scaffold = Chem.MolFromSmiles('S(=O)(=O)F')
            f.write('%s\t%s\t%s\n' % (Chem.MolToSmiles(scaffold), line[0], line[1]))
def _recursive_split(s, n=0):
    """Split the SMILES `s` at non-ring bonds and record new ring scaffolds.

    Every cut that yields exactly two fragments, both containing a ring,
    contributes its fragments. The Murcko scaffold SMILES of each fragment is
    stored in the enclosing ``result`` mapping and, if new, split further.
    `n` is the recursion depth and is only passed along.
    """
    m = Chem.MolFromSmiles(s)
    if m is None:
        return

    acyclic_bond_ids = [b.GetIdx() for b in m.GetBonds() if not b.IsInRing()]

    ring_pieces = []
    for bond_id in acyclic_bond_ids:
        cut = Chem.FragmentOnBonds(m, [bond_id], addDummies=False)
        try:
            pieces = Chem.GetMolFrags(cut, asMols=True)
        except ValueError:
            continue
        # verify the split occurred between two rings
        if len(pieces) != 2:
            continue
        if Chem.CalcNumRings(pieces[0]) > 0 and Chem.CalcNumRings(pieces[1]) > 0:
            ring_pieces.extend(pieces)

    for piece in ring_pieces:
        try:
            murcko = MurckoScaffold.MurckoScaffoldSmiles(mol=piece)
        except ValueError:
            continue
        if murcko in result:
            continue
        result[murcko] = True
        if "[CH]" in murcko:
            print(f"{murcko} ({Chem.MolToSmiles(piece)})")
        _recursive_split(murcko, n + 1)
def get_murcko_scaffold(smiles_dict):
    """Group molecule identifiers by the SMILES of their Murcko scaffold.

    Reads a smile dictionary in this format
    'CHEMBL189352': 'COc1ccc2c(cnn2n1)c3ccnc(Nc4ccc(cc4)C#N)n3'
    and returns a dictionary that maps each scaffold SMILES to the list of
    identifiers sharing that scaffold.

    Molecules for which the scaffold cannot be computed are reported on
    stdout and left out of the result.

    :param smiles_dict: dictionary mapping identifier -> SMILES
    :return: dictionary mapping scaffold SMILES -> list of identifiers
    """
    scaffolds = {}
    for chembl_id, smiles in smiles_dict.items():
        try:
            mol = Chem.MolFromSmiles(smiles)
            core = MurckoScaffold.GetScaffoldForMol(mol)
            scaffold = Chem.MolToSmiles(core)
        except Exception:
            print("rdkit could not read {}".format(chembl_id))
            # Skip the entry: falling through would either hit an unbound
            # `scaffold` or file this id under the previous molecule's scaffold.
            continue
        scaffolds.setdefault(scaffold, []).append(chembl_id)
    return scaffolds
def SMILES_2_ECFP(smiles, radius=3, bit_len=4096, index=None):
    """
    Transform a list of SMILES strings into Morgan (ECFP-like) bit vectors of
    the Murcko scaffold of each molecule.

    Parameters
    ----------
    smiles: List of SMILES strings to transform
    radius: Radius passed to the Morgan fingerprint (default 3)
    bit_len: Number of bits per fingerprint (default 4096)
    index: Optional index for the returned frame; defaults to `smiles`

    Returns
    -------
    pandas.DataFrame with one row of `bit_len` values per SMILES. SMILES for
    which the fingerprint cannot be computed are printed and get an all-zero
    row.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smile)
            mol = MurckoScaffold.GetScaffoldForMol(mol)
            arr = np.zeros((1, ))
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(smile)
            fps[i, :] = 0
    return pd.DataFrame(fps, index=(smiles if index is None else index))
def SmilesToFrameInChIKey(SMILES):
    """Return the InChIKey of the generic Murcko framework of `SMILES`.

    The molecule is obtained through ``GetMol``. Its Murcko scaffold is made
    generic when possible; if that step fails, the plain Murcko scaffold is
    used instead. ``None`` is returned when no molecule (or no scaffold
    object) is available.
    """
    mol = GetMol(SMILES)
    if not mol:
        return None
    frame = MurckoScaffold.GetScaffoldForMol(mol)
    try:
        frame = MurckoScaffold.MakeScaffoldGeneric(frame)
    except Exception:
        # Deliberate best effort: keep the non-generic scaffold. Narrowed from
        # a bare except so interpreter-exit signals still propagate.
        pass
    return Chem.MolToInchiKey(frame) if frame else None
def get_annotated_murcko_scaffold(mol, scaffold=None, as_mol=True):
    """
    Return an annotated murcko scaffold where side chains are replaced
    with a dummy atom ('*').

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    scaffold : rdkit.Chem.rdchem.Mol, optional
        Template scaffold to use when one has already been calculated for
        `mol`. When omitted, it is computed from `mol`. The default is None.
    as_mol : bool, optional
        If True return the rdkit.Chem.rdchem.Mol object (possibly None),
        else return a SMILES string, which is '' when no annotated scaffold
        was produced. The default is True.

    Returns
    -------
    {str, rdkit.Chem.rdchem.Mol}
        Annotated Murcko scaffold.
    """
    template = scaffold if scaffold else MurckoScaffold.GetScaffoldForMol(mol)
    annotated = rdmolops.ReplaceSidechains(mol, template)
    if not as_mol:
        return '' if annotated is None else MolToSmiles(annotated)
    return annotated
def main(name, argv):
    """Write filtered RECAP fragments and their scaffolds of one molecule.

    `argv` must hold the input and output paths, otherwise the usage text is
    printed. The first whitespace-separated token of the first input line is
    read as SMILES. Every RECAP descendant plus the molecule itself is
    considered, with attachment points ``[*]`` replaced by ``[H]``. A candidate
    with more than 7 and at most 25 heavy atoms is kept when it has at most 5
    rotatable bonds; its Murcko scaffold is kept as well when the scaffold has
    at most 5 rotatable bonds and more than 7 heavy atoms. The unique SMILES
    are written, sorted, as ``SMILES<TAB>running number`` lines.
    """
    if len(argv) != 2:
        print_usage(name)
        return

    with open(argv[0], 'r') as f:
        smile = f.readline().split()[0]

    mol = Chem.MolFromSmiles(smile)
    hierarch = Recap.RecapDecompose(mol)

    children = []
    # list(...) is required on Python 3, where dict.keys() is a view that
    # cannot be concatenated with a list.
    for child in list(hierarch.GetAllChildren().keys()) + [smile]:
        new_smiles = child.replace('[*]', '[H]')
        new = Chem.MolFromSmiles(new_smiles)
        if new is None:
            continue
        new_size = new.GetNumHeavyAtoms()
        if not 7 < new_size <= 25:
            continue
        if rdMolDescriptors.CalcNumRotatableBonds(new) <= 5:
            children.append(Chem.MolToSmiles(new, isomericSmiles=True))
        core_smile = MurckoScaffold.MurckoScaffoldSmilesFromSmiles(
            new_smiles, includeChirality=True)
        core = Chem.MolFromSmiles(core_smile)
        if core is None:
            continue
        if (rdMolDescriptors.CalcNumRotatableBonds(core) <= 5
                and core.GetNumHeavyAtoms() > 7):
            children.append(core_smile)

    with open(argv[1], 'w') as f:
        i = 1
        # Sorted so that repeated runs produce the same numbering.
        for m in sorted(set(children)):
            if len(m) > 0:
                f.write(m + '\t' + str(i) + '\n')
                i += 1
def get_scaffold(self, smiles):
    """Return the Murcko scaffold SMILES of `smiles`.

    Chirality is kept or dropped according to ``self.include_chirality``.
    """
    from rdkit.Chem.Scaffolds import MurckoScaffold

    parsed = Chem.MolFromSmiles(smiles)
    keep_chirality = self.include_chirality
    return MurckoScaffold.MurckoScaffoldSmiles(mol=parsed,
                                               includeChirality=keep_chirality)
def get_murcko_scaffold(mol, generic=False):
    """Get the murcko scaffold for an input molecule

    Parameters
    ----------
    mol (Chem.Mol): an rdkit molecule
    generic (bool): if True the scaffold is additionally passed through
        MakeScaffoldGeneric (CSK)

    Returns
    -------
    murcko (Chem.Mol): an rdkit molecule (scaffold)
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return MurckoScaffold.MakeScaffoldGeneric(scaffold) if generic else scaffold
def findCluster(self, smiles):
    """Assign `smiles` to a scaffold cluster.

    The cluster key is the non-isomeric SMILES of the Murcko scaffold, and the
    scaffold's atom-pair fingerprint is compared with the fingerprints from
    ``self.getFingerprints()`` (a mapping of cluster key -> fingerprint).

    Returns:
        A tuple ``(cluster, fingerprint, is_new)``:
        * ``("", "", False)`` when the SMILES cannot be parsed or no scaffold
          can be computed;
        * the scaffold's own key with ``is_new=False`` when it is already
          known;
        * the key of the most similar known cluster with ``is_new=False`` when
          its Tanimoto similarity is at least ``self.minsimilarity``;
        * otherwise the scaffold's own key with ``is_new=True``.
    """
    no_cluster = ("", "", False)

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return no_cluster
    try:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return no_cluster
    if not scaffold:
        return no_cluster
    cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)

    fp = Pairs.GetAtomPairFingerprint(scaffold)  # Change to Tanimoto?

    # Fetch the mapping once so keys and values are guaranteed to line up.
    known = self.getFingerprints()
    if cluster in known:
        return cluster, fp, False

    keys = list(known.keys())
    fps = list(known.values())
    sims = DataStructs.BulkTanimotoSimilarity(fp, fps)
    if len(sims) == 0:
        return cluster, fp, True

    closest = np.argmax(sims)
    if sims[closest] >= self.minsimilarity:
        return keys[closest], fp, False
    return cluster, fp, True
def get_fragments(insmiles):
    """Get core and sidechains using Murcko fragmentation method

    Args:
        insmiles (str): SMILES for molecule

    Returns:
        core: scaffold SMILES ("" when the molecule has no scaffold)
        side: list of side chain SMILES; ``[insmiles]`` when there is no core
    """
    mol_prev = Chem.MolFromSmiles(insmiles)

    ### get core using Murcko fragmentation ###
    core = Murcko.MurckoScaffoldSmilesFromSmiles(insmiles)
    if core == "":
        return core, [insmiles]

    ### get sidechains ###
    mol_core = Chem.MolFromSmiles(core)
    remainder = Chem.rdmolops.DeleteSubstructs(mol_prev, mol_core)
    return core, Chem.MolToSmiles(remainder).split(".")
def extract_side_chains(mol, remove_duplicates=False, mark='[*]'):
    """
    Extract side chains from a smiles string. Core is handled as Murcko scaffold.

    :param mol: {str} smiles string of a molecule.
    :param remove_duplicates: {bool} Keep or remove duplicates.
    :param mark: character to mark attachment points.
    :return: smiles strings of side chains in a list, numbered attachment
        points ``[0*]`` .. ``[19*]`` replaced by `mark`. An empty list is
        returned when the side chains cannot be computed. With
        `remove_duplicates` the order of the list is not defined.
    """
    m1 = MolFromSmiles(mol)
    try:
        core = MurckoScaffold.GetScaffoldForMol(m1)
        side_chain = ReplaceCore(m1, core)
        # isomericSmiles adds a number to the dummy atoms.
        smi = MolToSmiles(side_chain, isomericSmiles=True)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return []

    for i in range(20):
        smi = smi.replace('[' + str(i) + '*]', mark)

    pieces = smi.split('.')
    if remove_duplicates:
        return list(set(pieces))
    return pieces
def ECFP_from_SMILES(cls, smiles, radius=3, bit_len=4096, scaffold=0, index=None):
    """Compute Morgan (ECFP-like) bit-vector fingerprints for SMILES strings.

    Args:
        smiles: Sized iterable of SMILES strings.
        radius: Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
        bit_len: Number of bits per fingerprint (passed as ``nBits``).
        scaffold: ``0`` fingerprints the parsed molecule, ``1`` the result of
            ``MurckoScaffold.GetScaffoldForMol``, ``2`` the result of
            ``MurckoScaffold.MakeScaffoldGeneric`` applied to the molecule.
        index: Optional index of the returned frame; defaults to `smiles`.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(smiles), bit_len)``. SMILES for
        which the fingerprint cannot be computed are printed and get an
        all-zero row.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smile)
            if scaffold == 1:
                mol = MurckoScaffold.GetScaffoldForMol(mol)
            elif scaffold == 2:
                mol = MurckoScaffold.MakeScaffoldGeneric(mol)
            arr = np.zeros((1,))
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(smile)
            fps[i, :] = 0
    return pd.DataFrame(fps, index=(smiles if index is None else index))
def GetMurckoScaffold(mol):
    """Return the result of ``MurckoScaffold.MakeScaffoldGeneric`` for `mol`.

    mol: rdkit RWMol or Mol
    returns: rdkit mol object

    NOTE(review): despite the function name, ``GetScaffoldForMol`` is never
    called here -- the molecule itself is made generic. Confirm this is the
    intended behaviour.
    """
    from rdkit.Chem.Scaffolds import MurckoScaffold

    return MurckoScaffold.MakeScaffoldGeneric(mol)