def GetScaffold(mol, generic_framework=False):
    """
    #################################################################
    Compute the Murcko scaffold of a molecule and return it as SMILES.

    Usage:

        result = GetScaffold(mol)

    Input: mol is a molecule object.

           generic_framework is a boolean value. If it equals True the
           scaffold is converted with MakeScaffoldGeneric before being
           written out.

    Output: result is the SMILES string of the scaffold (or of its
            generic framework).
    #################################################################
    """
    skeleton = MurckoScaffold.GetScaffoldForMol(mol)
    # The flag is compared with ``== True`` on purpose: only True (or a value
    # equal to it, such as 1) selects the generic framework.
    if generic_framework == True:
        skeleton = MurckoScaffold.MakeScaffoldGeneric(skeleton)
    return Chem.MolToSmiles(skeleton)
def __call__(self, smiles, radius=3, bit_len=4096, scaffold=0):
    """Compute Morgan bit-vector fingerprints for a collection of SMILES.

    Args:
        smiles: sized, iterable collection of SMILES strings.
        radius: radius passed to the Morgan fingerprint.
        bit_len: number of bits of each fingerprint (number of columns).
        scaffold: 0 fingerprints the parsed molecule, 1 fingerprints the
            result of ``GetScaffoldForMol``, 2 fingerprints the result of
            ``MakeScaffoldGeneric`` applied to the parsed molecule.

    Returns:
        pandas.DataFrame with one row per input SMILES and ``bit_len``
        columns. A row is left as zeros when its fingerprint could not be
        computed; the exception is appended to ``self.builder.errors``.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1, ))
        try:
            # Check the parse result BEFORE the scaffold calls: otherwise an
            # unparsable SMILES combined with scaffold=1/2 is reported as an
            # opaque RDKit argument error instead of the message below.
            if not mol:
                raise Exception(
                    f'Failed to calculate Morgan fingerprint (creating RDKit instance from smiles failed: {smile})'
                )
            if scaffold == 1:
                mol = MurckoScaffold.GetScaffoldForMol(mol)
            elif scaffold == 2:
                mol = MurckoScaffold.MakeScaffoldGeneric(mol)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception as exp:
            # TODO: use a more specific exception related to descriptor errors
            self.builder.errors.append(exp)
            fps[i, :] = 0
    return pd.DataFrame(fps)
def extract_murcko_scaffolds(mols, verbose=True):
    """
    Extract Bemis-Murcko scaffolds and their generic frameworks.

    ``None`` entries in ``mols`` are skipped and produce no output entry.

    :param mols: iterable of molecules in rdkit mol format.
    :param verbose: when true, print how many scaffolds were extracted and
        the elapsed time.
    :return: tuple ``(scaf, generic_scaf)`` of two equally long lists holding
        the scaffold SMILES and the generic framework SMILES. When a
        ``ValueError`` is raised for a molecule, both lists receive the
        placeholder ``['error']`` for that molecule.
    """
    scaf = []
    generic_scaf = []
    start = time.time()
    for mol in mols:
        if mol is None:
            continue
        try:
            core = MurckoScaffold.GetScaffoldForMol(mol)
            fw = MurckoScaffold.MakeScaffoldGeneric(core)
            # Build both SMILES before touching the result lists so that a
            # failure on the second one cannot leave the lists misaligned.
            core_smiles = Chem.MolToSmiles(core, isomericSmiles=True)
            fw_smiles = Chem.MolToSmiles(fw, isomericSmiles=True)
        except ValueError as e:
            print(e)
            scaf.append(['error'])
            generic_scaf.append(['error'])
        else:
            scaf.append(core_smiles)
            generic_scaf.append(fw_smiles)
    if verbose:
        print('Extracted', len(scaf), 'scaffolds in', time.time() - start, 'seconds.')
    return scaf, generic_scaf
def get_scaffolds(compounds):
    """Annotate every compound record with its scaffold SMILES.

    Each record is read through its "canonical_smiles" key and receives two
    new keys: "scaffold" (Murcko scaffold SMILES) and "generic_scaffold"
    (SMILES of the generic form of that scaffold). The records are modified
    in place and the same ``compounds`` object is returned.
    """
    for position, record in enumerate(compounds):
        parsed = Chem.MolFromSmiles(record["canonical_smiles"])
        murcko = MurckoScaffold.GetScaffoldForMol(parsed)
        generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
        compounds[position]["scaffold"] = Chem.MolToSmiles(murcko)
        compounds[position]["generic_scaffold"] = Chem.MolToSmiles(generic)
    return compounds
def getGenericScaffold(self, smile):
    """Return the generic Murcko scaffold of ``smile`` as non-isomeric SMILES.

    An empty string is returned when ``Chem.MolFromSmiles`` yields a falsy
    result for the input.
    """
    parsed = Chem.MolFromSmiles(smile)
    if not parsed:
        return ''
    murcko = MurckoScaffold.GetScaffoldForMol(parsed)
    generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
    return Chem.MolToSmiles(generic, isomericSmiles=False)
def GetMurckoScaffold(mol):
    """Return ``MakeScaffoldGeneric(mol)`` as an RDKit molecule object.

    mol: rdkit RWMol or Mol.

    NOTE(review): despite the function name, ``GetScaffoldForMol`` is not
    called here; the generic transformation is applied to the molecule as
    given. Confirm that this is what callers expect.
    """
    # Imported locally, as in the original code.
    from rdkit.Chem.Scaffolds import MurckoScaffold as murcko_module
    return murcko_module.MakeScaffoldGeneric(mol)
def AddMurckoToFrame(frame, molCol = 'ROMol', MurckoCol = 'Murcko_SMILES', Generic = False):
    '''
    Add a column holding the SMILES of each row's Murcko scaffold to a
    pandas DataFrame (modified in place, nothing is returned).

    frame: DataFrame whose ``molCol`` column holds the molecules.
    MurckoCol: name of the column that is written.
    Generic: when true, the SMILES of the generic framework is stored
    instead of the plain scaffold.
    '''
    def _row_to_smiles(row):
        # Scaffold of the molecule stored in this row, optionally generic.
        scaffold = MurckoScaffold.GetScaffoldForMol(row[molCol])
        if Generic:
            scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold)
        return MolToSmiles(scaffold)

    frame[MurckoCol] = frame.apply(_row_to_smiles, axis=1)
def _getscaffold(mol, stype='Murcko'):
    """
    *Internal used only*

    Return the scaffold of ``mol`` as canonical, non-isomeric SMILES.
    ``stype`` selects the plain Murcko scaffold ('Murcko') or its generic
    form ('Carbon'); any other value fails the assertion.
    """
    assert stype in ('Murcko', 'Carbon'), 'scaffold type must be a member of "Murcko" or "Carbon"'
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    if stype != 'Murcko':
        scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold)
    return Chem.MolToSmiles(scaffold, isomericSmiles=False, canonical=True)
def _calculate_scaffold(self, smile):
    """Return the generic Murcko scaffold of ``smile`` as non-isomeric SMILES.

    An empty string is returned when the SMILES cannot be parsed or when the
    scaffold computation raises ``ValueError``.
    """
    parsed = Chem.MolFromSmiles(smile)
    if not parsed:
        return ''
    try:
        murcko = MurckoScaffold.GetScaffoldForMol(parsed)
        generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
        return Chem.MolToSmiles(generic, isomericSmiles=False)
    except ValueError:
        return ''
def computeFramwork(df):
    """Add Murcko and carbon (generic) framework SMILES columns to ``df``.

    The SMILES are read from the 'can' column. The results are written to
    the new columns 'murckos' and 'carbons' and the same frame is returned.
    """
    murcko_smiles = []
    carbon_smiles = []
    for smiles in df['can']:
        parsed = Chem.MolFromSmiles(smiles)
        murcko_mol = MurckoScaffold.GetScaffoldForMol(parsed)
        carbon_mol = MurckoScaffold.MakeScaffoldGeneric(murcko_mol)
        # Convert the Murcko framework and the carbon framework to SMILES.
        murcko_smiles.append(Chem.MolToSmiles(murcko_mol))
        carbon_smiles.append(Chem.MolToSmiles(carbon_mol))
    df['murckos'] = murcko_smiles
    df['carbons'] = carbon_smiles
    return df
def __init__(self, smi):
    """Parse ``smi`` and precompute the scaffold data kept on the instance.

    The molecule is first made generic and the Murcko scaffold is then taken
    from that generic molecule. The scaffold's atoms, bonds and SMILES are
    cached, followed by the ring-system information obtained from
    ``GetRingSystemsofscaf`` and ``count_ring_systems``.
    """
    self._smi = smi
    self._mol = Chem.MolFromSmiles(smi)

    # Order matters here: generic conversion first, scaffold extraction second.
    generic_mol = MurckoScaffold.MakeScaffoldGeneric(self._mol)
    self._scaf = MurckoScaffold.GetScaffoldForMol(generic_mol)

    self._scaf_atoms = self._scaf.GetAtoms()
    self._scaf_bonds = self._scaf.GetBonds()
    self._scaf_smi = Chem.MolToSmiles(self._scaf)

    self._ring_system = self.GetRingSystemsofscaf()
    self._ring_system_count = self.count_ring_systems()

    self._bin_values = [1, 2, 3, 4, 7]
    # Linkers: [direct bond between rings, linear chain between rings,
    #           branched chain between rings]
    self._linkers = [0] * 3
    self._chain_binning = [0] * 5
def get_murcko_scaffold(mol, generic=False):
    """Return the Murcko scaffold of a molecule.

    Parameters
    ----------
    mol (Chem.Mol): an rdkit molecule
    generic (bool): when true, the scaffold is converted to its generic
        form (CSK) before being returned

    Returns
    -------
    murcko (Chem.Mol): an rdkit molecule (scaffold)
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    if not generic:
        return scaffold
    return MurckoScaffold.MakeScaffoldGeneric(scaffold)
def SmilesToFrameInChIKey(SMILES):
    """Return the InChIKey of the Murcko framework of a SMILES string.

    The molecule is obtained through ``GetMol``. Its Murcko scaffold is
    converted to the generic framework when possible; if that conversion
    raises, the non-generic scaffold is used instead (best effort).

    Returns ``None`` when ``GetMol`` yields a falsy result or when the
    resulting framework is falsy.
    """
    mol = GetMol(SMILES)
    if not mol:
        return None
    frame = MurckoScaffold.GetScaffoldForMol(mol)
    try:
        frame = MurckoScaffold.MakeScaffoldGeneric(frame)
    except Exception:
        # Best effort: fall back to the non-generic scaffold. ``Exception``
        # (not a bare ``except``) so that KeyboardInterrupt and SystemExit
        # are not swallowed.
        pass
    return Chem.MolToInchiKey(frame) if frame else None
def ECFP_from_SMILES(cls, smiles, radius=3, bit_len=4096, scaffold=0, index=None):
    """Compute Morgan bit-vector fingerprints for a collection of SMILES.

    Args:
        smiles: sized, iterable collection of SMILES strings.
        radius: radius passed to the Morgan fingerprint.
        bit_len: number of bits of each fingerprint (number of columns).
        scaffold: 0 fingerprints the parsed molecule, 1 fingerprints the
            result of ``GetScaffoldForMol``, 2 fingerprints the result of
            ``MakeScaffoldGeneric`` applied to the parsed molecule.
        index: row index of the returned frame; ``smiles`` is used when
            ``None``.

    Returns:
        pandas.DataFrame with one row per SMILES. A row is left as zeros
        (and the SMILES is printed) when its fingerprint cannot be computed.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1,))
        try:
            if scaffold == 1:
                mol = MurckoScaffold.GetScaffoldForMol(mol)
            elif scaffold == 2:
                mol = MurckoScaffold.MakeScaffoldGeneric(mol)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still stop a long run.
            print(smile)
            fps[i, :] = 0
    return pd.DataFrame(fps, index=(smiles if index is None else index))
def scaffold2smiles(mol, generic=True, return_smiles = True):
    '''
    Return the Murcko scaffold of a molecule, as SMILES or as a molecule.

    mol: molecule to process.
    generic: when true, the molecule is first made generic and the Murcko
        scaffold is taken from that generic molecule; errors raised in this
        branch propagate to the caller. When false, the plain Murcko
        scaffold is computed and any failure yields '' (SMILES) and
        ``np.nan`` (molecule).
    return_smiles: when true the SMILES string is returned, otherwise the
        scaffold object itself.
    '''
    if generic:
        # Makes a Murcko scaffold generic (all atoms -> carbon and all bonds -> single)
        scff = MurckoScaffold.MakeScaffoldGeneric(mol)
        scff = MurckoScaffold.GetScaffoldForMol(scff)
        scff_smiles = MolToSmiles(scff)
    else:
        # Return a smiles scaffold
        try:
            scff = MurckoScaffold.GetScaffoldForMol(mol)
            scff_smiles = MolToSmiles(scff)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            scff_smiles = ''
            scff = np.nan
    return scff_smiles if return_smiles else scff
def get_murcko_scaffold(mol, generic=False):
    """Compute the Murcko scaffold of the input molecule.

    Parameters
    ----------
    mol : rdkit.rdchem.Chem.Mol
    generic : bool
        If True the scaffold is made generic (CSK) before it is returned.

    Returns
    -------
    murcko : rdkit.Chem.rdchem.Mol
        Murcko scaffold.
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    return MurckoScaffold.MakeScaffoldGeneric(scaffold) if generic else scaffold
def CalculateMaxSizeSystemRing(mol):
    """
    Number of atoms involved in the biggest system ring
    ---> maxring

    The generic Murcko framework of the molecule is built, the bonds between
    atoms shared by two rings are removed, and the size of the largest ring
    of the re-parsed result is returned (0 when the framework has no ring).

    :param mol: molecular
    :type mol: rdkit.Chem.rdchem.Mol
    :return: number of atoms involved in the biggest system ring
    :rtype: int
    """
    # 0. Build the generic framework of the scaffold.
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    framework = MurckoScaffold.MakeScaffoldGeneric(scaffold)

    # 1. Collect the atom indices of every ring.
    ring_atom_sets = [set(ring) for ring in framework.GetRingInfo().AtomRings()]
    if not ring_atom_sets:
        return 0

    # 2. For every pair of rings sharing two or more atoms, remove the bonds
    #    between those shared atoms.
    editable = Chem.RWMol(framework)
    for first, second in combinations(range(len(ring_atom_sets)), 2):
        shared = list(ring_atom_sets[first] & ring_atom_sets[second])
        if len(shared) < 2:
            continue
        for atom_pair in combinations(shared, 2):
            editable.RemoveBond(*atom_pair)

    # 3. Re-parse through SMILES and take the largest remaining ring.
    reparsed = Chem.MolFromSmiles(Chem.MolToSmiles(editable))
    ring_sizes = [len(ring) for ring in reparsed.GetRingInfo().BondRings()]
    return max(ring_sizes + [0])
def extract_descriptors(self, individual):
    """
    Return the descriptor(s) extracted from the given individual, as selected
    by ``self.descriptor_key``:

    * "gen_scaffolds": one-element list with the SMILES of the generic scaffold
    * "ifg": distinct third items of the identified functional groups
    * "atoms": distinct atom types of the individual
    * "shg_1": result of ``extract_shingles(individual, 1)`` as a list
    * "checkmol": distinct values of ``extract_checkmol(individual)``

    :param individual: individual to describe
    :return: list of descriptors, or None when the key is none of the above
    """
    key = self.descriptor_key

    if key == "gen_scaffolds":
        parsed = MolFromSmiles(individual.to_smiles())
        return [MolToSmiles(MurckoScaffold.MakeScaffoldGeneric(parsed))]

    if key == "ifg":
        groups = ifg.identify_functional_groups(
            MolFromSmiles(individual.to_smiles()))
        return list({group[2] for group in groups})

    if key == "atoms":
        return list(set(individual.get_atom_types()))

    if key == "shg_1":
        return list(extract_shingles(individual, 1))

    if key == "checkmol":
        return list(set(extract_checkmol(individual)))

    return None
def func(x):
    """Return the generic Murcko scaffold SMILES of the molecule in ``x[molCol]``.

    ``molCol`` is taken from the enclosing scope.
    """
    murcko = MurckoScaffold.GetScaffoldForMol(x[molCol])
    generic = MurckoScaffold.MakeScaffoldGeneric(murcko)
    return Chem.MolToSmiles(generic)
del df_mesfp # remove the CIDs of the top x compounds... TPec = list(TS_ec_df.index) TPhts = list(TS_hts_df.index) TPmes = list(TS_mes_df.index) # caluculate smiles and Topological scaffold for each compound # analysing topological scaffolds... cmpd_lists = {'ecfp4':TPec, 'htsfp':TPhts, 'mesfp':TPmes} Generic_sets_dict = {} for FP_name, cmpds in cmpd_lists.items(): gen_scaf_set = set() for cid in cmpds: if str(cid) in cid2smi: g_scaf = Chem.MolToSmiles(ms.MakeScaffoldGeneric(Chem.MolFromSmiles(cid2smi[str(cid)]))) gen_scaf_set.add(g_scaf) else: print('NA???, thats not meant to happen....') continue print('{}\t {} / {}'.format(FP_name,len(gen_scaf_set),numba)) Generic_sets_dict[FP_name] = gen_scaf_set ec_scafs = Generic_sets_dict['ecfp4'] hts_scafs = Generic_sets_dict['htsfp'] mes_scafs = Generic_sets_dict['mesfp'] plt.figure(i, figsize=(6,5)) plt.title('AID:'+assay, size=15)
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffold with enforceable group that needs to appear
    in the core, forcing to keep the full side chain if required.

    The molecules are first grouped by their generic Murcko scaffold. For each
    group the maximum common substructure (MCS) is computed, an R-group
    decomposition is run against it, and every successfully decomposed
    molecule is trimmed to a scaffold with `trim_side_chain`.

    NOTE(hadim): consider parallelize this (if possible).

    Args:
        mols: List of all molecules
        enforce_subs: List of substructure to enforce on the scaffold.
        n_atom_cuttoff: Minimum number of atom a core should have.
        additional_templates: Additional template to use to generate scaffolds.
        ignore_non_ring: Whether to ignore atom no in murcko ring system, even if they are in the framework.
        mcs_params: Arguments of MCS algorithm.

    Returns:
        scaffolds: set
            All found scaffolds in the molecules as valid smiles
        scaffold_infos: dict of dict
            Infos on the scaffold mapping, ignoring any side chain that had to be enforced.
            Key corresponds to generic scaffold smiles
            Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
            Values at ['mols'] corresponds to list of molecules matching the scaffold
        scaffold_to_group: dict of list
            Map between each generic scaffold and the R-groups decomposition row
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by they generic Murcko scaffold, allowing
    # side chain that contains cycle (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set()

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except Exception:
            # Best effort: keep the non-generic scaffold. ``Exception`` rather
            # than a bare ``except`` so KeyboardInterrupt is not swallowed.
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(scf)
            atms = [a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()]
            # Remove from the highest index down so that the remaining
            # indices stay valid while atoms are deleted.
            atms.sort(reverse=True)
            for a in atms:
                rw_scf.RemoveAtom(a)
            # NOTE(review): with asMols=False the group keys produced here are
            # fragment atom-index tuples, not SMILES as in the other branch --
            # confirm this is intended.
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add templates mols if exists:
        for tmp in additional_templates:
            tmp = dm.to_mol(tmp)
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(tmp)
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for scf in scfs:
            if scf2infos[scf].get("mols"):
                scf2infos[scf]["mols"].append(m)
            else:
                scf2infos[scf]["mols"] = [m]

    for scf in scf2infos:
        # cheat by adding murcko as last mol always
        popout = False
        mols = scf2infos[scf]["mols"]
        if len(mols) < 2:
            mols = mols + [MurckoScaffold.GetScaffoldForMol(mols[0])]
            popout = True

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        # Clusters whose common core is too small are skipped entirely.
        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        scf2infos[scf]["smarts"] = dm.to_smarts(mcsM)
        if popout:
            # Drop the scaffold that was appended only to make FindMCS work.
            mols = mols[:-1]

        core_groups = []
        # generate rgroups based on the mcs core
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(mols):
                analog.RemoveAllConformers()
                res = rg.Add(analog)
                if not (res < 0):
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            pass

        mols = [mols[i] for i in success_mols]
        scf2groups[scf] = core_groups
        for mol, gp in zip(mols, core_groups):
            core = gp["Core"]
            # Keep only the R-groups attached to non-ring atoms of the core.
            acceptable_groups = [
                a.GetAtomMapNum()
                for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]
            rgroups = [gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp.keys()]
            if enforce_subs:
                # R-groups matching an enforced substructure are excluded from
                # the list handed to trim_side_chain.
                rgroups = [
                    rgp
                    for rgp in rgroups
                    if not any(len(rgp.GetSubstructMatch(frag)) > 0 for frag in enforce_subs)
                ]
            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param), rgroups
                )
            except Exception:
                # Best effort: skip molecules that cannot be trimmed.
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups
def get_scaffolds(self, scaffolding_method=ScaffoldingMethod.MurckoScaffold):
    """Compute deemed scaffolds for a given compound.

    The scaffolds found are also recorded in ``self._scaffolds`` as
    ``SubstructureMapping`` entries.

    Args:
        scaffolding_method (ScaffoldingMethod, optional):
            Defaults to MurckoScaffold. Scaffolding method to use

    Returns:
        list[rdkit.Chem.rdchem.Mol]: Scaffolds found in the component.

    Raises:
        CCDUtilsError: when the computation raises ``RuntimeError`` or
            ``ValueError``; the original exception is attached as the cause.
    """
    try:
        scaffolds = []

        if scaffolding_method == ScaffoldingMethod.MurckoScaffold:
            scaffolds = [(MurckoScaffold.GetScaffoldForMol(self.mol_no_h))]

        elif scaffolding_method == ScaffoldingMethod.MurckoGeneric:
            scaffolds = [(MurckoScaffold.MakeScaffoldGeneric(self.mol_no_h))]

        elif scaffolding_method == ScaffoldingMethod.Brics:
            scaffolds = BRICS.BRICSDecompose(self.mol_no_h)
            # replace dummy atoms with H's to get matches
            # https://sourceforge.net/p/rdkit/mailman/message/35261974/
            brics_smiles = [re.sub(r"(\[[0-9]*\*\])", "[H]", i) for i in scaffolds]
            brics_mols = [rdkit.Chem.MolFromSmiles(x) for x in brics_smiles]

            # NOTE(review): the return value of RemoveHs is discarded here; if
            # RemoveHs returns a new molecule rather than editing in place,
            # this loop has no effect -- confirm against the RDKit docs.
            for mol in brics_mols:
                rdkit.Chem.RemoveHs(mol)

            brics_hits = [self.mol_no_h.GetSubstructMatches(i) for i in brics_mols]

            for index, brics_hit in enumerate(brics_hits):
                smiles = rdkit.Chem.MolToSmiles(brics_mols[index])
                name = scaffolding_method.name
                source = 'RDKit scaffolds'
                key = f'{name}_{smiles}'
                brics_hit = conversions.listit(brics_hit)

                if not smiles:
                    continue

                # Each distinct fragment is registered once, under its own key.
                if key not in self._scaffolds:
                    self._scaffolds[key] = SubstructureMapping(name, smiles, source, brics_hit)

            return brics_mols

        for s in scaffolds:
            scaffold_atom_names = [atom.GetProp('name') for atom in s.GetAtoms()]
            mapping = []
            # Map every scaffold atom back to the index of the first atom of
            # self.mol carrying the same 'name' property.
            for at_name in scaffold_atom_names:
                idx = [atom.GetIdx() for atom in self.mol.GetAtoms() if atom.GetProp('name') == at_name][0]
                mapping.append(idx)

            smiles = rdkit.Chem.MolToSmiles(s)
            name = scaffolding_method.name
            source = 'RDKit scaffolds'

            if not smiles:
                continue

            if name in self._scaffolds:
                self._scaffolds[name].mappings.append(mapping)
            else:
                self._scaffolds[name] = SubstructureMapping(name, smiles, source, [mapping])

        return scaffolds

    except (RuntimeError, ValueError) as err:
        # Chain the original exception so the root cause stays visible.
        raise CCDUtilsError(
            f'Computing scaffolds using method {scaffolding_method.name} failed.'
        ) from err
def gframecheck(s):
    """Return the generic-scaffold SMILES of the SMILES string ``s``.

    ``MakeScaffoldGeneric`` is applied to the parsed molecule directly.
    Returns ``None`` when any step raises.
    """
    try:
        return Chem.MolToSmiles(ms.MakeScaffoldGeneric(Chem.MolFromSmiles(s)))
    except Exception:
        # Best effort by design; ``Exception`` (not a bare ``except``) so
        # that KeyboardInterrupt / SystemExit still propagate.
        return None
# remove the CIDs of the top x compounds... TPec = list(ecfp_r.index) TPhts = list(htsfp_r.index) TPces = list(BaSH_r.index) # caluculate smiles and Topological scaffold for each compound # analysing topological scaffolds... cmpd_lists = {'ecfp': TPec, 'htsfp': TPhts, 'cesfp': TPces} Generic_sets_dict = {} for FP_name, cmpds in cmpd_lists.items(): gen_scaf_set = set() for cid in cmpds: if str(cid) in cid2smi: m_scaf = ms.GetScaffoldForMol( Chem.MolFromSmiles(cid2smi[str(cid)])) g_scaf = Chem.MolToSmiles(ms.MakeScaffoldGeneric(m_scaf)) gen_scaf_set.add(g_scaf) else: print('NA???, thats not meant to happen....') continue print('{}\t {} / {}'.format(FP_name, len(gen_scaf_set), numba)) Generic_sets_dict[FP_name] = gen_scaf_set ec_scafs = Generic_sets_dict['ecfp'] hts_scafs = Generic_sets_dict['htsfp'] ces_scafs = Generic_sets_dict['cesfp'] # o = ec_scafs.union(hts_scafs) # l=len(o)
def genericize_scaffold(s):
    """Return the generic form of scaffold ``s``, or None on ``ValueError``."""
    try:
        generic = MurckoScaffold.MakeScaffoldGeneric(s)
    except ValueError:
        generic = None
    return generic
#!/usr/bin/env python3 from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem.Scaffolds import MurckoScaffold drugbank_input = Chem.SDMolSupplier('drugbank.sdf') drugbank = [m for m in drugbank_input if m] basic_structure = drugbank[222] atomic_scaffold = MurckoScaffold.GetScaffoldForMol(basic_structure) atomic_scaffold.Compute2DCoords() graph_scaffold = MurckoScaffold.MakeScaffoldGeneric(atomic_scaffold) Draw.MolsToGridImage([basic_structure, atomic_scaffold, graph_scaffold]) drugbank_atomic_scaffolds = [ MurckoScaffold.GetScaffoldForMol(mol) for mol in drugbank ] for i in drugbank_atomic_scaffolds: i.Compute2DCoords() def genericize_scaffold(s): try: return MurckoScaffold.MakeScaffoldGeneric(s) except ValueError: return None drugbank_grafh_scaffolds = [ genericize_scaffold(s) for s in drugbank_atomic_scaffolds
def GenerateMolecularFrameworks():
    """Generate Bemis Murcko molecular framworks.

    All settings are read from the module-level ``OptionsInfo`` dictionary.
    For every valid molecule of the input file either the Murcko scaffold or,
    when ``UseGraphFrameworks`` is set, the result of ``MakeScaffoldGeneric``
    is computed. Frameworks are written to the output file directly, or
    collected and written sorted by heavy-atom count when ``SortFrameworks``
    is set. With ``RemoveDuplicateFrameworks`` set, frameworks whose canonical
    SMILES was already seen go to the duplicates output file instead.
    Summary counts are printed at the end.
    """

    Infile = OptionsInfo["Infile"]
    Outfile = OptionsInfo["Outfile"]

    UseChirality = OptionsInfo["UseChirality"]

    RemoveDuplicateFrameworks = OptionsInfo["RemoveDuplicateFrameworks"]
    UseGraphFrameworks = OptionsInfo["UseGraphFrameworks"]

    SortFrameworks = OptionsInfo["SortFrameworks"]
    if SortFrameworks:
        # Frameworks are buffered by ID so they can be sorted before writing.
        FrameworkMolIDs = []
        FrameworkMolIDToMolMap = {}
        FrameworkMolIDToAtomCountMap = {}

        DuplicateFrameworkMolIDs = []
        DuplicateFrameworkMolIDToMolMap = {}
        DuplicateFrameworkMolIDToAtomCountMap = {}

    DuplicatesOutfile = ""
    if RemoveDuplicateFrameworks:
        DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"]

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
    Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])

    # Set up a molecular framework writer...
    Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)

    # Set up a duplicate molecular framework writer...
    if RemoveDuplicateFrameworks:
        DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"])
        # Check the writer that was just created (the original code tested
        # ``Writer`` again, so a failed duplicates writer went unnoticed).
        if DuplicatesWriter is None:
            MiscUtil.PrintError("Failed to setup a writer for duplicates output fie %s " % DuplicatesOutfile)

    if RemoveDuplicateFrameworks:
        MiscUtil.PrintInfo("Generating files: %s and %s..." % (Outfile, DuplicatesOutfile))
    else:
        MiscUtil.PrintInfo("Generating file %s..." % Outfile)

    # Process molecules...
    MolCount = 0
    ValidMolCount = 0

    FrameworksCount = 0
    UniqueFrameworksCount = 0
    DuplicateFrameworksCount = 0

    # Canonical SMILES of the frameworks seen so far (duplicate detection).
    CanonicalSMILESMap = {}

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1

        if UseGraphFrameworks:
            FrameworksMol = MurckoScaffold.MakeScaffoldGeneric(Mol)
        else:
            FrameworksMol = MurckoScaffold.GetScaffoldForMol(Mol)

        if Compute2DCoords:
            AllChem.Compute2DCoords(FrameworksMol)

        if SortFrameworks:
            HeavyAtomCount = FrameworksMol.GetNumHeavyAtoms()

        FrameworksCount += 1

        if RemoveDuplicateFrameworks:
            CanonicalSMILES = Chem.MolToSmiles(FrameworksMol, isomericSmiles = UseChirality, canonical = True)
            if CanonicalSMILES in CanonicalSMILESMap:
                DuplicateFrameworksCount += 1
                if SortFrameworks:
                    # Track duplicate frameworks...
                    DuplicateFrameworkMolIDs.append(DuplicateFrameworksCount)
                    DuplicateFrameworkMolIDToMolMap[DuplicateFrameworksCount] = FrameworksMol
                    DuplicateFrameworkMolIDToAtomCountMap[DuplicateFrameworksCount] = HeavyAtomCount
                else:
                    # Write it out...
                    DuplicatesWriter.write(FrameworksMol)
            else:
                UniqueFrameworksCount += 1
                CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES
                if SortFrameworks:
                    # Track unique frameworks...
                    FrameworkMolIDs.append(UniqueFrameworksCount)
                    FrameworkMolIDToMolMap[UniqueFrameworksCount] = FrameworksMol
                    FrameworkMolIDToAtomCountMap[UniqueFrameworksCount] = HeavyAtomCount
                else:
                    # Write it out...
                    Writer.write(FrameworksMol)
        elif SortFrameworks:
            # Track for sorting...
            FrameworkMolIDs.append(FrameworksCount)
            FrameworkMolIDToMolMap[FrameworksCount] = FrameworksMol
            FrameworkMolIDToAtomCountMap[FrameworksCount] = HeavyAtomCount
        else:
            # Write it out...
            Writer.write(FrameworksMol)

    if SortFrameworks:
        ReverseOrder = OptionsInfo["DescendingSortOrder"]
        SortAndWriteFrameworks(Writer, FrameworkMolIDs, FrameworkMolIDToMolMap, FrameworkMolIDToAtomCountMap, ReverseOrder)
        if RemoveDuplicateFrameworks:
            SortAndWriteFrameworks(DuplicatesWriter, DuplicateFrameworkMolIDs, DuplicateFrameworkMolIDToMolMap, DuplicateFrameworkMolIDToAtomCountMap, ReverseOrder)

    Writer.close()
    if RemoveDuplicateFrameworks:
        DuplicatesWriter.close()

    MiscUtil.PrintInfo("\nTotal number of molecular frameworks: %d" % FrameworksCount)
    if RemoveDuplicateFrameworks:
        MiscUtil.PrintInfo("Number of unique molecular frameworks: %d" % UniqueFrameworksCount)
        MiscUtil.PrintInfo("Number of duplicate molecular frameworks: %d" % DuplicateFrameworksCount)

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
# get the molid molid = line.split()[1].strip() # get its smiles string smi = line.split()[0].strip() # get RDKit canonical smiles can = Chem.MolToSmiles(Chem.MolFromSmiles(smi)) # get canonical murcko smiles from canonical smiles murcko = ms.MurckoScaffoldSmiles(can) murcko = Chem.MolToSmiles(Chem.MolFromSmiles(murcko)) # get generic murcko smiles gen_murcko = Chem.MolToSmiles( ms.MakeScaffoldGeneric(Chem.MolFromSmiles(murcko))) # for each molid key, add the smi, murcko mol, and murcko smiles mol_dict[molid] = [smi, can, murcko, gen_murcko] # bin the mols into the different murcko scaffolds observed if gen_murcko in ms_dict: ms_dict[gen_murcko].append(molid) else: ms_dict[gen_murcko] = [molid] uniq_scaffs = ms_dict.keys() print "molid,scaffid" for molid in mol_dict: scaff = mol_dict[molid][3]
def calc_murcko_frame(mol):
    """Return the SMILES of ``MakeScaffoldGeneric`` applied to ``mol``.

    The generic transformation is applied to the molecule as given.
    """
    generic_mol = MurckoScaffold.MakeScaffoldGeneric(mol)
    return Chem.MolToSmiles(generic_mol)