def RetrieveRGroups(Mols):
    """Run R group decomposition on the molecules and collect the results.

    The core scaffolds come from SetupCoreScaffolds(Mols) and the decomposition
    parameters from SetupRGroupDecompositionParams().

    Arguments:
        Mols: Sequence of molecules to decompose.

    Returns:
        tuple: (RGroups, UnmatchedMolIndices), where RGroups is the result of
            GetRGroupsAsColumns(asSmiles=True) and UnmatchedMolIndices lists
            the positions in Mols for which Add() returned a negative status.

    """

    Decomposer = rgd.RGroupDecomposition(
        SetupCoreScaffolds(Mols), SetupRGroupDecompositionParams()
    )

    MiscUtil.PrintInfo("\nPerforming R group decomposition...")

    # Add() reports a negative status for a molecule that is not accepted;
    # remember the positions of those molecules for the caller.
    UnmatchedMolIndices = [
        Index for Index, Mol in enumerate(Mols) if Decomposer.Add(Mol) < 0
    ]

    ProcessStatus = Decomposer.Process()
    if not ProcessStatus:
        MiscUtil.PrintWarning(
            "R group decomposition failed to match any molecule to core scaffold(s)..."
        )

    return (Decomposer.GetRGroupsAsColumns(asSmiles=True), UnmatchedMolIndices)
def FWDecompose(
        scaffolds, mols, scores,
        decomp_params=default_decomp_params) -> FreeWilsonDecomposition:
    """
    Perform a free wilson analysis

    The molecules are decomposed into rgroups on the given scaffold(s), each
    matched molecule is encoded as a 1/0 vector over the unique rgroup smiles
    found, and a Ridge regression is fitted from these vectors to the scores.

    :param scaffolds: scaffold or list of scaffolds to use for the rgroup
        decomposition
    :param mols: molecules to decompose
    :param scores: list of floating point numbers for the regression
        (you may need convert these to their logs in some cases)
    :param decomp_params: RgroupDecompositionParams, default::

            default_decomp_params = rdkit.Chem.rdRGroupDecomposition.RGroupDecompositionParameters()
            default_decomp_params.matchingStrategy = rgd.GA
            default_decomp_params.onlyMatchAtRGroups = False

        If you only want to decompose on specific group locations
        set onlyMatchAtRGroups to True
    :return: a FreeWilsonDecomposition built from the rgroups, the
        smiles -> descriptor index map, the fitted model, its r2 and the
        descriptors; None (after logging an error) when no molecule matched
        the scaffold(s)
    :raises ValueError: if the number of molecules and scores differ

    >>> from rdkit import Chem
    >>> from freewilson import FWBuild, FWDecompose
    >>> from rdkit.Chem import Descriptors
    >>> scaffold = Chem.MolFromSmiles("c1cccnc1")
    >>> mols = [Chem.MolFromSmiles("c1cccnc1"+"C"*(i+1)) for i in range(100)]
    >>> scores = [Descriptors.MolLogP(m) for m in mols]
    >>> fw = FWDecompose(scaffold, mols, scores)
    >>> for pred in FWBuild(fw):
    ...   print(pred)

    For an easy way to report predictions see

    >>> import sys
    >>> predictions_to_csv(sys.stdout, fw, FWBuild(fw))

    See FWBuild docs to see how to filter predictions, molecular weight or
    molecular properties.
    """
    descriptors = []  # list of descriptors, one per matched molecule
    #                   descriptors are 1/0 if a sidechain is present
    matched_scores = []  # scores from the matching molecules
    rgroup_idx = {}  # rgroup index into descriptor { smiles: idx }
    rgroups = defaultdict(list)  # final list of rgroups/sidechains

    if len(mols) != len(scores):
        raise ValueError(
            f"The number of molecules must match the number of scores #mols {len(mols)} #scores {len(scores)}"
        )

    # decompose the rgroups
    logger.info(f"Decomposing {len(mols)} molecules...")
    decomposer = rgd.RGroupDecomposition(scaffolds, decomp_params)
    for mol, score in tqdm(zip(mols, scores)):
        # Add() returns a negative value when the molecule is not accepted;
        # only the scores of accepted molecules take part in the regression.
        if decomposer.Add(mol) >= 0:
            matched_scores.append(score)
    decomposer.Process()
    logger.info(f"Matched {len(matched_scores)} out of {len(mols)}")

    if not matched_scores:
        logger.error("No scaffolds matched the input molecules")
        # Callers have to handle None here despite the return annotation.
        return None

    decomposition = decomposer.GetRGroupsAsRows(asSmiles=True)

    logger.info("Get unique rgroups...")
    rgroup_counts = defaultdict(int)
    for row in decomposition:
        for rgroup, smiles in row.items():
            rgroup_counts[smiles] += 1
            if smiles not in rgroup_idx:
                # first time this smiles is seen: give it the next descriptor
                # slot; count and coefficient are filled in after the fit
                rgroup_idx[smiles] = len(rgroup_idx)
                rgroups[rgroup].append(RGroup(smiles, rgroup, 0, 0))

    logger.info(f"Descriptor size {len(rgroup_idx)}")

    # get the descriptors list, one-hot encoding per rgroup
    for row in decomposition:
        descriptor = [0] * len(rgroup_idx)
        for smiles in row.values():
            if smiles in rgroup_idx:
                descriptor[rgroup_idx[smiles]] = 1
        descriptors.append(descriptor)

    assert len(descriptors) == len(
        matched_scores
    ), f"Number of descriptors({len(descriptors)}) doesn't match number of matched scores({len(matched_scores)})"

    # Perform the Ridge Regression
    logger.info("Ridge Regressing...")
    lm = Ridge()
    lm.fit(descriptors, matched_scores)
    preds = lm.predict(descriptors)
    r2 = r2_score(matched_scores, preds)
    logger.info(f"R2 {r2}")
    logger.info(f"Intercept = {lm.intercept_:.2f}")

    # annotate every rgroup with its occurrence count, fitted coefficient and
    # position in the descriptor vector
    for sidechains in rgroups.values():
        for rgroup in sidechains:
            idx = rgroup_idx[rgroup.smiles]
            rgroup.count = rgroup_counts[rgroup.smiles]
            rgroup.coefficient = lm.coef_[idx]
            rgroup.idx = idx

    return FreeWilsonDecomposition(rgroups, rgroup_idx, lm, r2, descriptors)
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffold with enforceable group that needs to appear
    in the core, forcing to keep the full side chain if required.

    The molecules are first clustered by their generic Murcko scaffold (plus
    any matching additional template). For every cluster the MCS is computed
    and used as the core of an R-group decomposition; the side chains hanging
    on non-ring core atoms are then trimmed to produce the scaffolds.

    NOTE(hadim): consider parallelize this (if possible).

    NOTE: `RemoveAllConformers()` is called on the input molecules, so they
    are modified in place.

    Args:
        mols: List of all molecules
        enforce_subs: List of substructure to enforce on the scaffold.
        n_atom_cuttoff: Minimum number of atom a core should have.
        additional_templates: Additional template to use to generate scaffolds.
        ignore_non_ring: Whether to ignore atom no in murcko ring system, even if they are in the framework.
        mcs_params: Arguments of MCS algorithm.

    Returns:
        scaffolds: set
            All found scaffolds in the molecules as valid smiles
        scaffold_infos: dict of dict
            Infos on the scaffold mapping, ignoring any side chain that had to be enforced.
            Key corresponds to generic scaffold smiles
            Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
            Values at ['mols'] corresponds to list of molecules matching the scaffold
        scaffold_to_group: dict of list
            Map between each generic scaffold and the R-groups decomposition row
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by they generic Murcko scaffold, allowing
    # side chain that contains cycle (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set()

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except Exception:
            # best effort: fall back to the non-generic Murcko scaffold
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(scf)
            # remove from the highest index down so the remaining indices
            # stay valid while atoms are deleted
            non_ring_atoms = sorted(
                (a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()),
                reverse=True,
            )
            for atom_idx in non_ring_atoms:
                rw_scf.RemoveAtom(atom_idx)
            # NOTE(review): with asMols=False the cluster keys are tuples of
            # atom indices rather than scaffold smiles as documented above --
            # confirm whether this is intended before relying on the keys.
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add templates mols if exists:
        for tmp in additional_templates:
            tmp = dm.to_mol(tmp)
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(tmp)
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for key in scfs:
            scf2infos[key].setdefault("mols", []).append(m)

    for scf, infos in scf2infos.items():
        # cheat by adding murcko as last mol always
        popout = False
        cluster_mols = infos["mols"]
        if len(cluster_mols) < 2:
            # MCS needs at least two molecules: pair a singleton with its own
            # Murcko scaffold, and drop that extra entry again afterwards
            cluster_mols = cluster_mols + [
                MurckoScaffold.GetScaffoldForMol(cluster_mols[0])
            ]
            popout = True

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            cluster_mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        infos["smarts"] = dm.to_smarts(mcsM)
        if popout:
            cluster_mols = cluster_mols[:-1]

        core_groups = []
        # generate rgroups based on the mcs core
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(cluster_mols):
                analog.RemoveAllConformers()
                res = rg.Add(analog)
                if res >= 0:
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            # best effort: a failed decomposition leaves this cluster without
            # R-group rows, so no scaffold is produced for it below
            pass

        matched_mols = [cluster_mols[i] for i in success_mols]
        scf2groups[scf] = core_groups
        for mol, gp in zip(matched_mols, core_groups):
            core = gp["Core"]
            # only R-groups attached to labelled, non-ring core atoms are
            # candidates for trimming
            acceptable_groups = [
                a.GetAtomMapNum()
                for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]

            rgroups = [gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp]
            if enforce_subs:
                # keep (i.e. do not trim) any side chain containing one of the
                # enforced substructures
                # NOTE(review): entries are handed straight to
                # GetSubstructMatch -- presumably they must be molecule/query
                # objects rather than raw strings; confirm against callers.
                rgroups = [
                    rgp
                    for rgp in rgroups
                    if not any(
                        len(rgp.GetSubstructMatch(frag)) > 0 for frag in enforce_subs
                    )
                ]
            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param), rgroups
                )
            except Exception:
                # skip molecules whose side chains cannot be trimmed
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups