def test_sanitize():
    """Check `dm.sanitize_mol` and `dm.sanitize_smiles` on valid, fixable and broken inputs."""
    aspirin = "CC(=O)Oc1ccccc1C(=O)O"

    # Parse without sanitization, then sanitize explicitly.
    unsanitized = dm.to_mol(aspirin, sanitize=False)
    sanitized = dm.sanitize_mol(unsanitized, charge_neutral=True)
    assert dm.to_smiles(sanitized) == "CC(=O)Oc1ccccc1C(=O)O"

    # A missing molecule stays missing.
    assert dm.sanitize_mol(None, charge_neutral=True) is None

    # The trailing comment on each entry is the outcome expected for it.
    smiles_list = (
        "CC.[H][N:1]1(C)=CC(O)=CC2CCCCC12",  # broken
        "O=c1ccc2ccccc2n1",  # sanitize
        "Cc1nnnn1C",  # none
        "CCc1ccc2nc(=O)c(cc2c1)Cc1nnnn1C1CCCCC1",  # sanitize
        "c1cnc2cc3ccnc3cc12",  # none
        "c1cc2cc3ccnc3cc2n1",  # none
        "O=c1ccnc(c1)-c1cnc2cc3ccnc3cc12",  # sanitize
        "O=c1ccnc(c1)-c1cc1",  # broken
    )
    broken, fixable, regular = smiles_list[:3]

    # check sanitize_mol
    assert dm.to_mol(fixable) is None
    assert dm.to_mol(regular) is not None
    assert dm.sanitize_mol(None) is None
    for candidate, expect_failure in ((broken, True), (fixable, False), (regular, False)):
        outcome = dm.sanitize_mol(dm.to_mol(candidate, sanitize=False))
        assert (outcome is None) == expect_failure

    # The fixable entry must come back as the [nH] form.
    repaired = dm.sanitize_mol(dm.to_mol(fixable, sanitize=False))
    assert dm.to_smiles(repaired) == dm.sanitize_smiles("O=c1ccc2ccccc2[nH]1")

    # Six of the eight entries are expected to yield a SMILES.
    n_recovered = sum(dm.sanitize_smiles(entry) is not None for entry in smiles_list)
    assert n_recovered == 6
def test_to_image():
    """Render single and multiple molecules with `dm.viz.to_image` and check the pixel arrays."""
    # Keep the first eight molecules of the FreeSolv dataset.
    freesolv = dm.data.freesolv()
    mols = dm.from_df(freesolv)[:8]  # type: ignore

    # Grid rendering: one legend per molecule, four columns of 200x200 cells.
    grid = dm.viz.to_image(
        mols,
        legends=[dm.to_smiles(m) for m in mols],
        n_cols=4,
        mol_size=(200, 200),
    )
    grid_pixels = np.array(grid)
    assert grid_pixels.dtype == np.uint8
    assert grid_pixels.shape == (400, 800, 3)
    assert grid_pixels.shape[1] == 200 * 4

    # Single-molecule rendering with a single legend string.
    first = mols[0]
    single = dm.viz.to_image(first, legends=dm.to_smiles(first), mol_size=(200, 200))
    single_pixels = np.array(single)
    assert single_pixels.dtype == np.uint8
    assert single_pixels.shape == (200, 200, 3)

    # Smoke check: atom indices plus an integer `mol_size` must not raise.
    dm.viz.to_image(first, indices=True, mol_size=400)
def test_to_smiles_fail():
    """Check both failure modes of `dm.to_smiles` when given a non-molecule input."""
    # With `allow_to_fail=False` the conversion is expected to return `None`.
    smiles = dm.to_smiles(55, allow_to_fail=False)
    assert smiles is None

    # With `allow_to_fail=True` the underlying error is expected to propagate.
    # NOTE(hadim): ideally you want to catch only `Boost.Python.ArgumentError` here.
    with pytest.raises(Exception):
        dm.to_smiles(55, allow_to_fail=True)
def test_standardize_mol():
    """Compare SMILES-level and molecule-level standardization of the same input."""
    sodium_smiles = "[Na]OC1=CC2CCCCC2N=C1"

    # NOTE(review): `dm.to_smiles` receives a SMILES *string* (not a molecule) on both
    # sides of this comparison -- confirm neither side silently degrades to `None`,
    # which would make the assertion vacuous.
    from_smiles = dm.to_smiles(dm.standardize_smiles(sodium_smiles))

    parsed = dm.to_mol(sodium_smiles)
    standardized = dm.standardize_mol(parsed, disconnect_metals=True, uncharge=True)
    from_mol = dm.to_smiles(Chem.MolToSmiles(standardized))

    assert from_smiles == from_mol
def test_to_sdf_mols(datadir, tmp_path):
    """Round-trip a list of molecules through `dm.to_sdf` / `dm.read_sdf`."""
    source = datadir / "TUBB3-observations.sdf.gz"
    target = tmp_path / "mols.sdf"

    original = dm.read_sdf(source, as_df=False)
    dm.to_sdf(original, target)
    reloaded = dm.read_sdf(target, as_df=False)

    # The SMILES of every molecule must survive the write/read cycle, in order.
    expected = [dm.to_smiles(m) for m in original]
    actual = [dm.to_smiles(m) for m in reloaded]
    assert expected == actual
def test_to_sdf_single_mol(tmp_path):
    """Round-trip a single molecule through `dm.to_sdf` / `dm.read_sdf`."""
    target = tmp_path / "test.sdf"
    original = dm.to_mol(
        "CC1(C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O)O"
    )

    # `dm.to_sdf` is given a bare molecule here rather than a list.
    dm.to_sdf(original, target)
    reloaded = dm.read_sdf(target)

    assert dm.to_smiles(original) == dm.to_smiles(reloaded[0])
def mmpa_cut(mol: Chem.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut a molecule into core/chains records for a later MMPA analysis.

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation using the default rdkit
            pattern: [#6+0;!$(*=, #[!#6])]!@!=!#[*]". Otherwise a heavy-atom pattern
            is used with both 4 and 3 maximum cuts.

    Returns:
        Set of newline-terminated 'smiles,core,chains' strings, or `None` when
        `mol` is `None`.
    """
    if mol is None:
        return None

    heavy_atom_pattern = "[!#1]!@!=!#[!#1]"
    records = set()
    parent_smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        cuts = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # heavy atoms
        cuts = mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=4, max_bond_cut=30)
        cuts.update(mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=3, max_bond_cut=30))

    records.update(f"{parent_smiles},{core},{chains}\n" for core, chains in set(cuts))

    # hydrogen splitting: records are keyed on the SMILES of the molecule with
    # explicit hydrogens, and only produced for fewer than 60 heavy atoms.
    with_hs = Chem.AddHs(mol)
    with_hs_smiles = dm.to_smiles(with_hs)
    if with_hs.GetNumHeavyAtoms() < 60:
        h_cuts = mmpa_frag(with_hs, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
        records.update(f"{with_hs_smiles},{core},{chains}\n" for core, chains in h_cuts)

    return records
def all_transform_apply(
    mol,
    rxns,
    max_num_action=float("Inf"),
    asMols=True,
    **kwargs,
):
    """
    Apply a transformation defined as a reaction from a set of reaction to the input molecule.

    The reaction need to be one reactant-only

    Arguments
    ----------
        mol: <Chem.Mol>
            Input molecule
        rxns: list
            list of reactions/ reaction smarts
        max_num_action: int, optional
            Maximum number of result to return
            (Default: inf)
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        **kwargs:
            Accepted but not used by this function.

    Returns
    -------
        Products obtained from applying the chemical reactions
    """
    # Products are collected as SMILES so that duplicates collapse.
    product_smiles = set()

    with dm.without_rdkit_log():
        for rxn in rxns:
            if len(product_smiles) >= max_num_action:
                break
            if isinstance(rxn, str):
                rxn = AllChem.ReactionFromSmarts(rxn)
            try:
                pcdts = [products[0] for products in rxn.RunReactants([mol])]
                pcdts = [dm.sanitize_mol(x) for x in pcdts]
                product_smiles.update([dm.to_smiles(x) for x in pcdts if x])
            except Exception:
                # Best effort: a reaction that cannot be applied is skipped.
                # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
                continue

    mols = [x for x in product_smiles if x is not None]

    # Slicing needs an integer bound, so only truncate for a finite limit.
    if np.isfinite(max_num_action):
        mols = mols[:max_num_action]

    mols = [dm.to_mol(x) for x in mols]
    if not asMols:
        mols = [dm.to_smiles(x) for x in mols if x is not None]
    return mols
def smiles_to_fingerprint(smiles):
    """Convert a SMILES into its datamol SMILES and the on-bits of a Morgan fingerprint.

    Args:
        smiles: Input SMILES; it is passed through `str()` before parsing.

    Returns:
        A `(standard_smiles, achiral_fp)` tuple where `achiral_fp` is the list of
        on-bit indices of a radius-2, 8192-bit Morgan fingerprint computed with
        `useChirality=False`.
    """
    mol = dm.to_mol(str(smiles), ordered=True)

    morgan_options = dict(
        radius=2,
        nBits=8192,
        invariants=[],
        fromAtoms=[],
        useChirality=False,
        useBondTypes=True,
        useFeatures=False,
    )
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **morgan_options)

    standard_smiles = dm.to_smiles(mol)
    return standard_smiles, list(bit_vector.GetOnBits())
def _compute_fragment_join(
    mol,
    fragment,
    mol_atom_count,
    bond_between_rings=True,
    asMols=True,
):
    """Yield every way a fragment can be attached to a mol.

    Args:
        mol: Molecule receiving the fragment.
        fragment: Fragment to attach; a shallow copy is used.
        mol_atom_count: Offset added to fragment atom indices to locate them in the
            combined molecule.
        bond_between_rings: When False, pairs where both atoms are in a ring are skipped.
        asMols: Whether to yield molecules (True) or SMILES (False).

    Yields:
        Joined candidates that `dm.sanitize_mol` did not reject.
    """
    # need to copy the fragment; copy is faster than all the other methods
    fragment = copy.copy(fragment)
    skip_ring_pairs = not bond_between_rings

    with dm.without_rdkit_log():
        combined = Chem.CombineMols(mol, fragment)
        for mol_idx in range(mol.GetNumAtoms()):
            mol_atom = combined.GetAtomWithIdx(mol_idx)
            # Atoms without implicit valence are skipped.
            if mol_atom.GetImplicitValence() == 0:
                continue

            for frag_idx in range(fragment.GetNumAtoms()):
                frag_atom = combined.GetAtomWithIdx(frag_idx + mol_atom_count)
                if frag_atom.GetImplicitValence() == 0:
                    continue
                # no bond between atoms already in rings
                if skip_ring_pairs and mol_atom.IsInRing() and frag_atom.IsInRing():
                    continue

                for candidate in _all_atom_join(combined, mol_atom, frag_atom):
                    candidate = dm.sanitize_mol(candidate)
                    if candidate is None:
                        continue
                    yield candidate if asMols else dm.to_smiles(candidate)
def _preprocess(i, row):
    """Standardize the molecule of one row and attach identifiers and a fingerprint.

    Args:
        i: Row index; not used in the body.
        row: Mapping-like row; the SMILES is read from `row[smiles_column]`
            (`smiles_column` comes from the enclosing scope).

    Returns:
        The same `row` with `standard_smiles`, `selfies`, `inchi`, `inchikey` and
        `onbits_fp` filled in.
    """
    raw_smiles = str(row[smiles_column])

    # Parse, then fix / sanitize / standardize in sequence.
    mol = dm.to_mol(raw_smiles, ordered=True)
    mol = dm.fix_mol(mol)
    mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
    mol = dm.standardize_mol(
        mol,
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    # Radius-2, 8192-bit Morgan fingerprint computed with chirality enabled.
    morgan_options = dict(
        radius=2,
        nBits=8192,
        invariants=[],
        fromAtoms=[],
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **morgan_options)

    row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
    row["selfies"] = dm.to_selfies(mol)
    row["inchi"] = dm.to_inchi(mol)
    row["inchikey"] = dm.to_inchikey(mol)
    row["onbits_fp"] = list(bit_vector.GetOnBits())
    return row
def test_to_from_text(tmp_path):
    """Round-trip SMILES through `dm.to_smi` / `dm.read_smi`, on disk and in memory."""
    smi_path = tmp_path / "mols.smi"

    expected_smiles = [
        "Cn1c(=S)ccc2nc[nH]c21",
        "Clc1n[nH]c2c1=[NH+]C(c1ccc[nH+]c1)C[NH+]=2",
        "Fc1ccsc1",
        "N#Cc1cc2c(o1)[NH2+]CCN2Cn1cnc2c1CSCC2",
        "O=CN1CCC2NC=CC2C1",
        "Oc1[nH]nc2c1-n1ncnc1C2",
        "OC1=NNC2(OC=CCO2)C2(C3CCCc4nonc43)NN=NN12",
        "[NH-]Sc1cc2nc[nH+]cc2o1",
        "[NH3+]C12CNCCOC1(N1CCCCC1)C=C(F)NC2",
    ]
    mols = [dm.to_mol(entry) for entry in expected_smiles]

    # Write to a file, read it back and compare SMILES.
    dm.to_smi(mols, smi_path)
    reloaded = dm.read_smi(smi_path)
    assert [dm.to_smiles(m) for m in reloaded] == expected_smiles

    # An empty list must raise when `error_if_empty` is set.
    with pytest.raises(ValueError):
        dm.to_smi([], smi_path, error_if_empty=True)

    smi_path.unlink()

    # A file-like object is accepted in place of a path.
    buffer = io.StringIO()
    dm.to_smi(mols, buffer)
    written_lines = buffer.getvalue().strip().split("\n")
    assert written_lines == expected_smiles
def all_mmpa_assemble(molist, max_num_action=float("Inf"), asMols=True, **kwargs):
    """Enumerate all mmpa assembly of molecules in molist

    Arguments
    ----------
        molist: list of <Chem.Mol>
            List of molecules to fragment and reconstruct
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        max_num_action: int, optional
            Maximum number of assembly
            (Default: inf)
        **kwargs:
            Accepted but not used by this function.

    Returns
    -------
        res: list of <Chem.Mol>
            Molecules obtained by merging core and side_chains
            (SMILES strings when `asMols` is False)
    """
    cores = []
    side_chains = []

    for mol in molist:
        mol_frag = mmpa_frag(mol, max_bond_cut=30)
        if not mol_frag:
            continue
        # Each fragmentation is a pair; only the second element ("core.sidechain") is used.
        for _, core_and_chain in mol_frag:
            core, sidechain = core_and_chain.split(".")
            # Rewrite the attachment point from atom-map form to isotope-label form.
            cores.append(Chem.MolFromSmiles(core.replace("[*:1]", "[1*]")))
            side_chains.append(Chem.MolFromSmiles(sidechain.replace("[*:1]", "[1*]")))

    new_mols = _compute_mmpa_assembly(cores, side_chains, max_num_action=max_num_action)
    if not asMols:
        new_mols = [dm.to_smiles(x) for x in new_mols if x]
    return new_mols
def test_enumerate_tautomers():
    """`dm.enumerate_tautomers` must return exactly the two expected tautomers."""
    parent = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
    tautomers = dm.enumerate_tautomers(parent, n_variants=10)

    expected = {
        "O=C1C=[N:1]C2CCCCC2C1",
        "OC1=CC2CCCCC2[N:1]=C1",
    }
    found = set()
    for tautomer in tautomers:
        found.add(dm.to_smiles(tautomer))
    assert found == expected
def test_from_selfies():
    """Decode a SELFIES string both as a SMILES and as a molecule."""
    selfies = (
        "[C][C][Branch1_2][C][=O][O][C][=C][C][=C][C][=C][Ring1][Branch1_2][C][Branch1_2][C][=O][O]"
    )

    # As text: the decoded SMILES is expected in kekulized form.
    decoded_smiles = dm.from_selfies(selfies, as_mol=False)
    assert decoded_smiles == "CC(=O)OC1=CC=CC=C1C(=O)O"

    # As a molecule: `dm.to_smiles` is expected to give the aromatic form.
    decoded_mol = dm.from_selfies(selfies, as_mol=True)
    assert dm.to_smiles(decoded_mol) == "CC(=O)Oc1ccccc1C(=O)O"
def test_to_neutral():
    """`dm.to_neutral` must strip formal charges from cations and anions."""
    # Ammonium cation.
    ammonium = dm.to_mol("[NH4+]", add_hs=False, explicit_only=False)
    assert dm.to_smiles(dm.to_neutral(ammonium)) == "[NH4]"

    # Benzoate anion: the formal charges must sum to zero afterwards.
    benzoate = dm.to_mol("O=C(c1ccccc1)[O-]", add_hs=False, explicit_only=False)
    neutralized = dm.to_neutral(benzoate)
    total_charge = 0
    for atom in neutralized.GetAtoms():
        total_charge += atom.GetFormalCharge()
    assert total_charge == 0
def test_enumerate_stereo():
    """`dm.enumerate_stereoisomers` must return the four expected stereoisomers."""
    parent = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
    isomers = dm.enumerate_stereoisomers(parent, n_variants=10)

    expected = {
        "OC1=C[C@@H]2CCCC[C@@H]2[N:1]=C1",
        "OC1=C[C@@H]2CCCC[C@H]2[N:1]=C1",
        "OC1=C[C@H]2CCCC[C@@H]2[N:1]=C1",
        "OC1=C[C@H]2CCCC[C@H]2[N:1]=C1",
    }
    found = set()
    for isomer in isomers:
        found.add(dm.to_smiles(isomer))
    assert found == expected
def mmpa_fragment_exchange(mol1, mol2, return_all=False, **kwargs):
    """Perform a fragment exchange between two molecules using mmpa rules

    Arguments
    ----------
        mol1: <Chem.Mol>
            input molecule 1
        mol2: <Chem.Mol>
            input molecule 2
        return_all: bool, optional
            Whether to return list of all molecules
        **kwargs:
            Accepted but not used by this function.

    Returns
    -------
        modified_mol1, modified_mol2
            Molecules obtained by exchanging fragment between mol1 and mol2.
            In case of failure, mol1, mol2 are returned.
            When `return_all` is True, the full list of new molecules is returned instead.
    """
    # The inputs themselves (and failed conversions) are not valid results.
    unwanted = {dm.to_smiles(m) for m in (mol1, mol2)} | {None}

    assembled = all_mmpa_assemble([mol1, mol2])
    # find unique
    candidates = {dm.to_smiles(m) for m in assembled} - unwanted

    out = []
    for sm in candidates:
        try:
            r = dm.to_mol(sm, sanitize=True)
        except Exception:
            # Skip candidates that cannot be parsed; `Exception` (not a bare
            # except) lets KeyboardInterrupt/SystemExit through.
            continue
        if r is not None:
            out.append(r)

    if return_all:
        return out

    random.shuffle(out)
    # Pad with the inputs so that two molecules are always available.
    out.extend([mol1, mol2])
    return out[0], out[1]
def all_fragment_on_bond(mol, asMols=False, max_num_action=float("Inf"), break_aromatic=True):
    """Fragment all possible bond in a molecule and return the set of resulting fragments
    This is similar to `random_bond_cut`, but is not stochastic as it does not return a random fragment
    but all the fragments resulting from all potential bond break in the molecule.

    .. note::
        This will always be a subset of all_bond_remove, the main difference being that all_bond_remove, allow decreasing
        bond count, while this one will always break a molecule into two.

    Args:
        mol: <Chem.Mol>
            input molecule. It is not modified: fragmentation runs on a copy.
        asMols: bool, optional
            Whether to return results as mols or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity
        break_aromatic: bool, optional
            Whether to attempt to break even aromatic bonds
            (Default: True)

    Returns:
        set of fragments
    """
    mol.GetRingInfo().AtomRings()
    fragment_set = set()

    # Work on a private copy: the kekulization below clears aromatic flags in place
    # and must not leak into the caller's molecule.
    mol = Chem.Mol(mol)

    bonds = list(mol.GetBonds())
    if not bonds:
        return fragment_set

    if break_aromatic:
        Chem.Kekulize(mol, clearAromaticFlags=True)

    for bond in bonds:
        # Aromatic bonds are only cut when explicitly requested.
        if not break_aromatic and bond.GetIsAromatic():
            continue

        truncate = Chem.FragmentOnBonds(mol, [bond.GetIdx()], addDummies=False)
        truncate = dm.sanitize_mol(truncate)
        if truncate is None:
            continue

        for frag in rdmolops.GetMolFrags(truncate, asMols=True):
            frag = dm.sanitize_mol(frag)
            if frag:
                if not asMols:
                    frag = dm.to_smiles(frag)
                fragment_set.add(frag)
            # Stop as soon as the budget is exceeded.
            if len(fragment_set) > max_num_action:
                return fragment_set

    return fragment_set
def _preprocess(i, row):
    """Standardize the molecule of one row and attach identifiers and stereoisomer SMILES.

    Args:
        i: Row index; not used in the body.
        row: Mapping-like row; the SMILES is read from `row[smiles_column]`
            (`smiles_column` comes from the enclosing scope).

    Returns:
        The same `row` with `standard_smiles`, `selfies`, `inchi`, `inchikey` and
        `enumerated_smiles` filled in. When a `ValueError` is raised during
        processing, every output column is set to the `'dropped'` placeholder
        (a one-element list for `enumerated_smiles`).
    """
    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(
            mol,
            disconnect_metals=False,
            normalize=True,
            reionize=True,
            uncharge=False,
            stereo=True,
        )

        # Enumerate at most 20 unique stereoisomers; `rand` is passed a fixed constant.
        opts = StereoEnumerationOptions(unique=True, maxIsomers=20, rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        smiles_list = sorted(Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["selfies"] = dm.to_selfies(mol)
        row["inchi"] = dm.to_inchi(mol)
        row["inchikey"] = dm.to_inchikey(mol)
        row["enumerated_smiles"] = smiles_list
        return row

    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["selfies"] = 'dropped'
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        # A one-element list: `list('dropped')` would split the marker into characters.
        row["enumerated_smiles"] = ['dropped']
        return row
def all_atom_add(
    mol,
    atom_types=["C", "N", "O", "F", "Cl", "Br"],
    asMols=True,
    max_num_action=float("Inf"),
    **kwargs,
):
    """Add one new atom to the molecule, at every atom that still has implicit valence.

    The actual joining of the new atom is delegated to `_all_atom_join`.

    .. warning::
        This is computationally expensive

    Args:
        mol: <Chem.Mol>
            Input molecule
        atom_types: list
            List of atom symbol to add
            (Default: ["C", "N", "O", "F", "Cl", "Br"])
        asMols: bool, optional
            Whether to return output as molecule or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity

    Returns:
        All possible molecules with one additional atom added
    """
    candidates = []

    with dm.without_rdkit_log():
        for atom in mol.GetAtoms():
            # Atoms without implicit valence are skipped.
            if atom.GetImplicitValence() == 0:
                continue

            for symbol in atom_types:
                editable = Chem.RWMol(mol)
                added_idx = editable.AddAtom(Chem.Atom(symbol))
                editable.UpdatePropertyCache(strict=False)
                added_atom = editable.GetMol().GetAtomWithIdx(added_idx)
                candidates.extend(_all_atom_join(editable, atom, added_atom))
                if len(candidates) > max_num_action:
                    break
            else:
                # Inner loop ran to completion: move on to the next atom.
                continue
            # Inner loop hit the budget: stop the outer loop too.
            break

    sanitized = [dm.sanitize_mol(candidate) for candidate in candidates]
    valid = [candidate for candidate in sanitized if candidate is not None]

    if asMols:
        return valid
    return [dm.to_smiles(x) for x in valid if x]
def test_inchi():
    """Check InChI / InChIKey conversion, the reverse conversion and `None` handling."""
    aspirin = "CC(=O)Oc1ccccc1C(=O)O"
    mol = dm.to_mol(aspirin)

    inchi = dm.to_inchi(mol)
    assert inchi == "InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)"

    assert dm.to_inchikey(mol) == "BSYNRYMUTXBXSQ-UHFFFAOYSA-N"

    # Converting the InChI back must give the starting SMILES.
    rebuilt = dm.from_inchi(inchi)
    assert dm.to_smiles(rebuilt) == aspirin

    # Every converter maps `None` to `None`.
    for converter in (dm.to_inchi, dm.to_inchikey, dm.from_inchi):
        assert converter(None) is None
def all_atom_replace(mol, atom_types=["C", "N", "S", "O"], asMols=True, max_num_action=float("Inf"), **kwargs):
    """Replace each non-hydrogen atom, in turn, by every symbol of `atom_types`.

    .. warning::
        This is computationally expensive

    Args:
        mol: <Chem.Mol>
            Input molecule
        atom_types: list
            List of atom symbol to use as replacement
            (Default: ['C', 'N', 'S', 'O'])
        asMols: bool, optional
            Whether to return output as molecule or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity

    Returns:
        All possible molecules with atoms replaced
    """
    candidates = []

    with dm.without_rdkit_log():
        for atom in mol.GetAtoms():
            # Only atoms with atomic number above 1 are replaced.
            if atom.GetAtomicNum() <= 1:
                continue

            for symbol in atom_types:
                editable = Chem.RWMol(mol)
                editable.ReplaceAtom(atom.GetIdx(), Chem.Atom(symbol))
                candidates.append(editable)
                if len(candidates) > max_num_action:
                    break
            else:
                # Inner loop ran to completion: move on to the next atom.
                continue
            # Inner loop hit the budget: stop the outer loop too.
            break

    # Sanitize and remove bad molecules
    sanitized = [dm.sanitize_mol(candidate) for candidate in candidates]
    valid = [candidate for candidate in sanitized if candidate is not None]

    if asMols:
        return valid
    # Return SMILES
    return [dm.to_smiles(x) for x in valid]
def all_fragment_assemble(
    fragmentlist,
    max_num_action=float("Inf"),
    asMols=True,
    seen=None,
    **kwargs,
):
    """Assemble a set of fragments into new molecules via `dm.assemble.assemble_brics_order`.

    .. warning::
        This is computationally expensive

    Arguments
    ----------
        fragmentlist: list
            List of blocks to assemble
        max_num_action: float, optional
            Maximum number of action to reduce complexity. No limit by default
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        seen: list, optional
            List of initial molecules

    Returns
    -------
        reconstructed molecules
    """
    assembler = dm.assemble.assemble_brics_order(
        fragmentlist,
        seen=seen,
        allow_incomplete=False,
        max_n_mols=max_num_action,
    )

    assembled = []
    for candidate in assembler:
        # Second guard on top of `max_n_mols`.
        if len(assembled) > max_num_action:
            break
        assembled.append(candidate)

    if asMols:
        return assembled
    return [dm.to_smiles(x) for x in assembled if x is not None]
def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Takes SMILES string and returns its sanitized version.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        sanitized smiles, or `None` when parsing, sanitization or the conversion
        back to SMILES fails.
    """
    try:
        # Parse without sanitization so that `dm.sanitize_mol` does the sanitizing.
        mol = dm.to_mol(smiles, sanitize=False)
        mol = dm.sanitize_mol(mol, False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        return dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # `Exception` rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        return None
def test_to_smiles():
    """Check the output of `dm.to_smiles` under its main formatting options."""
    aspirin_input = "O=C(C)Oc1ccccc1C(=O)O"
    mol = dm.to_mol(aspirin_input)

    # (ordered, explicit_bonds, explicit_hs) -> expected SMILES
    cases = (
        ((True, False, False), "CC(=O)Oc1ccccc1C(=O)O"),
        ((False, True, False), "C-C(=O)-O-c1:c:c:c:c:c:1-C(=O)-O"),
        ((False, False, True), "[CH3][C](=[O])[O][c]1[cH][cH][cH][cH][c]1[C](=[O])[OH]"),
    )
    for (ordered, explicit_bonds, explicit_hs), expected in cases:
        produced = dm.to_smiles(
            mol,
            isomeric=True,
            ordered=ordered,
            explicit_bonds=explicit_bonds,
            explicit_hs=explicit_hs,
        )
        assert produced == expected

    # A randomized SMILES must parse back to the same molecule.
    mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")
    shuffled_smiles = dm.to_smiles(mol, randomize=True)
    shuffled_mol = dm.to_mol(shuffled_smiles)
    assert dm.to_smiles(shuffled_mol) == dm.to_smiles(mol)
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffold with enforceable group that needs to appear
    in the core, forcing to keep the full side chain if required.

    NOTE(hadim): consider parallelize this (if possible).

    Args:
        mols: List of all molecules. Conformers are removed from these molecules
            during the R-group decomposition.
        enforce_subs: List of substructure to enforce on the scaffold.
        n_atom_cuttoff: Minimum number of atom a core should have.
        additional_templates: Additional template to use to generate scaffolds.
        ignore_non_ring: Whether to ignore atom no in murcko ring system, even if they are in the framework.
        mcs_params: Arguments of MCS algorithm.

    Returns:
        scaffolds: set
            All found scaffolds in the molecules as valid smiles
        scaffold_infos: dict of dict
            Infos on the scaffold mapping, ignoring any side chain that had to be enforced.
            Key corresponds to generic scaffold smiles
            Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
            Values at ['mols'] corresponds to list of molecules matching the scaffold
        scaffold_to_group: dict of list
            Map between each generic scaffold and the R-groups decomposition row
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by their generic Murcko scaffold, allowing
    # side chain that contains cycle (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set()

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except Exception:
            # Best effort: fall back to the non-generic Murcko scaffold.
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(scf)
            atms = [a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()]
            # Remove from the highest index down so earlier removals do not
            # shift the indices still to be removed.
            atms.sort(reverse=True)
            for a in atms:
                rw_scf.RemoveAtom(a)
            # NOTE(review): with `asMols=False` these keys are atom-index tuples,
            # unlike the SMILES keys of the other branch -- confirm this is intended.
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add templates mols if exists:
        for tmp in additional_templates:
            tmp = dm.to_mol(tmp)
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(tmp)
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for scf_key in scfs:
            if scf2infos[scf_key].get("mols"):
                scf2infos[scf_key]["mols"].append(m)
            else:
                scf2infos[scf_key]["mols"] = [m]

    for scf_key in scf2infos:
        # cheat by adding murcko as last mol always
        popout = False
        cluster_mols = scf2infos[scf_key]["mols"]
        if len(cluster_mols) < 2:
            # A second molecule is appended so that FindMCS always gets at least two.
            cluster_mols = cluster_mols + [MurckoScaffold.GetScaffoldForMol(cluster_mols[0])]
            popout = True

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            cluster_mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        # Cores below the atom cutoff are discarded.
        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        scf2infos[scf_key]["smarts"] = dm.to_smarts(mcsM)
        if popout:
            # Drop the scaffold that was appended above.
            cluster_mols = cluster_mols[:-1]

        core_groups = []
        # generate rgroups based on the mcs core
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(cluster_mols):
                analog.RemoveAllConformers()
                res = rg.Add(analog)
                # Molecules for which `Add` returns a negative value are left out.
                if not (res < 0):
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            pass

        cluster_mols = [cluster_mols[i] for i in success_mols]
        scf2groups[scf_key] = core_groups

        for mol, gp in zip(cluster_mols, core_groups):
            core = gp["Core"]
            # Only R-groups attached to non-ring core atoms are candidates for trimming.
            acceptable_groups = [
                a.GetAtomMapNum()
                for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]

            rgroups = [gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp]
            if enforce_subs:
                # R-groups matching an enforced substructure are removed from this list.
                rgroups = [
                    rgp
                    for rgp in rgroups
                    if not any(len(rgp.GetSubstructMatch(frag)) > 0 for frag in enforce_subs)
                ]
            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param), rgroups
                )
            except Exception:
                # Skip molecules whose side chains cannot be trimmed.
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups
def test_to_cxsmiles():
    """`dm.to_smiles(..., cxsmiles=True)` must append the atom-map extension block."""
    mapped_mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
    expected = "OC1=CC2CCCCC2[N:1]=C1 |atomProp:9.molAtomMapNumber.1|"

    assert dm.to_smiles(mapped_mol, cxsmiles=True) == expected
def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Convert a list of mols to a dataframe using each mol properties
    as a column.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. If None, no SMILES column is added.
        mol_column: Name of the column. If not None, a column holding the
            molecule objects is added under that name.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to render the molecule in the dataframe to images.
            If called once, it will be applied for the newly created dataframe with
            mol in it.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.

    Returns:
        A dataframe with the optional SMILES and molecule columns followed by one
        column per molecule property.
    """

    # Init a dataframe
    df = pd.DataFrame()

    # Feed it with smiles
    if smiles_column is not None:
        df[smiles_column] = [dm.to_smiles(mol) for mol in mols]

    # Add a mol column
    if mol_column is not None:
        df[mol_column] = mols

    # Add any other properties present in the molecule
    props = [
        mol.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        )
        for mol in mols
    ]
    props_df = pd.DataFrame(props)

    if smiles_column is not None and smiles_column in props_df.columns:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. The returned dataframe will have two columns with the same name."
        )

    # Concat the df with the properties df
    df = pd.concat([df, props_df], axis=1)

    # Render mol column to images
    if render_df_mol is True and mol_column is not None:
        # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(df)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return df
def test_sanitize_first():
    """`dm.sanitize_first` must skip the invalid entry and return the valid molecule."""
    invalid, valid = "fake_smiles", "CC(=O)Oc1ccccc1C(=O)O"
    candidates = [dm.to_mol(entry) for entry in (invalid, valid)]

    chosen = dm.sanitize_first(candidates)
    assert dm.to_smiles(chosen) == "CC(=O)Oc1ccccc1C(=O)O"