def get_hieriarchical_frags(mol_or_smi):
    """Hierarchically (recursively) split a molecule into fragments.

    Only non-ring bonds are cut, and a cut is only accepted when it yields
    exactly two pieces that each contain at least one ring. Every accepted
    piece is reduced to its Murcko scaffold and split again.

    Takes a mol object or a Smiles string as input.
    Returns the collected scaffolds as a list of Smiles, longest string
    first. The Murcko scaffold of the input itself is part of the list.
    An empty list is returned when that first scaffold cannot be computed.
    """
    # Used as an insertion-ordered set of the scaffold Smiles seen so far.
    result = {}

    def _recursive_split(s, n=0):
        parent = Chem.MolFromSmiles(s)
        if parent is None:
            return
        non_ring_bonds = [
            bond.GetIdx() for bond in parent.GetBonds() if not bond.IsInRing()
        ]
        pieces = []
        for bond_idx in non_ring_bonds:
            cut = Chem.FragmentOnBonds(parent, [bond_idx], addDummies=False)
            try:
                parts = Chem.GetMolFrags(cut, asMols=True)
            except ValueError:
                continue
            # verify the split occurred between two rings
            if len(parts) != 2:
                continue
            if Chem.CalcNumRings(parts[0]) > 0 and Chem.CalcNumRings(parts[1]) > 0:
                pieces.extend(parts)
        for piece in pieces:
            try:
                scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=piece)
            except ValueError:
                continue
            if scaffold in result:
                continue
            result[scaffold] = True
            if "[CH]" in scaffold:
                print(f"{scaffold} ({Chem.MolToSmiles(piece)})")
            _recursive_split(scaffold, n + 1)

    try:
        if isinstance(mol_or_smi, str):
            root = MurckoScaffold.MurckoScaffoldSmiles(smiles=mol_or_smi)
        else:
            root = MurckoScaffold.MurckoScaffoldSmiles(mol=mol_or_smi)
    except ValueError:
        return []
    result[root] = True
    _recursive_split(root)
    return sorted(result, key=len, reverse=True)
def _recursive_split(s, n=0):
    """Split the molecule given as Smiles `s` at its non-ring bonds.

    A cut is accepted only when it yields exactly two pieces that each
    contain at least one ring. The Murcko scaffold Smiles of every accepted
    piece is recorded in the enclosing `result` mapping, and scaffolds not
    seen before are split again. `n` is the recursion depth; it is only
    passed along.
    """
    parent = Chem.MolFromSmiles(s)
    if parent is None:
        return
    non_ring_bonds = [
        bond.GetIdx() for bond in parent.GetBonds() if not bond.IsInRing()
    ]
    pieces = []
    for bond_idx in non_ring_bonds:
        cut = Chem.FragmentOnBonds(parent, [bond_idx], addDummies=False)
        try:
            parts = Chem.GetMolFrags(cut, asMols=True)
        except ValueError:
            continue
        # verify the split occurred between two rings
        if len(parts) != 2:
            continue
        if Chem.CalcNumRings(parts[0]) > 0 and Chem.CalcNumRings(parts[1]) > 0:
            pieces.extend(parts)
    for piece in pieces:
        try:
            scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=piece)
        except ValueError:
            continue
        if scaffold in result:
            continue
        result[scaffold] = True
        if "[CH]" in scaffold:
            print(f"{scaffold} ({Chem.MolToSmiles(piece)})")
        _recursive_split(scaffold, n + 1)
def _recurse(scaf):
    """Recursively collect sub-scaffolds of the scaffold Smiles `scaf`.

    Scaffolds with fewer than three rings are not split. Otherwise each
    non-ring bond is removed in turn from a fresh copy of the molecule. When
    at least two of the resulting pieces contain rings, the Murcko scaffold
    of every piece with more than one ring is appended to the enclosing
    `scaf_list` (if not already present) and split further.
    """
    parent = Chem.MolFromSmiles(scaf)
    base = Chem.RWMol(parent)
    if base.GetRingInfo().NumRings() < 3:
        return
    for bond in base.GetBonds():
        if bond.IsInRing():
            continue
        # Work on a fresh editable copy so each bond is removed independently.
        trimmed = Chem.RWMol(parent)
        trimmed.RemoveBond(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        pieces = Chem.MolToSmiles(trimmed.GetMol()).split(".")
        # NOTE(review): ring counts are recorded only for pieces longer than
        # two characters, yet they are looked up below by position in
        # `pieces`; the two only line up when no short piece is present.
        ring_counts = []
        for piece in pieces:
            if len(piece) <= 2:
                continue
            ring_counts.append(
                Chem.MolFromSmiles(piece).GetRingInfo().NumRings())
        # have we split between two rings?
        pieces_with_rings = sum(1 for count in ring_counts if count > 0)
        if pieces_with_rings < 2:
            continue
        for idx, piece in enumerate(pieces):
            if ring_counts[idx] <= 1:
                continue
            murcko_frag = MurckoScaffold.MurckoScaffoldSmiles(piece)
            if murcko_frag in scaf_list:
                continue
            scaf_list.append(murcko_frag)
            _recurse(murcko_frag)
def get_scaffold(self, smiles):
    """Return the Murcko scaffold Smiles for the Smiles string `smiles`.

    Chirality is kept in the result according to `self.include_chirality`.
    """
    from rdkit.Chem.Scaffolds import MurckoScaffold

    parsed = Chem.MolFromSmiles(smiles)
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=parsed,
        includeChirality=self.include_chirality,
    )
    return scaffold_smiles
def generate_scaffold(smiles, include_chirality=False):
    """Compute the Bemis-Murcko scaffold for a SMILES string.

    `smiles` may also be an RDKit molecule, which is then used as is instead
    of being parsed. Returns the scaffold as SMILES.
    """
    mol = smiles if isinstance(smiles, Chem.Mol) else Chem.MolFromSmiles(smiles)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol, includeChirality=include_chirality)
def generate_scaffold(smiles, include_chirality=False):
    """
    Obtain the Bemis-Murcko scaffold of a molecule given as smiles.

    :param smiles: smiles string of the molecule
    :param include_chirality: forwarded as ``includeChirality`` to RDKit
    :return: smiles of scaffold
    """
    return MurckoScaffold.MurckoScaffoldSmiles(
        smiles=smiles,
        includeChirality=include_chirality,
    )
def get_ordered_scaffold_sets(molecules, include_chirality, log_every_n):
    """Group molecules based on their Bemis-Murcko scaffolds and order these groups
    based on their sizes.

    The order is decided by comparing the size of groups, where groups with a larger
    size are placed before the ones with a smaller size. Groups of equal size are
    ordered by the index of their first member, larger index first.

    Parameters
    ----------
    molecules : list of rdkit.Chem.rdchem.Mol
        Pre-computed RDKit molecule instances. The indices in the returned groups
        refer to positions in this list.
    include_chirality : bool
        Whether to consider chirality in computing scaffolds.
    log_every_n : None or int
        Molecule related computation can take a long time for a large dataset and we want
        to learn the progress of processing. This can be done by printing a message whenever
        a batch of ``log_every_n`` molecules have been processed. If None, no messages will
        be printed.

    Returns
    -------
    scaffold_sets : list
        Each element of the list is a list of int, representing the indices of compounds
        with a same scaffold. Molecules whose scaffold could not be computed are left out.
    """
    if log_every_n is not None:
        print('Start computing Bemis-Murcko scaffolds.')

    scaffolds = defaultdict(list)
    num_molecules = len(molecules)
    for i, mol in enumerate(molecules):
        count_and_log('Computing Bemis-Murcko for compound',
                      i, num_molecules, log_every_n)
        try:
            # For mols that have not been sanitized, we need to compute their ring information
            FastFindRings(mol)
            mol_scaffold = MurckoScaffold.MurckoScaffoldSmiles(
                mol=mol, includeChirality=include_chirality)
        except Exception:
            # Best effort: skip molecules RDKit cannot handle. `Exception` (not a
            # bare except) so that Ctrl-C / SystemExit still stop the run.
            print('Failed to compute the scaffold for molecule {:d} '
                  'and it will be excluded.'.format(i + 1))
            continue
        # Group molecules that have the same scaffold
        scaffolds[mol_scaffold].append(i)

    # Order groups of molecules by first comparing the size of groups
    # and then the index of the first compound in the group.
    return sorted(scaffolds.values(),
                  key=lambda members: (len(members), members[0]),
                  reverse=True)
def generate_scaffold(smiles, include_chirality=False):
    """
    Compute the Bemis-Murcko scaffold for a SMILES string.

    :param smiles: A smiles string.
    :param include_chirality: Whether to include chirality.
    :return: The scaffold as a smiles string.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=parsed,
        includeChirality=include_chirality,
    )
def murcko_clustering(folder):
    """Group the smiles of the loaded data by their Murcko scaffold smiles.

    The rows come from ``read_csvs``; each row's ``"smiles"`` value is added
    to the list kept for its scaffold. The row index is printed every 10000
    rows as a progress indicator.

    :param folder: currently not used, see the note below.
    """
    # NOTE(review): `folder` is ignored and the data location is hard-coded.
    # Left as is because callers may rely on it; confirm and pass `folder`
    # through if that was the intent.
    df = read_csvs(folder="../data")
    murcko_clusters = {}
    for index, row in df.iterrows():
        sm = row["smiles"]
        fw = MurckoScaffold.MurckoScaffoldSmiles(sm)
        # setdefault replaces a bare `except:` that also hid unrelated errors.
        murcko_clusters.setdefault(fw, []).append(sm)
        if index % 10000 == 0:
            print(index)
def generate_scaffold(mol: Union[str, Chem.Mol], include_chirality: bool = False) -> str:
    """
    Computes the Bemis-Murcko scaffold for a SMILES string or an RDKit molecule.

    :param mol: A SMILES or an RDKit molecule.
    :param include_chirality: Whether to include chirality in the computed scaffold.
    :return: The Bemis-Murcko scaffold for the molecule.
    """
    # isinstance instead of an exact type comparison: str subclasses (for
    # example numpy.str_) are parsed too instead of being passed on as `mol=`.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
def pipe_murcko_smiles(stream, summary=None, comp_id="pipe_murcko_smiles"):
    """Calculate Murcko Smiles from the molecules on the stream.

    For each record, the molecule under ``rec["mol"]`` is converted and the
    result is stored in the record under the key ``"Murcko_Smiles"`` before
    the record is yielded. When `summary` is given, ``summary[comp_id]`` is
    set to the number of records processed so far."""
    for processed, rec in enumerate(stream, start=1):
        murcko = MurckoScaffold.MurckoScaffoldSmiles(mol=rec["mol"])
        if summary is not None:
            summary[comp_id] = processed
        rec["Murcko_Smiles"] = murcko
        yield rec
def get_scaffold(self, mol):
    """
    Get the Murcko scaffold of a molecule as SMILES.

    Murcko scaffolds are described in DOI: 10.1021/jm9602928. They are
    essentially that part of the molecule consisting of rings and the
    linker atoms between them.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to reduce to its scaffold. Chirality handling follows
        ``self.include_chirality``.
    """
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol,
        includeChirality=self.include_chirality,
    )
    return scaffold_smiles
def generate_scaffold(mol, include_chirality=True):
    """
    Compute the Bemis-Murcko scaffold for a SMILES string or an RDKit molecule.

    Implementation copied from https://github.com/chemprop/chemprop.

    :param mol: A smiles string or an RDKit molecule.
    :param include_chirality: Whether to include chirality.
    :return: The scaffold as a smiles string.
    """
    # isinstance instead of an exact type comparison: str subclasses (for
    # example numpy.str_) are parsed too instead of being passed on as `mol=`.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol, includeChirality=include_chirality)
def generate_scaffold(mol: Union[str, Chem.Mol], include_chirality: bool = False) -> str:
    """
    Compute the Bemis-Murcko scaffold for a SMILES string or an RDKit molecule.

    :param mol: A smiles string or an RDKit molecule.
    :param include_chirality: Whether to include chirality.
    :return: The scaffold as a smiles string.
    """
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol,
        includeChirality=include_chirality,
    )
def _generate_scaffold(self, smiles, include_chirality=False):
    """
    Obtain the Bemis-Murcko scaffold of a molecule given as SMILES.

    Args:
        smiles (str): SMILES string
        include_chirality (bool): Whether taking chirality into
            consideration when generating MurckoScaffolds.

    Returns:
        SMILES of scaffold
    """
    return MurckoScaffold.MurckoScaffoldSmiles(
        smiles=smiles,
        includeChirality=include_chirality,
    )
def generate_scaffold(smiles, include_chirality=False):
    """
    Obtain the Bemis-Murcko scaffold of a molecule given as smiles.

    Args:
        smiles: smiles string of the molecule.
        include_chirality: forwarded as ``includeChirality`` to RDKit.

    Return:
        scaffold: the scaffold of the given smiles.
    """
    return MurckoScaffold.MurckoScaffoldSmiles(
        smiles=smiles,
        includeChirality=include_chirality,
    )
def generate_scaffold(mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = False) -> str:
    """
    Computes the Bemis-Murcko scaffold for a molecule.

    :param mol: A SMILES, an RDKit molecule, or a tuple of RDKit molecules
                (only the first element of a tuple is used).
    :param include_chirality: Whether to include chirality in the computed scaffold.
    :return: The Bemis-Murcko scaffold for the molecule.
    """
    target = mol
    if isinstance(target, str):
        target = make_mol(target, keep_h=False, add_h=False)
    # Checked separately (not elif) so a tuple is unpacked whatever its origin.
    if isinstance(target, tuple):
        target = target[0]
    return MurckoScaffold.MurckoScaffoldSmiles(mol=target, includeChirality=include_chirality)
def create_scaffold_split(df, fold_seed, frac, entity):
    """Split `df` into train/valid/test so that no Murcko scaffold is shared.

    Rows are grouped by the Murcko scaffold (chirality ignored) of the SMILES
    in column `entity`. Groups are visited from largest to smallest; a group
    goes to train if it still fits, otherwise to valid if it fits there,
    otherwise to test.

    Args:
        df: dataframe holding the data.
        fold_seed: accepted but not used; the split involves no randomness.
        frac: sequence whose first two entries are the train and valid
            fractions; test receives the rest.
        entity: name of the column containing the SMILES strings.

    Returns:
        dict with keys 'train', 'valid' and 'test', each a dataframe with a
        reset index.

    Raises:
        ImportError: if rdkit cannot be imported.
    """
    # reference: https://github.com/chemprop/chemprop/blob/master/chemprop/data/scaffold.py
    try:
        from rdkit import Chem
        from rdkit.Chem.Scaffolds import MurckoScaffold
    except ImportError as err:
        # Only a failed import is turned into the install hint; the original
        # cause stays attached for debugging.
        raise ImportError(
            "Please install rdkit by 'conda install -c conda-forge rdkit'! "
        ) from err
    from tqdm import tqdm
    from collections import defaultdict

    s = df[entity].values
    scaffolds = defaultdict(set)
    for i, smiles in tqdm(enumerate(s), total=len(s)):
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(
            mol=Chem.MolFromSmiles(smiles), includeChirality=False)
        scaffolds[scaffold].add(i)

    # Largest scaffold groups first.
    index_sets = sorted(scaffolds.values(), key=len, reverse=True)

    train, val, test = [], [], []
    train_size = int(len(df) * frac[0])
    val_size = int(len(df) * frac[1])

    for index_set in index_sets:
        if len(train) + len(index_set) <= train_size:
            train += index_set
        elif len(val) + len(index_set) <= val_size:
            val += index_set
        else:
            test += index_set

    return {
        'train': df.iloc[train].reset_index(drop=True),
        'valid': df.iloc[val].reset_index(drop=True),
        'test': df.iloc[test].reset_index(drop=True)
    }
from tqdm import tqdm
from frag.network.models import Attr

# Script entry point: read molecules from a comma-delimited file (name in
# column 0, SMILES in column 1), decorate each molecule and its Murcko
# scaffold, and write one Attr line for the molecule plus one per decoration.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'Decorate a library of molecules for insertion to the database.')
    parser.add_argument('--input_smi')
    parser.add_argument('--output_attr')
    args = parser.parse_args()
    # The context manager closes (and flushes) the output file even when
    # processing stops early; previously the handle was never closed.
    with open(args.output_attr, "w") as out_smi:
        for mol in tqdm(
                Chem.SmilesMolSupplier(args.input_smi,
                                       delimiter=',',
                                       smilesColumn=1,
                                       nameColumn=0)):
            this_smi = Chem.MolToSmiles(mol, isomericSmiles=True)
            # Do this on original and on Murcko Scaffold
            new_smis = decorate_smi(this_smi)
            new_murck = decorate_smi(
                MurckoScaffold.MurckoScaffoldSmiles(this_smi))
            # mol_frags = get_fragments(Chem.MolFromSmiles(this_smi),iso_labels=False)
            # new_smis.extend([x.replace("Xe","At") for x in mol_frags])
            new_smis.extend(new_murck)
            # Drop duplicate decorations.
            new_smis = list(set(new_smis))
            name = mol.GetProp("_Name")
            new_attr = Attr(this_smi, ["EM", name])
            out_smi.write(str(new_attr) + "\n")
            for i, smi in enumerate(new_smis):
                new_attr = Attr(smi, ["EM", name + "_" + str(i)])
                out_smi.write(str(new_attr) + "\n")
# molid -> [input smiles, canonical smiles, Murcko smiles, generic Murcko smiles]
mol_dict = {}
# generic Murcko smiles -> molids of the molecules that reduce to it
ms_dict = {}
for line in data:
    # each record is whitespace separated: smiles first, then the molid
    fields = line.split()
    molid = fields[1]
    smi = fields[0]
    # get RDKit canonical smiles
    can = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
    # get canonical murcko smiles from canonical smiles
    raw_murcko = ms.MurckoScaffoldSmiles(can)
    murcko_mol = Chem.MolFromSmiles(raw_murcko)
    murcko = Chem.MolToSmiles(murcko_mol)
    # get generic murcko smiles
    generic_mol = ms.MakeScaffoldGeneric(Chem.MolFromSmiles(murcko))
    gen_murcko = Chem.MolToSmiles(generic_mol)
    # for each molid key, add the smi, murcko mol, and murcko smiles
    mol_dict[molid] = [smi, can, murcko, gen_murcko]
    # bin the mols into the different murcko scaffolds observed
    ms_dict.setdefault(gen_murcko, []).append(molid)
def get_scaffold(self, mol):
    """Return the Murcko scaffold Smiles of the RDKit molecule `mol`.

    Chirality is kept in the result according to `self.include_chirality`.
    """
    from rdkit.Chem.Scaffolds import MurckoScaffold

    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol,
        includeChirality=self.include_chirality,
    )
    return scaffold_smiles
def generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold smiles of the molecule given as `smiles`.

    `include_chirality` is forwarded to RDKit as ``includeChirality``.
    """
    return MurckoScaffold.MurckoScaffoldSmiles(
        smiles=smiles,
        includeChirality=include_chirality,
    )
def scaffold_split(dataset, args, return_smiles=False):
    """Split `dataset` into train/valid/test subsets by Murcko scaffold.

    The smiles are read from ``smiles.csv`` (first column, no header) in
    ``args.data_dir``. Entries are grouped by their chirality-aware Murcko
    scaffold, the groups are ordered from largest to smallest, and each group
    is assigned as a whole: to train while the train cutoff is not exceeded,
    otherwise to valid while the combined train+valid cutoff is not exceeded,
    otherwise to test.

    `args` must provide ``frac_train``, ``frac_valid`` and ``frac_test``
    (summing to 1) and ``data_dir``.

    Returns the three ``Subset`` objects, followed by a tuple of the matching
    smiles lists when `return_smiles` is true.
    """
    fraction_sum = args.frac_train + args.frac_valid + args.frac_test
    np.testing.assert_almost_equal(fraction_sum, 1.0)

    smiles_path = os.path.join(args.data_dir, "smiles.csv")
    raw_smiles = pd.read_csv(smiles_path, header=None)[0].tolist()

    # All-True mask with one entry per dataset item.
    keep_mask = np.ones(len(dataset)) == 1
    smiles_list = list(compress(enumerate(raw_smiles), keep_mask))

    # create dict of the form {scaffold_i: [idx1, idx....]}
    members_by_scaffold = {}
    for i, smiles in smiles_list:
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(
            smiles=smiles, includeChirality=True)
        members_by_scaffold.setdefault(scaffold, []).append(i)

    # sort from largest to smallest sets
    all_scaffold_sets = sorted(
        (sorted(members) for members in members_by_scaffold.values()),
        key=lambda members: (len(members), members[0]),
        reverse=True)

    # get train, valid test indices
    n_total = len(smiles_list)
    train_cutoff = args.frac_train * n_total
    valid_cutoff = (args.frac_train + args.frac_valid) * n_total
    train_idx, valid_idx, test_idx = [], [], []
    for scaffold_set in all_scaffold_sets:
        size = len(scaffold_set)
        if len(train_idx) + size <= train_cutoff:
            train_idx.extend(scaffold_set)
        elif len(train_idx) + len(valid_idx) + size <= valid_cutoff:
            valid_idx.extend(scaffold_set)
        else:
            test_idx.extend(scaffold_set)

    assert len(set(train_idx) & set(valid_idx)) == 0
    assert len(set(test_idx) & set(valid_idx)) == 0

    train_dataset = Subset(dataset, train_idx)
    valid_dataset = Subset(dataset, valid_idx)
    test_dataset = Subset(dataset, test_idx)

    if not return_smiles:
        return train_dataset, valid_dataset, test_dataset

    train_smiles = [smiles_list[i][1] for i in train_idx]
    valid_smiles = [smiles_list[i][1] for i in valid_idx]
    test_smiles = [smiles_list[i][1] for i in test_idx]
    return train_dataset, valid_dataset, test_dataset, (
        train_smiles, valid_smiles, test_smiles)
def generate_scaffold(mol: Union[str, Chem.Mol], include_chirality: bool = False) -> str:
    """
    Compute the Bemis-Murcko scaffold for a SMILES string or an RDKit molecule.

    :param mol: A SMILES string or an RDKit molecule.
    :param include_chirality: Whether to include chirality in the scaffold.
    :return: The scaffold as a SMILES string.
    """
    # isinstance instead of an exact type comparison: str subclasses (for
    # example numpy.str_) are parsed too instead of being passed on as `mol=`.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
def get_scaffold(self, mol):
    """Return the Murcko scaffold Smiles of the RDKit molecule `mol`.

    Chirality is kept in the result according to `self.include_chirality`.
    """
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol,
        includeChirality=self.include_chirality,
    )
    return scaffold_smiles
def create_scaffold_split(df, seed, frac, entity):
    """Randomised scaffold split of `df` into train/valid/test.

    Rows are grouped by the Murcko scaffold (chirality ignored) of the SMILES
    in column `entity`; rows for which the scaffold cannot be computed are
    reported through ``print_sys`` and left out. Groups larger than half of
    the valid or test size are shuffled and placed first, followed by the
    shuffled remaining groups. Each group is then assigned as a whole: to
    train if it still fits, otherwise to valid if it fits (or always to valid
    when ``frac[2] == 0``), otherwise to test.

    Args:
        df: dataframe holding the data.
        seed: seed for the shuffling of the scaffold groups.
        frac: sequence of three fractions (train, valid, test).
        entity: name of the column containing the SMILES strings.

    Returns:
        dict with keys 'train', 'valid' and 'test', each a dataframe with a
        reset index.

    Raises:
        ImportError: if rdkit cannot be imported.
    """
    # reference: https://github.com/chemprop/chemprop/blob/master/chemprop/data/scaffold.py
    try:
        from rdkit import Chem
        from rdkit.Chem.Scaffolds import MurckoScaffold
        from rdkit import RDLogger
    except ImportError as err:
        raise ImportError(
            "Please install rdkit by 'conda install -c conda-forge rdkit'! "
        ) from err
    RDLogger.DisableLog('rdApp.*')

    from tqdm import tqdm
    from random import Random
    from collections import defaultdict

    # Dedicated generator; named `rng` so the `random` module name is not shadowed.
    rng = Random(seed)

    s = df[entity].values
    scaffolds = defaultdict(set)
    error_smiles = 0
    for i, smiles in tqdm(enumerate(s), total=len(s)):
        try:
            scaffold = MurckoScaffold.MurckoScaffoldSmiles(
                mol=Chem.MolFromSmiles(smiles), includeChirality=False)
        except Exception:
            # f-string so that non-string entries (e.g. NaN) are reported
            # instead of raising a TypeError inside the handler.
            print_sys(f'{smiles} returns RDKit error and is thus omitted...')
            error_smiles += 1
            continue
        scaffolds[scaffold].add(i)

    train, val, test = [], [], []
    usable = len(df) - error_smiles
    train_size = int(usable * frac[0])
    val_size = int(usable * frac[1])
    test_size = usable - train_size - val_size

    big_index_sets = []
    small_index_sets = []
    for index_set in scaffolds.values():
        if len(index_set) > val_size / 2 or len(index_set) > test_size / 2:
            big_index_sets.append(index_set)
        else:
            small_index_sets.append(index_set)
    rng.shuffle(big_index_sets)
    rng.shuffle(small_index_sets)
    index_sets = big_index_sets + small_index_sets

    has_test = frac[2] != 0
    for index_set in index_sets:
        if len(train) + len(index_set) <= train_size:
            train += index_set
        elif not has_test or len(val) + len(index_set) <= val_size:
            val += index_set
        else:
            test += index_set

    return {
        'train': df.iloc[train].reset_index(drop=True),
        'valid': df.iloc[val].reset_index(drop=True),
        'test': df.iloc[test].reset_index(drop=True)
    }
def generate_scaffold(mol):
    """Return the Murcko scaffold SMILES (chirality ignored) of a SMILES string.

    Despite its name, `mol` is parsed as a SMILES string.
    """
    parsed = Chem.MolFromSmiles(mol)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=parsed,
        includeChirality=False,
    )
def calc_murcko_scaf(mol):
    """Calculate the Murcko scaffold from a molecule and return it as Smiles."""
    scaffold_smiles = MurckoScaffold.MurckoScaffoldSmiles(mol=mol)
    return scaffold_smiles
#name: Murcko Scaffolds
#description: Generation of Murcko scaffolds from a molecule
#help-url: /help/domains/chem/functions/murcko-scaffolds.md
#language: python
#sample: chem/smiles.csv
#tags: demo, chem, rdkit
#input: dataframe data [Input data table]
#input: column smiles {type:categorical; semType: Molecule} [Molecules, in SMILES format]
#output: dataframe scaffolds {action:join(data); semType: Molecule} [Murcko scaffolds, in SMILES format]

import numpy as np
# `pd` is used below for the output frame and was not imported here.
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

# `smiles` arrives as the column name; replace it by the column itself.
smiles = data[smiles]
length = len(smiles)
# One slot per input row; rows that cannot be parsed keep None.
scaffolds = np.full(length, None, dtype=object)
# Iterate by position so the result does not depend on the index labels.
for n, smi in enumerate(smiles):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue
    scaffolds[n] = MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

# Convert to Pandas DataFrame
scaffolds = pd.DataFrame(scaffolds, columns=['scaffolds'])