def canonicalize(smi_list, showprogress=False):
    """Canonicalise, de-duplicate and filter a collection of SMILES strings.

    Each input string is parsed with ``MolFromSmiles`` and written back with
    ``MolToSmiles``; inputs for which ``MolFromSmiles`` returns ``None`` are
    skipped. Duplicate canonical strings are removed, keeping the position of
    the first occurrence so the result order is reproducible between runs.
    A molecule is kept only if it has more than 17 heavy atoms and a Crippen
    logP of at most 5.

    Args:
        smi_list: Iterable of SMILES strings.
        showprogress: When true, print status messages and wrap both passes
            in ``tqdm`` progress bars.

    Returns:
        List of canonical SMILES strings that pass the filter.
    """
    if showprogress:
        print('Canonicalising mols')
    # Dict keys give order-preserving de-duplication; a plain set would make
    # the output order vary with string hash randomisation.
    unique_smiles = {}
    for smi in (tqdm(smi_list) if showprogress else smi_list):
        mol = MolFromSmiles(smi)
        if mol is not None:
            unique_smiles[MolToSmiles(mol)] = None
    mol_list = list(unique_smiles)

    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(mol_list)))
        print('Filtering by n_heavy and logP:')
    final_list = []
    for smi in (tqdm(mol_list) if showprogress else mol_list):
        mol = MolFromSmiles(smi)
        # Short-circuit: logP is only computed for molecules that already
        # pass the heavy-atom criterion.
        if mol.GetNumHeavyAtoms() > 17 and Crippen.MolLogP(mol) <= 5:
            final_list.append(smi)
    return final_list
def canonicalize_and_filter(smi_list, showprogress=False):
    """Return the unique canonical SMILES from ``smi_list`` that pass a filter.

    Each input SMILES string is parsed with ``MolFromSmiles`` and converted
    to a canonical string with ``MolToSmiles``; inputs for which
    ``MolFromSmiles`` returns ``None`` are skipped. Uniqueness is decided on
    the canonical strings, keeping the position of the first occurrence so
    the result order is reproducible. A molecule is kept only if it has more
    than 17 heavy atoms and a Crippen logP of at most 5.

    NOTE(review): an earlier description of this function spoke of dropping
    molecules with more than 17 heavy atoms, whereas the code keeps them.
    The code's behaviour is preserved here; confirm which is intended.

    Args:
        smi_list: Iterable of SMILES strings.
        showprogress: When true, print status messages and wrap both passes
            in ``tqdm`` progress bars.

    Returns:
        List of canonical SMILES strings that pass the filter.
    """
    if showprogress:
        print('Canonicalising mols')
    # Dict keys give order-preserving de-duplication of the canonical strings.
    unique_smiles = {}
    for smi in (tqdm(smi_list) if showprogress else smi_list):
        mol = MolFromSmiles(smi)
        if mol is not None:
            # Always store the canonical SMILES string, never the Mol object:
            # the second pass re-parses these entries with MolFromSmiles and
            # de-duplication must compare strings.
            unique_smiles[MolToSmiles(mol)] = None
    mol_list = list(unique_smiles)

    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(mol_list)))
        print('Filtering by n_heavy and logP:')
    final_list = []
    for smi in (tqdm(mol_list) if showprogress else mol_list):
        mol = MolFromSmiles(smi)
        # Short-circuit: logP is only computed for molecules that already
        # pass the heavy-atom criterion.
        if mol.GetNumHeavyAtoms() > 17 and Crippen.MolLogP(mol) <= 5:
            final_list.append(smi)
    return final_list
def worker(line):
    """Build an annotated output record for one whitespace-separated line.

    The first two whitespace-separated fields of ``line`` are taken as a
    SMILES string and an identifier. The returned record has the form
    ``"<smiles> <cid> H<nn><S><ddd>\\n"`` where ``nn`` is the heavy-atom count
    (capped at 99, zero-padded to two digits), ``S`` is ``M`` for a negative
    logP and ``P`` otherwise, and ``ddd`` is the absolute value of
    ``scale_logp_value(logp)`` zero-padded to three digits.

    Args:
        line: One input line; anything after the first two fields is ignored.

    Returns:
        The formatted record, or ``None`` when the line has fewer than two
        fields or the SMILES cannot be parsed.
    """
    fields = line.split()
    if len(fields) < 2:
        # Blank or truncated line: skip it the same way an unparseable
        # SMILES is skipped, instead of failing on tuple unpacking.
        return None
    smiles, cid = fields[:2]

    mol = MolFromSmiles(smiles)
    if not mol:
        return None

    if '.' in smiles:
        # Multi-fragment SMILES: pass through the module-level ``remover``
        # (presumably a salt stripper -- confirm where it is defined).
        mol = remover.StripMol(mol)

    logp = MolLogP(mol)
    # The heavy-atom field is two digits wide, so clamp the count to 99.
    num_heavy_atoms = min(mol.GetNumHeavyAtoms(), 99)
    sign = 'M' if logp < 0.0 else 'P'
    return f'{smiles} {cid} H{num_heavy_atoms:02}{sign}{abs(scale_logp_value(logp)):03}\n'
def dataset_distribution(f_paths):
    """Plot a histogram of heavy-atom counts over the matched CSV datasets.

    Every file matching the glob pattern is read with ``pd.read_csv``; each
    entry of its ``"SMILES"`` column is parsed with ``MolFromSmiles`` and its
    heavy-atom count is collected. Entries for which ``MolFromSmiles``
    returns ``None`` are skipped. One histogram over the counts from all
    files is drawn with ``plt`` and saved as ``<base>_mmff_dist.png`` in the
    directory of the last matched file, where ``<base>`` is that file's name
    up to its first dot.

    The histogram is drawn with module-level ``plt`` calls; this function
    does not create or close a figure itself.

    Args:
        f_paths: Glob pattern selecting the CSV files to read.

    Raises:
        ValueError: If the pattern matches no file, or no heavy-atom count
            could be collected from the matched files.
    """
    matched_paths = glob(f_paths)
    if not matched_paths:
        raise ValueError(f"no files match pattern {f_paths!r}")

    n_heavy = []
    for f_path in matched_paths:
        dataset = pd.read_csv(f_path)
        for smiles in dataset["SMILES"]:
            mol = MolFromSmiles(smiles)
            if mol is None:
                continue
            n_heavy.append(mol.GetNumHeavyAtoms())
    if not n_heavy:
        raise ValueError(f"no parseable SMILES found in files matching {f_paths!r}")

    # The output file is named after, and written next to, the last match.
    last_path = matched_paths[-1]
    f_dir = osp.dirname(last_path)
    f_base = osp.basename(last_path).split(".")[0]

    # Bin edges run from min to max + 1 inclusive so that every integer count
    # gets its own unit-width bin; stopping the edges at max would merge the
    # two largest counts into the closed last bin.
    plt.hist(n_heavy, bins=range(min(n_heavy), max(n_heavy) + 2))
    plt.xlabel("num of heavy atoms")
    plt.ylabel("count")
    plt.title("dd_sol")
    plt.savefig(osp.join(f_dir, f_base + "_mmff_dist.png"))