def grad(self, mol):
    """ Calculate the pseudo gradient with respect to the atoms.

    The pseudo gradient is the number of times the atom set that particular
    bit.

    Args:
        mol (skchem.Mol):
            The molecule for which to calculate the pseudo gradient.

    Returns:
        pandas.DataFrame:
            Dataframe of pseudogradients, with columns corresponding to
            atoms, and rows corresponding to features of the fingerprint.
            Values are integer counts, or 0/1 when ``self.as_bits`` is set.
    """

    cols = pd.Index(list(range(len(mol.atoms))), name='atom_idx')
    dist = GetDistanceMatrix(mol)

    # RDKit fills this dict in place: bit -> iterable of (atom_idx, radius).
    info = {}
    fp_kwargs = dict(useFeatures=self.use_features,
                     useBondTypes=self.use_bond_types,
                     useChirality=self.use_chirality,
                     bitInfo=info)

    # `<= 0` (not `< 0`) so that the unfolded/folded decision is the same one
    # `_transform_mol` makes; n_feats == 0 cannot be hashed into zero bits.
    if self.n_feats <= 0:
        res = GetMorganFingerprint(mol, self.radius,
                                   **fp_kwargs).GetNonzeroElements()
        idx_list = list(res.keys())
        idx = pd.Index(idx_list, name='features')
        # Constant-time bit -> row lookup instead of list.index() per bit.
        row_of = {bit: row for row, bit in enumerate(idx_list)}
    else:
        GetHashedMorganFingerprint(mol, self.radius, nBits=self.n_feats,
                                   **fp_kwargs)
        idx = pd.Index(range(self.n_feats), name='features')
        # Folded bits are already valid row positions.
        row_of = None

    grad = np.zeros((len(idx), len(cols)))
    for bit, environments in info.items():
        row = bit if row_of is None else row_of[bit]
        for atom_idx, radius in environments:
            # Every atom within `radius` bonds of the centre atom contributed
            # to this bit.
            grad[row] += (dist <= radius)[atom_idx]

    grad = pd.DataFrame(grad, index=idx, columns=cols)

    if self.as_bits:
        grad = (grad > 0)

    return grad.astype(int)
def get_sum_stats(smi_list, smiles):
    """Summarise Tanimoto similarity between one SMILES and a list of SMILES.

    Radius-2 Morgan fingerprints are computed for ``smiles`` and for every
    entry of ``smi_list``; each list entry is compared to ``smiles``.

    Returns:
        tuple: ``(count, minimum, maximum, mean)`` of the similarities.
    """
    query_fp = GetMorganFingerprint(Chem.MolFromSmiles(smiles), 2)

    similarities = [
        DataStructs.TanimotoSimilarity(
            GetMorganFingerprint(Chem.MolFromSmiles(candidate), 2), query_fp)
        for candidate in smi_list
    ]

    count = len(similarities)
    mean = sum(similarities) / float(count)
    return count, min(similarities), max(similarities), mean
def _transform_mol(self, mol):
    """Private method to transform a skchem molecule.

    Use `transform` for the public method, which genericizes the argument to
    iterables of mols.

    Args:
        mol (skchem.Mol): Molecule to calculate fingerprint for.

    Returns:
        np.array or dict:
            Fingerprint as an array (or a dict if sparse).
    """
    options = dict(useFeatures=self.use_features,
                   useBondTypes=self.use_bond_types,
                   useChirality=self.use_chirality)
    folded = self.n_feats > 0

    # Unfolded fingerprint: sparse mapping of feature id -> count (or 0/1).
    if not folded:
        sparse = GetMorganFingerprint(mol, self.radius, **options)
        counts = sparse.GetNonzeroElements()
        if self.as_bits:
            return {key: int(value > 0) for key, value in counts.items()}
        return counts

    # Folded bit vector: converted through RDKit's numpy bridge.
    if self.as_bits:
        bit_vect = GetMorganFingerprintAsBitVect(mol, self.radius,
                                                 nBits=self.n_feats,
                                                 **options)
        bits = np.array(0)
        ConvertToNumpyArray(bit_vect, bits)
        return bits.astype(np.uint8)

    # Folded count vector.
    hashed = GetHashedMorganFingerprint(mol, self.radius,
                                        nBits=self.n_feats, **options)
    return np.array(list(hashed))
def GetUnfoldedCircularFragment(mol, minRadius=1, maxRadius=2,
                                maxFragment=True, disposed=True):
    """Collect the circular (Morgan) fragments of a molecule without folding.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be calculated
    minRadius : int, optional
        Minimum fragment radius, forwarded to the bit-info post-processing,
        by default 1
    maxRadius : int, optional
        Radius used for the Morgan fingerprint, by default 2
    maxFragment : bool, optional
        Forwarded to the bit-info post-processing; whether only the maximum
        fragment at a center atom is kept, by default True
    disposed : bool, optional
        Whether to post-process the raw bit info, by default True

    Returns
    -------
    fragments
        The result of ``_DisposeCircularBitInfo`` when ``disposed`` is true,
        otherwise the raw ``bitInfo`` dict filled in by RDKit.
    """
    raw_bit_info = {}
    # Called only for its side effect of populating `raw_bit_info`.
    GetMorganFingerprint(mol, radius=maxRadius, bitInfo=raw_bit_info)

    if not disposed:
        return raw_bit_info
    return _DisposeCircularBitInfo(raw_bit_info, minRadius, maxFragment)
def _morgan(self, molecules): if self.vector == 'int': from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint self.fps_ = [ GetMorganFingerprint(self._sanitary(mol), self.radius, **self.kwargs) for mol in molecules ] # get nonzero elements as a dictionary for each molecule dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps_] # pairScores = [] # for fp in dict_nonzero: # pairScores += list(fp) data = pd.DataFrame( dict_nonzero) #, columns=list(set(pairScores))) data.fillna(0, inplace=True) return data elif self.vector == 'bit': from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect self.fps_ = [ GetMorganFingerprintAsBitVect(self._sanitary(mol), self.radius, nBits=self.n_bits, **self.kwargs) for mol in molecules ] data = np.array(self.fps_) data = pd.DataFrame(data) return data
def test__string_output_format(self) -> None:
    """Check the ``sparse_string`` output against RDKit's Morgan fingerprint.

    Every non-zero element of the reference fingerprint must appear in the
    parsed string with the same value.
    """
    # Local import keeps this block self-contained.
    import ast

    fprintr = CircularFPFeaturizer(output_format="sparse_string")
    fps_str = fprintr.fit_transform(self.smis)  # using SMILES

    # Output shape
    self.assertEqual(self.n_mols, len(fps_str))

    # Fingerprint matrix structure
    for i, mol in enumerate(self.mols):
        fps_ref = GetMorganFingerprint(mol,
                                       radius=fprintr.radius,
                                       useFeatures=fprintr.use_features_,
                                       useChirality=fprintr.use_chirality,
                                       useCounts=fprintr.use_counts_)

        # Wrapping the string in braces turns it into a dict literal.
        # literal_eval accepts only literals, so nothing in the string can
        # execute. NOTE(review): presumes the entries are plain
        # "key: value" numeric pairs -- confirm against the featurizer.
        fp_i_from_str = ast.literal_eval("{" + fps_str[i] + "}")

        for hash_value, cnt in fps_ref.GetNonzeroElements().items():
            self.assertEqual(fp_i_from_str[hash_value], cnt)
def transform_mol(self, molecule: Chem.rdchem.Mol) -> Tuple[np.ndarray, bool]:
    """Compute the Morgan fingerprint of ``molecule`` as a numpy array.

    Returns:
        The converted fingerprint and the constant flag ``True``.
    """
    # Read straight from the instance dict with a False fallback.
    # NOTE(review): presumably for instances created before the attribute
    # existed -- confirm.
    chirality_flag = self.__dict__.get('use_chirality', False)

    sparse_fp = GetMorganFingerprint(
        molecule,
        radius=self.radius,
        useFeatures=self.use_features,
        useCounts=self.use_counts,
        useChirality=chirality_flag,
        **self.fingerprint_extra_args,
    )

    nonzero_items = sparse_fp.GetNonzeroElements().items()
    converted = rdkit_sparse_array_to_np(nonzero_items,
                                         use_counts=self.use_counts,
                                         fp_size=self.fp_size)
    return converted, True
def diverse_mols_indexes(mol_list, n_pick, radius=4, seed=42):
    """Pick ``n_pick`` indexes from ``mol_list`` with RDKit's MaxMin picker.

    The distance between two molecules is one minus the Dice similarity of
    their Morgan fingerprints computed at ``radius``.

    Returns:
        The index collection returned by ``MaxMinPicker.LazyPick``.
    """
    fingerprints = [GetMorganFingerprint(mol, radius) for mol in mol_list]

    def dice_distance(first, second):
        similarity = DataStructs.DiceSimilarity(fingerprints[first],
                                                fingerprints[second])
        return 1 - similarity

    return MaxMinPicker().LazyPick(dice_distance, len(fingerprints), n_pick,
                                   seed=seed)
def test__hashed_counting_fingerprints__fcfp(self) -> None:
    """FCFP output must match RDKit's counts for SMILES and Mol inputs."""
    featurizer = CircularFPFeaturizer(fp_type="FCFP")
    from_smiles = featurizer.fit_transform(self.smis)  # using SMILES
    from_mols = featurizer.fit_transform(self.mols)  # using Mol objects

    # Output shape: one row per molecule, one column per possible hash value
    for matrix in (from_smiles, from_mols):
        self.assertEqual(matrix.shape[0], self.n_mols)
        self.assertEqual(matrix.shape[1], featurizer.max_hash_value_)

    # Fingerprint matrix structure
    for row, mol in enumerate(self.mols):
        reference = GetMorganFingerprint(
            mol,
            radius=featurizer.radius,
            useFeatures=featurizer.use_features_,
            useChirality=featurizer.use_chirality,
            useCounts=featurizer.use_counts_)

        for feature_id, count in reference.GetNonzeroElements().items():
            self.assertEqual(from_smiles[row, feature_id], count)
            self.assertEqual(from_mols[row, feature_id], count)
def test__hashed_binary_fingerprints__ecfp(self) -> None:
    """Binary output must set exactly RDKit's non-zero features."""
    featurizer = CircularFPFeaturizer(fp_mode="binary")
    from_smiles = featurizer.fit_transform(self.smis)  # using SMILES
    from_mols = featurizer.fit_transform(self.mols)  # using Mol objects

    # Output shape: one row per molecule, one column per possible hash value
    for matrix in (from_smiles, from_mols):
        self.assertEqual(matrix.shape[0], self.n_mols)
        self.assertEqual(matrix.shape[1], featurizer.max_hash_value_)

    # Fingerprint matrix structure
    for row, mol in enumerate(self.mols):
        reference = GetMorganFingerprint(
            mol,
            radius=featurizer.radius,
            useFeatures=featurizer.use_features_,
            useChirality=featurizer.use_chirality,
            useCounts=featurizer.use_counts_)
        expected_features = reference.GetNonzeroElements()

        for feature_id in expected_features:
            self.assertTrue(from_smiles[row, feature_id])
            self.assertTrue(from_mols[row, feature_id])

        # No other elements are set
        n_expected = len(expected_features)
        self.assertEqual(np.sum(from_smiles[row, :].data), n_expected)
        self.assertEqual(np.sum(from_mols[row, :].data), n_expected)
def pick_subset(mols, num=5, radius=3, seed=-1):
    """
    Pick a disparate subset of molecules using Morgan Fingerprints.

    https://towardsdatascience.com/a-practical-introduction-to-the-use-of-molecular-fingerprints-in-drug-discovery-7f15021be2b1

    :param mols: an iterable of molecules
    :param num: number of molecules to pick
    :param radius: radius passed to the Morgan fingerprint
    :param seed: seed forwarded to the MaxMin picker
    :return: list of integer locations of the subset of molecules
    """
    fingerprints = [GetMorganFingerprint(mol, radius) for mol in mols]

    def dice_distance(first, second):
        # Distance is the complement of the Dice similarity.
        similarity = DataStructs.DiceSimilarity(fingerprints[first],
                                                fingerprints[second])
        return 1 - similarity

    picked = MaxMinPicker().LazyPick(dice_distance, len(fingerprints), num,
                                     seed=seed)
    return list(picked)
def main(prior_name, name, max_samples, diversity_picker, oracle, w_min):
    """Draw samples from the search model and dump them for later steps.

    Two models are built from the ``prior_name`` JSON; the second has the
    weights stored under ``results/<name>/weights.pth`` loaded into it.
    Samples and weights come from ``get_samples``. When ``diversity_picker``
    is positive and smaller than the pool, a MaxMin-diverse subset of that
    size is kept. The samples are then pickled under ``results/<name>/``.

    Args:
        prior_name: model specification passed to ``model_from_json``.
        name: experiment name (sub-directory of ``results``).
        max_samples: upper bound on the number of samples drawn.
        diversity_picker: number of diverse samples to keep; no subsampling
            unless ``0 < diversity_picker < pool size``.
        oracle: oracle name; see the note on the dump branch below.
        w_min: forwarded to ``get_samples``.
    """

    def _dump(obj, dump_path):
        # Context manager so the handle is flushed and closed.
        with open(dump_path, 'wb') as handle:
            pickle.dump(obj, handle)

    prior_model = model_from_json(prior_name)

    # We start by creating another prior instance, then replace it with the
    # actual weights
    # name = search_vae
    search_model = model_from_json(prior_name)
    model_weights_path = os.path.join(script_dir, 'results', name,
                                      'weights.pth')
    search_model.load(model_weights_path)

    samples, weights = get_samples(prior_model, search_model,
                                   max=max_samples, w_min=w_min)

    # The picker must never be told the pool is larger than the number of
    # fingerprints actually available, or it indexes past the end of `fps`.
    pool_size = min(max_samples, len(samples))

    # if diversity picker < pool size, we subsample with rdkit picker :
    if 0 < diversity_picker < pool_size:
        mols = [Chem.MolFromSmiles(s) for s in samples]
        fps = [GetMorganFingerprint(x, 3) for x in mols]
        picker = MaxMinPicker()

        def distij(i, j, fps=fps):
            return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

        pickIndices = picker.LazyPick(distij, pool_size, diversity_picker)
        idces = list(pickIndices)
        samples = [samples[i] for i in idces]
        weights = [weights[i] for i in idces]

    # Since we don't maintain a dict for qed, we just give everything to the
    # docker.
    # NOTE(review): `or True` makes this condition always true, so the
    # memoization branch below never runs. Left as is; it looks like a
    # deliberate switch.
    if oracle != 'docking' or True:
        _dump(samples,
              os.path.join(script_dir, 'results', name, 'docker_samples.p'))

        # Dump for the trainer
        _dump((samples, weights),
              os.path.join(script_dir, 'results', name, 'samples.p'))
    else:
        # Memoization, we split the list into already docked ones and dump a
        # simili-docking csv
        whole_path = os.path.join(script_dir, '..', 'data',
                                  'drd3_scores.pickle')
        # NOTE: pickle.load executes arbitrary code; only use trusted files.
        with open(whole_path, 'rb') as handle:
            docking_whole_results = pickle.load(handle)

        filtered_smiles = list()
        already_smiles = list()
        already_scores = list()
        for smile in samples:
            if smile in docking_whole_results:
                already_smiles.append(smile)
                already_scores.append(docking_whole_results[smile])
            else:
                filtered_smiles.append(smile)

        # Dump simili-docking
        dump_path = os.path.join(script_dir, 'results', name,
                                 'docking_small_results', 'simili.csv')
        df = pd.DataFrame.from_dict({
            'smile': already_smiles,
            'score': already_scores
        })
        df.to_csv(dump_path)

        # Dump for the docker
        _dump(filtered_smiles,
              os.path.join(script_dir, 'results', name, 'docker_samples.p'))

        # Dump for the trainer
        _dump((samples, weights),
              os.path.join(script_dir, 'results', name, 'samples.p'))
#s=Chem.MolToSmiles(m, kekuleSmiles=True) if s not in training_set: novel += 1 print(novel / len(smiles_list)) ## Diversity sampling from rdkit import Chem from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint from rdkit import DataStructs from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker ms = [Chem.MolFromSmiles(s) for s in smiles_list] start = time() fps = [GetMorganFingerprint(x, 3) for x in ms] nfps = len(fps) end = time() print(f'Time for {nfps} fingerprints: ', end - start) def distij(i, j, fps=fps): return 1 - DataStructs.DiceSimilarity(fps[i], fps[j]) picker = MaxMinPicker() start = time() pickIndices = picker.LazyPick(distij, nfps, 1000, seed=23) end = time() idces = list(pickIndices) print('Time for picker: ', end - start) m_selected = [ms[i] for i in idces]
if RECOMPUTE:
    # Rebuild molecules and Morgan fingerprints from the structure table and
    # cache both as pickles under data/.
    radius = 5
    mol = {}
    fp = []
    fpNames = []
    cstr = getStructs(path.join('data', 'chem_prop.tsv'))
    for c in cstr:
        try:
            mol[c] = Chem.MolFromSmiles(cstr[c])
        except Exception:
            # Skip structures RDKit refuses to parse at all.
            continue
    # Pickle streams are bytes: open in binary mode, and let the context
    # manager close the handle.
    with open(path.join('data', 'mnxMol.pk'), 'wb') as f:
        pickle.dump(mol, f)
    for c in mol:
        try:
            fp.append(GetMorganFingerprint(mol[c], radius))
        except Exception:
            # Entries with no usable molecule get no fingerprint, so `fp`
            # and `fpNames` stay aligned.
            continue
        fpNames.append(c)
    with open(path.join('data', 'mnxFp.pk'), 'wb') as f:
        # Two consecutive pickles in one file: fingerprints, then names.
        pickle.dump(fp, f)
        pickle.dump(fpNames, f)
else:
    print('Reading fingerprints...')
    # allow_pickle=True unpickles object arrays; only load trusted files.
    data = np.load('fp.npz', allow_pickle=True)
    fp = data['x']
    fpNames = data['y']
def score_and_append_diversity_scores(molecules_list):
    """
    Append a diversity score to every molecule entry of a population.

    Each entry's SMILES (its first element) is converted to an RDKit
    molecule, sanitized and deprotonated through ``MOH``, and fingerprinted
    with a Morgan fingerprint (radius 10, feature invariants). The score of
    an entry is the sum of ``DataStructs.DiceSimilarity`` between its
    fingerprint and that of every *other* entry, so a lower score means a
    more diverse molecule:

    - two identical molecules contribute 1.0 to each other's score
    - two completely different molecules contribute 0.0

    Based on the approach provided on
    http://www.rdkit.org/docs/GettingStartedInPython.html
    section: "Picking Diverse Molecules Using Fingerprints"

    The score is appended, as a string, to each entry in place.

    ``None`` entries are reported and removed; in that case a new list
    without them is returned, otherwise the input list itself is returned.

    Inputs:
    :param list molecules_list: list of all molecules in the populations with
    the respective info; each entry is a list whose first item is a SMILES

    Returns:
    :returns: list molecules_list: list of all molecules in the populations
    with the respective info and appended diversity score

    Raises:
        AssertionError: if a molecule cannot be built, sanitized or
            deprotonated.
    """
    location = ("Issue in Ranking.py "
                "def score_and_append_diversity_scores")

    entries = []  # non-None input entries, in input order
    mols = []  # prepared RDKit molecule for each kept entry
    for pair in molecules_list:
        if pair is None:
            print(
                "noneitem in molecules_list in score_and_append_diversity_scores"
            )
            continue

        smile = pair[0]
        try:
            mol = Chem.MolFromSmiles(smile, sanitize=False)
        except Exception:
            mol = None
        if mol is None:
            raise AssertionError(
                "mol in list failed to sanitize. " + location)

        mol = MOH.check_sanitization(mol)
        if mol is None:
            raise AssertionError(
                "mol in list failed to sanitize. " + location)

        mol = MOH.try_deprotanation(mol)
        if mol is None:
            raise AssertionError(
                "mol in list failed to deprotanate. " + location)

        entries.append(pair)
        mols.append(mol)

    fps = [GetMorganFingerprint(mol, 10, useFeatures=True) for mol in mols]

    # Compute every score before touching the entries, so a failure leaves
    # the input unmodified.
    scores = []
    for i, own_fp in enumerate(fps):
        # DiceSimilarity == 1.0 is a perfect match; the smaller the summed
        # similarity, the more this molecule differs from the rest.
        diversity_score = 0
        for j, other_fp in enumerate(fps):
            if i != j:
                diversity_score = diversity_score + DataStructs.DiceSimilarity(
                    own_fp, other_fp)
        scores.append(str(diversity_score))

    # Pair scores with the filtered entries; indexing the raw input by
    # position breaks as soon as it contains a None.
    for entry, score in zip(entries, scores):
        entry.append(score)

    if len(entries) != len(molecules_list):
        return entries
    return molecules_list
def morgan(mol, **kwargs):
    """Return the ids of the non-zero Morgan fingerprint elements of ``mol``.

    All keyword arguments are forwarded to ``GetMorganFingerprint``.
    """
    fingerprint = GetMorganFingerprint(mol, **kwargs)
    nonzero = fingerprint.GetNonzeroElements()
    return list(nonzero.keys())
def Fingerprint(self):
    """Compute the configured fingerprint for every molecule.

    Reads ``self.FPtype`` (fingerprint family), ``self.vector`` (``'int'``
    for counts, ``'bit'`` for bits), ``self.molecules`` and, where the
    family needs them, ``self.nBits`` and ``self.radius``. The raw RDKit
    fingerprints are stored on ``self.fps``.

    Supported families: ``'Hashed_atom_pair'``/``'HAP'``,
    ``'Atom_pair'``/``'AP'``, ``'MACCS'`` (bit only), ``'Morgan'``,
    ``'Hashed_topological_torsion'``/``'HTT'`` and
    ``'Topological_torsion'``/``'TT'`` (int only).

    Returns:
        pandas.DataFrame: one row per molecule. Sparse variants have one
        column per feature seen (or ``range(nBits)`` for hashed families),
        with missing values filled with 0.

    Raises:
        ValueError: for an unknown ``FPtype``, an unknown ``vector``, or a
            family/vector combination RDKit cannot produce.
    """

    def _frame_from_counts(rows, columns=None):
        # Build a 0-filled frame from per-molecule {feature: value} dicts.
        if columns is None:
            seen = [key for row in rows for key in row]
            columns = list(set(seen))
        frame = pd.DataFrame(rows, columns=columns)
        frame.fillna(0, inplace=True)
        return frame

    def _frame_from_bits(bit_vectors):
        # Build a dense frame from fixed-length bit vectors.
        return pd.DataFrame(np.array(bit_vectors))

    fp_type = self.FPtype
    vector = self.vector

    if fp_type in ('Hashed_atom_pair', 'HAP'):
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
            self.fps = [
                GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
            return _frame_from_counts(dict_nonzero,
                                      columns=range(self.nBits))
        elif vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
            self.fps = [
                GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                for m in self.molecules
            ]
            return _frame_from_bits(self.fps)
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif fp_type in ('Atom_pair', 'AP'):
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
            self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
            dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
            return _frame_from_counts(dict_nonzero)
        elif vector == 'bit':
            from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
            self.fps = [
                GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
            ]
            # Ask each bit vector for its set bits directly instead of
            # walking every position, and keep only the columns set in at
            # least one molecule.
            dict_nonzero = [
                dict.fromkeys(fp.GetOnBits(), 1) for fp in self.fps
            ]
            return _frame_from_counts(dict_nonzero)
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif fp_type == 'MACCS':
        if vector == 'int':
            msg = "There is no RDKit function to encode int vectors for MACCS keys"
            raise ValueError(msg)
        elif vector == 'bit':
            from rdkit.Chem.MACCSkeys import GenMACCSKeys
            self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
            return _frame_from_bits(self.fps)
        else:
            msg = "The vector argument can only be 'int' or 'bit'"
            raise ValueError(msg)

    elif fp_type == 'Morgan':
        if vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
            self.fps = [
                GetMorganFingerprint(mol, self.radius)
                for mol in self.molecules
            ]
            dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
            return _frame_from_counts(dict_nonzero)
        elif vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
            self.fps = [
                GetMorganFingerprintAsBitVect(mol, self.radius,
                                              nBits=self.nBits)
                for mol in self.molecules
            ]
            return _frame_from_bits(self.fps)
        else:
            msg = "The argument vector can only be 'int' or 'bit'"
            raise ValueError(msg)

    elif fp_type in ('Hashed_topological_torsion', 'HTT'):
        if vector == 'int':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
            self.fps = [
                GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                for m in self.molecules
            ]
            dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
            return _frame_from_counts(dict_nonzero,
                                      columns=range(self.nBits))
        elif vector == 'bit':
            from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
            self.fps = [
                GetHashedTopologicalTorsionFingerprintAsBitVect(
                    m, nBits=self.nBits) for m in self.molecules
            ]
            return _frame_from_bits(self.fps)
        else:
            msg = "The argument vector can be 'int' or 'bit'"
            raise ValueError(msg)

    elif fp_type in ('Topological_torsion', 'TT'):
        if vector == 'int':
            from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
            self.fps = [
                GetTopologicalTorsionFingerprintAsIntVect(mol)
                for mol in self.molecules
            ]
            dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
            return _frame_from_counts(dict_nonzero)
        elif vector == 'bit':
            msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
            raise ValueError(msg)
        else:
            msg = "The argument vector can only be 'int'"
            raise ValueError(msg)

    else:
        msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
        raise ValueError(msg)
def execute(self):
    """Pick diverse molecules from the input SMILES file, pool by pool.

    For each of ``ZincPicker.num_iterations`` iterations, the next
    ``ZincPicker.pool_size`` entries of the input file are loaded, radius-3
    Morgan fingerprints are computed for the entries RDKit could parse, and
    up to ``ZincPicker.pick_size`` of them are chosen with the MaxMin picker
    (seed 23), using ``self.calculate_dice_similarity_distance`` as the
    distance callback. Picked molecules are written to
    ``ZincPicker.output_file_path``. Progress is reported on stdout.
    """
    print()
    print("Loading input file with path: " + ZincPicker.input_file_path)
    zinc_for_sale_mol_supplier = Chem.SmilesMolSupplier(
        ZincPicker.input_file_path)
    num_none_mols = 0
    num_picked = 0
    print("Output file path: " + ZincPicker.output_file_path)
    writer = Chem.SmilesWriter(ZincPicker.output_file_path)
    lower_index = 0
    upper_index = ZincPicker.pool_size
    print("Entering picking iterations...")
    print()
    try:
        for y in range(0, ZincPicker.num_iterations):
            print("Number of iteration: ", y)
            print("Lower index: ", lower_index)
            print("Upper index: ", upper_index)
            print("Loading molecules now...")
            molecules = []
            for x in range(lower_index, upper_index):
                mol = zinc_for_sale_mol_supplier[x]
                if mol is None:
                    # Unparsable entry: count it and leave it out.
                    num_none_mols += 1
                    continue
                molecules.append(mol)
            # radius 3
            print("Number of molecules loaded: ", len(molecules))
            print("Calculating fingerprints...")
            # Kept on the instance because the distance callback is a bound
            # method called by the picker with indices only.
            self.fingerprint_list = [
                GetMorganFingerprint(x, 3) for x in molecules
            ]
            nfps = len(self.fingerprint_list)
            print("Number of fingerprints: ", nfps)
            print("Now min-max picking ", ZincPicker.pick_size,
                  " out of the finger print list...")
            # Never ask for more picks than this pool actually holds.
            pick_size = min(ZincPicker.pick_size, nfps)
            pickIndices = []
            if pick_size > 0:
                picker = MaxMinPicker()
                pickIndices = picker.LazyPick(
                    self.calculate_dice_similarity_distance,
                    nfps,
                    pick_size,
                    seed=23)
            print("Finished picking, writing to file...")
            for z in pickIndices:
                writer.write(molecules[z])
                num_picked += 1
            # clear memory
            molecules = []
            self.fingerprint_list = []
            nfps = 0
            picker = None
            pickIndices = []
            # raise indices
            lower_index = lower_index + ZincPicker.pool_size
            upper_index = upper_index + ZincPicker.pool_size
            print("Finished this iteration, entering the next...")
            print()
    finally:
        # Flush and release the output file even if an iteration fails.
        writer.close()
    print("Execution successful.")
    # Report what was actually written; unparsable entries shrink the pool,
    # not the number of picks.
    print("Picked ", num_picked, " out of ",
          ZincPicker.num_iterations * ZincPicker.pool_size,
          " molecules in ", ZincPicker.num_iterations,
          " iterations, while picking ", ZincPicker.pick_size,
          " in each iteration.")