def get_conformer_rmsd(mol: RDKitMol) -> np.ndarray:
    """
    Compute the pairwise RMSD between all conformers of a molecule.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    rmsd: np.ndarray
        Symmetric matrix of conformer-conformer RMSD values with zeros on the
        diagonal. The shape is `(NumConformers, NumConformers)`.

    Raises
    ------
    ValueError
        If RDKit cannot be imported.
    """
    try:
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ValueError("This function requires RDKit to be installed.")

    n_confs = mol.GetNumConformers()
    rmsd = np.zeros((n_confs, n_confs), dtype=float)

    # Matrix indices are positions in the conformer sequence; RDKit is
    # addressed through the conformer IDs, which need not equal the positions.
    conf_ids = [conf.GetId() for conf in mol.GetConformers()]

    # Each unordered pair is evaluated once (upper triangle) and mirrored.
    for row, ref_id in enumerate(conf_ids):
        for col in range(row + 1, len(conf_ids)):
            best = AllChem.GetBestRMS(mol, mol, ref_id, conf_ids[col])
            rmsd[row, col] = best
            rmsd[col, row] = best
    return rmsd
def prune_conformers(self, mol: RDKitMol) -> RDKitMol:
    """
    Prune conformers from a molecule using an RMSD threshold, starting with
    the lowest energy conformer.

    Conformers are visited in order of increasing energy. The first one is
    always kept; each later one is kept only when its RMSD to every conformer
    kept so far is at least `self.rmsd_threshold`, and only while fewer than
    `self.max_conformers` conformers have been kept.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    new_mol: rdkit.Chem.rdchem.Mol
        A new rdkit.Chem.rdchem.Mol containing the chosen conformers, sorted
        by increasing energy. The input `mol` itself is returned when
        `self.rmsd_threshold` is negative or `mol` has at most one conformer.

    Raises
    ------
    ValueError
        If RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
    except ModuleNotFoundError:
        raise ValueError("This function requires RDKit to be installed.")

    if self.rmsd_threshold < 0 or mol.GetNumConformers() <= 1:
        return mol

    energies = self.get_conformer_energies(mol)
    rmsd = self.get_conformer_rmsd(mol)

    # Positions (not conformer IDs) of the conformers to keep, in the order
    # they were selected, i.e. by increasing energy.
    keep: List[int] = []
    for i in np.argsort(energies):
        # always keep lowest-energy conformer
        if not keep:
            keep.append(int(i))
            continue

        # every remaining conformer would be discarded once max_conformers
        # is reached, so there is nothing left to decide
        if len(keep) >= self.max_conformers:
            break

        # keep only conformers that are at least rmsd_threshold away from
        # all conformers selected so far
        this_rmsd = rmsd[i][np.asarray(keep, dtype=int)]
        if np.all(this_rmsd >= self.rmsd_threshold):
            keep.append(int(i))

    # create a new molecule to hold the chosen conformers
    # this ensures proper conformer IDs and energy-based ordering
    new_mol = Chem.Mol(mol)
    new_mol.RemoveAllConformers()
    conf_ids = [conf.GetId() for conf in mol.GetConformers()]
    for i in keep:
        conf = mol.GetConformer(conf_ids[i])
        new_mol.AddConformer(conf, assignId=True)
    return new_mol
def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
    """
    Build the Coulomb matrix of every conformer of the given molecule.

    A molecule that carries no conformer gets hydrogens added and one
    conformer embedded with ETKDG first. Hydrogens are removed afterwards
    when `self.remove_hydrogens` is set. Every matrix is padded with
    `pad_array` to `self.max_atoms`.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        The coulomb matrices of the given molecule: one padded matrix per
        conformer, or one per matrix yielded by
        `self.randomize_coulomb_matrix` when `self.randomize` is set.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    # Make sure there is at least one conformer to work with
    if len(mol.GetConformers()) < 1:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, AllChem.ETKDG())

    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)

    diagonal = np.arange(mol.GetNumAtoms())
    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]

    matrices = []
    for conformer in mol.GetConformers():
        distances = self.get_interatomic_distances(conformer)
        # Off-diagonal terms are Z_i * Z_j / d_ij; the diagonal is then
        # overwritten with 0.5 * Z_i ** 2.4.
        matrix = np.outer(atomic_nums, atomic_nums) / distances
        matrix[diagonal, diagonal] = 0.5 * np.array(atomic_nums)**2.4

        if not self.randomize:
            matrices.append(pad_array(matrix, self.max_atoms))
            continue

        matrices.extend(
            pad_array(shuffled, self.max_atoms)
            for shuffled in self.randomize_coulomb_matrix(matrix))

    return np.asarray(matrices)
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate atomic coordinates.

    Coordinates are read from `datapoint.GetConformer(0)`. A molecule that
    carries no conformer gets hydrogens added, one conformer embedded with
    ETKDG, and the hydrogens removed again before coordinates are read.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs
        Only the deprecated keyword `mol` is inspected. When present it
        replaces `datapoint` and a `DeprecationWarning` is issued.

    Returns
    -------
    np.ndarray
        A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.
        Values are divided by 0.52917721092 (Angstrom -> bohr) when
        `self.use_bohr` is set.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    """
    import warnings

    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    if 'mol' in kwargs:
        datapoint = kwargs.get("mol")
        # Warn instead of raising: the keyword is still honoured while it is
        # being phased out, so the call must be allowed to continue.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    # Check whether num_confs >=1 or not
    num_confs = len(datapoint.GetConformers())
    if num_confs == 0:
        datapoint = Chem.AddHs(datapoint)
        AllChem.EmbedMolecule(datapoint, AllChem.ETKDG())
        datapoint = Chem.RemoveHs(datapoint)

    N = datapoint.GetNumAtoms()
    coords = np.zeros((N, 3))

    # RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the
    # bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation
    # consistent with most QM software packages.
    if self.use_bohr:
        coords_list = [
            datapoint.GetConformer(0).GetAtomPosition(i).__idiv__(
                0.52917721092) for i in range(N)
        ]
    else:
        coords_list = [
            datapoint.GetConformer(0).GetAtomPosition(i) for i in range(N)
        ]

    for atom, position in enumerate(coords_list):
        coords[atom, 0] = position.x
        coords[atom, 1] = position.y
        coords[atom, 2] = position.z

    return coords
def minimize_conformers(self, mol: RDKitMol) -> None:
    """
    Run a force-field minimization on every conformer of a molecule.

    For each conformer, a force field is obtained from
    `self.get_molecule_force_field` and its `Minimize()` method is called.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object with embedded conformers.
    """
    for conformer in mol.GetConformers():
        self.get_molecule_force_field(
            mol, conf_id=conformer.GetId()).Minimize()
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Extract the atomic coordinates of a molecule.

    Coordinates are read from `mol.GetConformer(0)`. A molecule that carries
    no conformer gets hydrogens added, one conformer embedded with ETKDG, and
    the hydrogens removed again before coordinates are read.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.
        Values are divided by 0.52917721092 (Angstrom -> bohr) when
        `self.use_bohr` is set.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    # Embed a conformer when the molecule does not have one yet
    if len(mol.GetConformers()) < 1:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, AllChem.ETKDG())
        mol = Chem.RemoveHs(mol)

    n_atoms = mol.GetNumAtoms()
    coords = np.zeros((n_atoms, 3))

    positions = [
        mol.GetConformer(0).GetAtomPosition(idx) for idx in range(n_atoms)
    ]
    # RDKit stores atomic coordinates in Angstrom, while the atomic unit of
    # length is the bohr (1 bohr = 0.529177 Angstrom). Converting keeps
    # gradient calculations consistent with most QM software packages.
    if self.use_bohr:
        positions = [point.__idiv__(0.52917721092) for point in positions]

    for row, point in enumerate(positions):
        coords[row] = (point.x, point.y, point.z)

    return coords
def get_conformer_energies(self, mol: RDKitMol) -> np.ndarray:
    """
    Evaluate the force-field energy of every conformer.

    For each conformer, a force field is obtained from
    `self.get_molecule_force_field` and its `CalcEnergy()` result is
    recorded. No minimization is performed here.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object with embedded conformers.

    Returns
    -------
    energies : np.ndarray
        One float energy per conformer, in the order of
        `mol.GetConformers()`.
    """
    conformer_ids = (conformer.GetId() for conformer in mol.GetConformers())
    return np.asarray(
        [
            self.get_molecule_force_field(mol, conf_id=cid).CalcEnergy()
            for cid in conformer_ids
        ],
        dtype=float)