def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit mol object.
    **kwargs
        Only inspected for the deprecated ``mol`` keyword.

    Returns
    -------
    graph: GraphData
        A molecule graph built from the per-atom features and the edge
        index / edge features returned by ``_pagtn_edge_featurizer``.

    Raises
    ------
    DeprecationWarning
        If the deprecated ``mol`` keyword argument is supplied.
    """
    if 'mol' in kwargs:
        # The deprecated keyword is rejected outright, so its value is
        # never read.
        raise DeprecationWarning(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.'
        )

    # Use the builtin `float`: the `np.float` alias was removed in
    # NumPy 1.24 and raises AttributeError there. The dtype is the same.
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in datapoint.GetAtoms()],
        dtype=float)
    edge_index, edge_features = self._pagtn_edge_featurizer(datapoint)
    return GraphData(node_features, edge_index, edge_features)
def convert_protein_to_pdbqt(mol: RDKitMol, outfile: str) -> None:
    """Convert a protein PDB file into a pdbqt file.

    Rewrites `outfile` in place: lines mentioning ROOT/ENDROOT/TORSDOF are
    kept as they are, every ``ATOM`` record is truncated to its first 66
    characters and extended with a ``+0.000`` charge column and the atom's
    element symbol, and all other lines are dropped.

    Parameters
    ----------
    mol: RDKit Mol
        Protein molecule. The serial number in columns 7-11 of each ATOM
        record (1-based) is used to index into ``mol.GetAtoms()``.
    outfile: str
        filename which already has a valid pdb representation of mol
    """
    # Read through a context manager so the handle is closed before the
    # same path is reopened for writing.
    with open(outfile) as fin:
        lines = [raw.strip() for raw in fin]

    # The atom sequence does not change while the file is processed.
    atoms = mol.GetAtoms()

    out_lines = []
    for line in lines:
        # "ENDROOT" contains "ROOT", so one substring test covers both.
        if "ROOT" in line or "TORSDOF" in line:
            out_lines.append("%s\n" % line)
            continue
        if not line.startswith("ATOM"):
            continue
        record = line[:66]
        atom_index = int(record[6:11])
        # PDB serial numbers are 1-based, the atom sequence is 0-based.
        atom = atoms[atom_index - 1]
        out_lines.append("%s +0.000 %s\n" %
                         (record, atom.GetSymbol().ljust(2)))

    with open(outfile, 'w') as fout:
        fout.writelines(out_lines)
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray: """Calculate symmetry function. Parameters ---------- datapoint: rdkit.Chem.rdchem.Mol RDKit Mol object Returns ------- np.ndarray A numpy array of symmetry function. The shape is `(max_atoms, 4)`. """ if 'mol' in kwargs: datapoint = kwargs.get("mol") raise DeprecationWarning( 'Mol is being phased out as a parameter, please pass "datapoint" instead.' ) coordinates = self.coordfeat._featurize(datapoint) atom_numbers = np.array( [atom.GetAtomicNum() for atom in datapoint.GetAtoms()]) atom_numbers = np.expand_dims(atom_numbers, axis=1) assert atom_numbers.shape[0] == coordinates.shape[0] features = np.concatenate([atom_numbers, coordinates], axis=1) return pad_array(features, (self.max_atoms, 4))
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
    """Helper method which finds atom pairs within max_pair_distance graph distance.

    Two atoms are within graph distance `k` of one another exactly when
    there is a walk of at most `k` bonds between them. If `adj` is the
    adjacency matrix, then `(I + adj)**k` has a nonzero entry at `(i, j)`
    if and only if such a walk exists, and its nonzero pattern is the same
    as that of `I + adj + adj**2 + ... + adj**k`.

    Only the nonzero pattern is needed, so it is tracked as a boolean
    matrix and grown one step at a time. The walk counts in true matrix
    powers grow exponentially and can overflow fixed-width integers for
    larger molecules, which would corrupt the pattern. Keeping only the
    0/1 pattern avoids that.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit molecules
    max_pair_distance: Optional[int], (default None)
        This value can be a positive integer or None. This parameter
        determines the maximum graph distance at which pair features are
        computed. For example, if `max_pair_distance==2`, then pair
        features are computed only for atoms at most graph distance 2
        apart. If `max_pair_distance` is `None`, all pairs are considered
        (effectively infinite `max_pair_distance`)

    Returns
    -------
    np.ndarray
        Of shape `(2, num_pairs)` where `num_pairs` is the total number of
        pairs within `max_pair_distance` of one another. Self pairs
        `(i, i)` are included, and pairs are listed in row-major order.

    Raises
    ------
    ValueError
        If `max_pair_distance` is zero or negative (and smaller than the
        number of atoms).
    """
    from rdkit.Chem import rdmolops

    num_atoms = len(mol.GetAtoms())
    if max_pair_distance is None or max_pair_distance >= num_atoms:
        # No pair can be further apart than the number of atoms.
        max_distance = num_atoms
    elif max_pair_distance <= 0:
        raise ValueError(
            "max_pair_distance must either be a positive integer or None")
    else:
        max_distance = max_pair_distance

    adj = rdmolops.GetAdjacencyMatrix(mol)

    # One step: stay in place (this also covers the self-pairs (i, i)) or
    # move along one bond.
    identity = np.eye(num_atoms, dtype=bool)
    one_step = (identity | (np.asarray(adj) != 0)).astype(np.int64)

    reachable = identity
    for _ in range(max_distance):
        # Entries of the product are bounded by num_atoms, so this cannot
        # overflow.
        expanded = (reachable.astype(np.int64) @ one_step) > 0
        if np.array_equal(expanded, reachable):
            # The pattern only ever grows, so once a step adds nothing no
            # later step will either.
            break
        reachable = expanded

    # Stack the row and column indices into shape (2, num_pairs).
    return np.array(np.nonzero(reachable))
def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphData
        A molecule graph with per-atom node features, a directed edge
        index (each bond appears once per direction) and, when
        ``self.use_edges`` is set, matching per-edge bond features
        (otherwise ``edge_features`` is None).

    Raises
    ------
    ImportError
        If partial charges have to be computed and RDKit cannot be imported.
    """
    if self.use_partial_charge:
        try:
            mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # The property lookup failed, so partial charges were not
            # computed yet. `Exception` rather than a bare `except`, so
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                from rdkit.Chem import AllChem
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires RDKit to be installed.") from err
            AllChem.ComputeGasteigerCharges(mol)

    # construct atom (node) feature
    h_bond_infos = construct_hydrogen_bonding_info(mol)
    atom_features = np.asarray(
        [
            _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                    self.use_partial_charge)
            for atom in mol.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) index and, optionally, edge (bond) features
    src, dest = [], []
    features = []
    for bond in mol.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        if self.use_edges:
            # Both directions of a bond share the same feature vector.
            bond_feature = _construct_bond_feature(bond)
            features += [bond_feature, bond_feature]

    bond_features = None  # default None
    if self.use_edges:
        bond_features = np.asarray(features, dtype=float)

    return GraphData(node_features=atom_features,
                     edge_index=np.asarray([src, dest], dtype=int),
                     edge_features=bond_features)
def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphData
        A molecule graph with per-atom node features, a directed edge
        index (each bond appears once per direction) and per-edge bond
        features. When ``self.add_self_edges`` is set, one ``(i, i)`` edge
        with an all-zero feature vector is appended per atom.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    # construct atom and bond features
    try:
        mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
    except Exception:
        # The property lookup failed, so partial charges were not computed
        # yet. `Exception` rather than a bare `except`, so
        # KeyboardInterrupt / SystemExit are not swallowed.
        AllChem.ComputeGasteigerCharges(mol)

    h_bond_infos = construct_hydrogen_bonding_info(mol)
    sssr = Chem.GetSymmSSSR(mol)

    # construct atom (node) feature
    # Builtin `float` / `int` are used below: the `np.float` and `np.int`
    # aliases were removed in NumPy 1.24. The resulting dtypes are the same.
    atom_features = np.array(
        [
            _construct_atom_feature(atom, h_bond_infos, sssr)
            for atom in mol.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) information
    src, dest, bond_features = [], [], []
    for bond in mol.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        # Both directions of a bond share the same feature vector.
        bond_feature = _construct_bond_feature(bond)
        bond_features += [bond_feature, bond_feature]

    if self.add_self_edges:
        num_atoms = mol.GetNumAtoms()
        self_indices = list(range(num_atoms))
        src += self_indices
        dest += self_indices
        # add dummy edge features
        # NOTE(review): the dummy width is taken from the first real bond,
        # so a molecule without bonds raises IndexError here. Confirm
        # whether bond-free molecules need to be supported.
        bond_fea_length = len(bond_features[0])
        bond_features += [[0] * bond_fea_length for _ in range(num_atoms)]

    return GraphData(node_features=atom_features,
                     edge_index=np.array([src, dest], dtype=int),
                     edge_features=np.array(bond_features, dtype=float))
def _featurize(self, datapoint: RDKitMol,
               **kwargs) -> Optional[GraphMatrix]:
    """Build the adjacency matrix and node vector for an RDKit molecule.

    Only bond types and atomic numbers are encoded, so chirality and
    charges are not represented in the output.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit mol object. When ``self.kekulize`` is set it is passed to
        ``Chem.Kekulize``.
    **kwargs
        Only inspected for the deprecated ``mol`` keyword.

    Returns
    -------
    graph: GraphMatrix
        ``GraphMatrix(A, X)`` where ``A`` is a
        ``(max_atom_count, max_atom_count)`` float32 matrix of encoded bond
        types and ``X`` is an int32 vector of encoded atomic numbers padded
        with zeros. ``None`` is returned instead when some atom's row of
        ``A`` does not have a positive sum.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    DeprecationWarning
        If the deprecated ``mol`` keyword argument is supplied.
    """
    try:
        from rdkit import Chem
    except ModuleNotFoundError:
        raise ImportError("This method requires RDKit to be installed.")

    if 'mol' in kwargs:
        # The deprecated keyword is rejected, so its value is never used.
        raise DeprecationWarning(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.'
        )

    if self.kekulize:
        Chem.Kekulize(datapoint)

    size = self.max_atom_count
    adjacency = np.zeros(shape=(size, size), dtype=np.float32)

    # Collect both endpoints and the encoded type of every bond.
    rows, cols, codes = [], [], []
    for bond in datapoint.GetBonds():
        rows.append(bond.GetBeginAtomIdx())
        cols.append(bond.GetEndAtomIdx())
        codes.append(self.bond_encoder[bond.GetBondType()])

    # Fill both triangles so the matrix is symmetric.
    adjacency[rows, cols] = codes
    adjacency[cols, rows] = codes

    num_atoms = datapoint.GetNumAtoms()
    degree = adjacency[:num_atoms, :num_atoms].sum(axis=-1)

    encoded_atoms = [
        self.atom_encoder[atom.GetAtomicNum()]
        for atom in datapoint.GetAtoms()
    ]
    padding = [0] * (size - datapoint.GetNumAtoms())
    nodes = np.array(encoded_atoms + padding, dtype=np.int32)

    graph = GraphMatrix(adjacency, nodes)
    if (degree > 0).all():
        return graph
    return None
def construct_node_features_matrix(self, mol: RDKitMol) -> np.ndarray:
    """Stack the per-atom feature vectors of a molecule into one array.

    Parameters
    ----------
    mol: RDKitMol
        RDKit Mol object.

    Returns
    ----------
    Atom_features: ndarray
        Numpy array whose i-th entry is ``self.atom_features`` applied to
        the i-th atom yielded by ``mol.GetAtoms()``.
    """
    atoms = mol.GetAtoms()
    rows = [self.atom_features(atom) for atom in atoms]
    return np.array(rows)
def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
    """Build the adjacency matrix and node vector for an RDKit molecule.

    Only bond types and atomic numbers are encoded, so chirality and
    charges are not represented in the output.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object. When ``self.kekulize`` is set it is passed to
        ``Chem.Kekulize``.

    Returns
    -------
    graph: GraphMatrix
        ``GraphMatrix(A, X)`` where ``A`` is a
        ``(max_atom_count, max_atom_count)`` float32 matrix of encoded bond
        types and ``X`` is an int32 vector of encoded atomic numbers padded
        with zeros. ``None`` is returned instead when some atom's row of
        ``A`` does not have a positive sum.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
    except ModuleNotFoundError:
        raise ImportError("This method requires RDKit to be installed.")

    if self.kekulize:
        Chem.Kekulize(mol)

    size = self.max_atom_count
    adjacency = np.zeros(shape=(size, size), dtype=np.float32)

    # Collect both endpoints and the encoded type of every bond.
    rows, cols, codes = [], [], []
    for bond in mol.GetBonds():
        rows.append(bond.GetBeginAtomIdx())
        cols.append(bond.GetEndAtomIdx())
        codes.append(self.bond_encoder[bond.GetBondType()])

    # Fill both triangles so the matrix is symmetric.
    adjacency[rows, cols] = codes
    adjacency[cols, rows] = codes

    num_atoms = mol.GetNumAtoms()
    degree = adjacency[:num_atoms, :num_atoms].sum(axis=-1)

    encoded_atoms = [
        self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
    ]
    padding = [0] * (size - mol.GetNumAtoms())
    nodes = np.array(encoded_atoms + padding, dtype=np.int32)

    graph = GraphMatrix(adjacency, nodes)
    if (degree > 0).all():
        return graph
    return None
def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
    """Generate Coulomb matrices for each conformer of the given molecule.

    If the molecule carries no conformer, hydrogens are added and one
    conformer is embedded with ETKDG first.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        The padded Coulomb matrices of the given molecule, one per
        conformer (or one per randomized sample of each conformer when
        ``self.randomize`` is set), stacked with ``np.asarray``.

    Raises
    ------
    ImportError
        If RDKit cannot be imported.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    # A molecule without any conformer needs 3D coordinates first.
    if len(mol.GetConformers()) == 0:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol, AllChem.ETKDG())

    if self.remove_hydrogens:
        mol = Chem.RemoveHs(mol)

    n_atoms = mol.GetNumAtoms()
    charges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    diagonal = np.arange(n_atoms)

    matrices = []
    for conformer in mol.GetConformers():
        distances = self.get_interatomic_distances(conformer)
        # Off-diagonal terms: Z_i * Z_j / d_ij.
        coulomb = np.outer(charges, charges) / distances
        # Diagonal terms: 0.5 * Z_i ** 2.4.
        coulomb[diagonal, diagonal] = 0.5 * np.array(charges)**2.4

        if self.randomize:
            matrices.extend(
                pad_array(sample, self.max_atoms)
                for sample in self.randomize_coulomb_matrix(coulomb))
        else:
            matrices.append(pad_array(coulomb, self.max_atoms))

    return np.asarray(matrices)
def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphData
        A molecule graph built from the per-atom features and the edge
        index / edge features returned by ``_pagtn_edge_featurizer``.
    """
    # Use the builtin `float`: the `np.float` alias was removed in
    # NumPy 1.24 and raises AttributeError there. The dtype is the same.
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
        dtype=float)
    edge_index, edge_features = self._pagtn_edge_featurizer(mol)
    return GraphData(node_features, edge_index, edge_features)
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Build the per-atom (atomic number, coordinates) feature table.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        The atomic-number column concatenated with the output of
        ``self.coordfeat._featurize``, padded by ``pad_array`` to
        `(max_atoms, 4)`.
    """
    coords = self.coordfeat._featurize(mol)
    atomic_nums = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
    # Turn the flat vector into a single column so it can sit next to coords.
    atomic_column = atomic_nums[:, np.newaxis]
    assert atomic_column.shape[0] == coords.shape[0]

    table = np.concatenate([atomic_column, coords], axis=1)
    return pad_array(table, (self.max_atoms, 4))
def _featurize(self, mol: RDKitMol) -> GraphMatrix:
    """Build the adjacency matrix and node vector for an RDKit molecule.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object. When ``self.kekulize`` is set it is passed to
        ``Chem.Kekulize``.

    Returns
    -------
    graph: GraphMatrix
        ``GraphMatrix(A, X)`` where ``A`` is a
        ``(max_atom_count, max_atom_count)`` float32 matrix of encoded bond
        types and ``X`` is an int32 vector of encoded atomic numbers padded
        with zeros. Note that ``None`` is returned instead when some atom's
        row of ``A`` does not have a positive sum.
    """
    if self.kekulize:
        Chem.Kekulize(mol)

    size = self.max_atom_count
    adjacency = np.zeros(shape=(size, size), dtype=np.float32)

    # Collect both endpoints and the encoded type of every bond.
    rows, cols, codes = [], [], []
    for bond in mol.GetBonds():
        rows.append(bond.GetBeginAtomIdx())
        cols.append(bond.GetEndAtomIdx())
        codes.append(self.bond_encoder[bond.GetBondType()])

    # Fill both triangles so the matrix is symmetric.
    adjacency[rows, cols] = codes
    adjacency[cols, rows] = codes

    num_atoms = mol.GetNumAtoms()
    degree = adjacency[:num_atoms, :num_atoms].sum(axis=-1)

    encoded_atoms = [
        self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
    ]
    padding = [0] * (size - mol.GetNumAtoms())
    nodes = np.array(encoded_atoms + padding, dtype=np.int32)

    graph = GraphMatrix(adjacency, nodes)
    if (degree > 0).all():
        return graph
    return None
def compute_all_ecfp(mol: RDKitMol,
                     indices: Optional[Set[int]] = None,
                     degree: int = 2) -> Dict[int, str]:
    """Describe the fragment around each atom, out to a given graph degree.

    For every selected atom, the environment of radius `degree` is
    extracted as a sub-molecule and written as SMILES. The value stored
    for the atom is the string ``"<atomic number>,<fragment SMILES>"``.

    Parameters
    ----------
    mol: rdkit Molecule
        Molecule to compute ecfp fragments on
    indices: Optional[Set[int]]
        Atom indices to process. Default is all indices. If specified,
        fragments are only computed for the specified atoms.
    degree: int
        Graph degree to use when computing ECFP fingerprints

    Returns
    ----------
    dict
        Dictionary mapping atom index to the atomic-number / fragment
        SMILES string described above.
    """
    from rdkit import Chem

    fragments = {}
    for atom_idx in range(mol.GetNumAtoms()):
        selected = indices is None or atom_idx in indices
        if selected:
            environment = Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                            degree,
                                                            atom_idx,
                                                            useHs=True)
            fragment = Chem.PathToSubmol(mol, environment)
            fragment_smiles = Chem.MolToSmiles(fragment)
            atomic_num = mol.GetAtoms()[atom_idx].GetAtomicNum()
            fragments[atom_idx] = "%s,%s" % (atomic_num, fragment_smiles)
    return fragments
def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit mol object. Must contain more than one atom.
    **kwargs
        Only inspected for the deprecated ``mol`` keyword.

    Returns
    -------
    graph: GraphData
        A molecule graph with per-atom node features, a directed edge
        index (each bond appears once per direction) and, when
        ``self.use_edges`` is set, matching per-edge bond features
        (otherwise ``edge_features`` is None).

    Raises
    ------
    AssertionError
        If the molecule has one atom or fewer.
    DeprecationWarning
        If the deprecated ``mol`` keyword argument is supplied.
    ImportError
        If partial charges have to be computed and RDKit cannot be imported.
    """
    assert datapoint.GetNumAtoms(
    ) > 1, "More than one atom should be present in the molecule for this featurizer to work."

    if 'mol' in kwargs:
        # The deprecated keyword is rejected outright, so its value is
        # never read.
        raise DeprecationWarning(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.'
        )

    if self.use_partial_charge:
        try:
            datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # The property lookup failed, so partial charges were not
            # computed yet. `Exception` rather than a bare `except`, so
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                from rdkit.Chem import AllChem
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires RDKit to be installed.") from err
            AllChem.ComputeGasteigerCharges(datapoint)

    # construct atom (node) feature
    h_bond_infos = construct_hydrogen_bonding_info(datapoint)
    atom_features = np.asarray(
        [
            _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                    self.use_partial_charge)
            for atom in datapoint.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) index and, optionally, edge (bond) features
    src, dest = [], []
    features = []
    for bond in datapoint.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        if self.use_edges:
            # Both directions of a bond share the same feature vector.
            bond_feature = _construct_bond_feature(bond)
            features += [bond_feature, bond_feature]

    bond_features = None  # default None
    if self.use_edges:
        bond_features = np.asarray(features, dtype=float)

    return GraphData(node_features=atom_features,
                     edge_index=np.asarray([src, dest], dtype=int),
                     edge_features=bond_features)