def _featurize(self, datapoint: RDKitMol, **kwargs) -> Optional[GraphMatrix]: """ Calculate adjacency matrix and nodes features for RDKitMol. It strips any chirality and charges Parameters ---------- datapoint: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- graph: GraphMatrix A molecule graph with some features. """ try: from rdkit import Chem except ModuleNotFoundError: raise ImportError("This method requires RDKit to be installed.") if 'mol' in kwargs: datapoint = kwargs.get("mol") raise DeprecationWarning( 'Mol is being phased out as a parameter, please pass "datapoint" instead.' ) if self.kekulize: Chem.Kekulize(datapoint) A = np.zeros(shape=(self.max_atom_count, self.max_atom_count), dtype=np.float32) bonds = datapoint.GetBonds() begin, end = [b.GetBeginAtomIdx() for b in bonds], [b.GetEndAtomIdx() for b in bonds] bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds] A[begin, end] = bond_type A[end, begin] = bond_type degree = np.sum(A[:datapoint.GetNumAtoms(), :datapoint.GetNumAtoms()], axis=-1) X = np.array( [ self.atom_encoder[atom.GetAtomicNum()] for atom in datapoint.GetAtoms() ] + [0] * (self.max_atom_count - datapoint.GetNumAtoms()), dtype=np.int32, ) graph = GraphMatrix(A, X) return graph if (degree > 0).all() else None
def mol_to_graph(mol: RDKitMol): """Convert RDKit Mol to NetworkX graph Convert mol into a graph representation atoms are nodes, and bonds are vertices stored as graph Parameters ---------- mol: RDKit Mol The molecule to convert into a graph. Returns ------- graph: networkx.Graph Contains atoms indices as nodes, edges as bonds. Note ---- This function requires NetworkX to be installed. """ try: import networkx as nx except ModuleNotFoundError: raise ValueError("This function requires NetworkX to be installed.") G = nx.Graph() num_atoms = mol.GetNumAtoms() G.add_nodes_from(range(num_atoms)) for i in range(mol.GetNumBonds()): from_idx = mol.GetBonds()[i].GetBeginAtomIdx() to_idx = mol.GetBonds()[i].GetEndAtomIdx() G.add_edge(from_idx, to_idx) return G
def _create_component_map(mol: RDKitMol, components: List[List[int]]) -> Dict[int, int]: """Creates a map from atom ids to disconnected component id For each atom in `mol`, maps it to the id of the component in the molecule. The intent is that this is used on a molecule whose rotatable bonds have been removed. `components` is a list of the connected components after this surgery. Parameters ---------- mol: RDKit Mol molecule to find disconnected compontents in components: List[List[int]] List of connected components Returns ------- comp_map: Dict[int, int] Maps atom ids to component ides """ comp_map = {} for i in range(mol.GetNumAtoms()): for j in range(len(components)): if i in components[j]: comp_map[i] = j break return comp_map
def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]: """ Calculate adjacency matrix and nodes features for RDKitMol. It strips any chirality and charges Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- graph: GraphMatrix A molecule graph with some features. """ try: from rdkit import Chem except ModuleNotFoundError: raise ImportError("This method requires RDKit to be installed.") if self.kekulize: Chem.Kekulize(mol) A = np.zeros(shape=(self.max_atom_count, self.max_atom_count), dtype=np.float32) bonds = mol.GetBonds() begin, end = [b.GetBeginAtomIdx() for b in bonds], [b.GetEndAtomIdx() for b in bonds] bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds] A[begin, end] = bond_type A[end, begin] = bond_type degree = np.sum(A[:mol.GetNumAtoms(), :mol.GetNumAtoms()], axis=-1) X = np.array( [ self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms() ] + [0] * (self.max_atom_count - mol.GetNumAtoms()), dtype=np.int32, ) graph = GraphMatrix(A, X) return graph if (degree > 0).all() else None
def _featurize(self, mol: RDKitMol) -> GraphData: """Calculate molecule graph features from RDKit mol object. Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- graph: GraphData A molecule graph with some features. """ from rdkit import Chem from rdkit.Chem import AllChem # construct atom and bond features try: mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge') except: # If partial charges were not computed AllChem.ComputeGasteigerCharges(mol) h_bond_infos = construct_hydrogen_bonding_info(mol) sssr = Chem.GetSymmSSSR(mol) # construct atom (node) feature atom_features = np.array( [ _construct_atom_feature(atom, h_bond_infos, sssr) for atom in mol.GetAtoms() ], dtype=np.float, ) # construct edge (bond) information src, dest, bond_features = [], [], [] for bond in mol.GetBonds(): # add edge list considering a directed graph start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() src += [start, end] dest += [end, start] bond_features += 2 * [_construct_bond_feature(bond)] if self.add_self_edges: num_atoms = mol.GetNumAtoms() src += [i for i in range(num_atoms)] dest += [i for i in range(num_atoms)] # add dummy edge features bond_fea_length = len(bond_features[0]) bond_features += num_atoms * [[0 for _ in range(bond_fea_length)]] return GraphData(node_features=atom_features, edge_index=np.array([src, dest], dtype=np.int), edge_features=np.array(bond_features, dtype=np.float))
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray: """Calculate atomic coordinates. Parameters ---------- datapoint: rdkit.Chem.rdchem.Mol RDKit Mol object Returns ------- np.ndarray A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`. """ try: from rdkit import Chem from rdkit.Chem import AllChem except ModuleNotFoundError: raise ImportError("This class requires RDKit to be installed.") if 'mol' in kwargs: datapoint = kwargs.get("mol") raise DeprecationWarning( 'Mol is being phased out as a parameter, please pass "datapoint" instead.' ) # Check whether num_confs >=1 or not num_confs = len(datapoint.GetConformers()) if num_confs == 0: datapoint = Chem.AddHs(datapoint) AllChem.EmbedMolecule(datapoint, AllChem.ETKDG()) datapoint = Chem.RemoveHs(datapoint) N = datapoint.GetNumAtoms() coords = np.zeros((N, 3)) # RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the # bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation # consistent with most QM software packages. if self.use_bohr: coords_list = [ datapoint.GetConformer(0).GetAtomPosition(i).__idiv__( 0.52917721092) for i in range(N) ] else: coords_list = [ datapoint.GetConformer(0).GetAtomPosition(i) for i in range(N) ] for atom in range(N): coords[atom, 0] = coords_list[atom].x coords[atom, 1] = coords_list[atom].y coords[atom, 2] = coords_list[atom].z return coords
def _featurize(self, mol: RDKitMol) -> GraphMatrix: """Calculate adjacency matrix and nodes features for RDKitMol. Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- graph: GraphMatrix A molecule graph with some features. """ if self.kekulize: Chem.Kekulize(mol) A = np.zeros(shape=(self.max_atom_count, self.max_atom_count), dtype=np.float32) bonds = mol.GetBonds() begin, end = [b.GetBeginAtomIdx() for b in bonds], [b.GetEndAtomIdx() for b in bonds] bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds] A[begin, end] = bond_type A[end, begin] = bond_type degree = np.sum(A[:mol.GetNumAtoms(), :mol.GetNumAtoms()], axis=-1) X = np.array( [ self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms() ] + [0] * (self.max_atom_count - mol.GetNumAtoms()), dtype=np.int32, ) graph = GraphMatrix(A, X) return graph if (degree > 0).all() else None
def _featurize(self, mol: RDKitMol) -> np.ndarray: """Calculate atomic coordinates. Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit Mol object Returns ------- np.ndarray A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`. """ try: from rdkit import Chem from rdkit.Chem import AllChem except ModuleNotFoundError: raise ImportError("This class requires RDKit to be installed.") # Check whether num_confs >=1 or not num_confs = len(mol.GetConformers()) if num_confs == 0: mol = Chem.AddHs(mol) AllChem.EmbedMolecule(mol, AllChem.ETKDG()) mol = Chem.RemoveHs(mol) N = mol.GetNumAtoms() coords = np.zeros((N, 3)) # RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the # bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation # consistent with most QM software packages. if self.use_bohr: coords_list = [ mol.GetConformer(0).GetAtomPosition(i).__idiv__(0.52917721092) for i in range(N) ] else: coords_list = [ mol.GetConformer(0).GetAtomPosition(i) for i in range(N) ] for atom in range(N): coords[atom, 0] = coords_list[atom].x coords[atom, 1] = coords_list[atom].y coords[atom, 2] = coords_list[atom].z return coords
def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray: """ Generate Coulomb matrices for each conformer of the given molecule. Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit Mol object Returns ------- np.ndarray The coulomb matrices of the given molecule """ try: from rdkit import Chem from rdkit.Chem import AllChem except ModuleNotFoundError: raise ImportError("This class requires RDKit to be installed.") # Check whether num_confs >=1 or not num_confs = len(mol.GetConformers()) if num_confs == 0: mol = Chem.AddHs(mol) AllChem.EmbedMolecule(mol, AllChem.ETKDG()) if self.remove_hydrogens: mol = Chem.RemoveHs(mol) n_atoms = mol.GetNumAtoms() z = [atom.GetAtomicNum() for atom in mol.GetAtoms()] rval = [] for conf in mol.GetConformers(): d = self.get_interatomic_distances(conf) m = np.outer(z, z) / d m[range(n_atoms), range(n_atoms)] = 0.5 * np.array(z)**2.4 if self.randomize: for random_m in self.randomize_coulomb_matrix(m): random_m = pad_array(random_m, self.max_atoms) rval.append(random_m) else: m = pad_array(m, self.max_atoms) rval.append(m) rval = np.asarray(rval) return rval
def _pagtn_edge_featurizer(self, mol: RDKitMol) -> Tuple[np.ndarray, np.ndarray]: """Calculate bond features from RDKit mol object. Parameters ---------- mol: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- np.ndarray Source and Destination node indexes of each bond. np.ndarray numpy vector of bond features. """ n_atoms = mol.GetNumAtoms() # To get the shortest paths between two nodes. paths_dict = compute_all_pairs_shortest_path(mol) # To get info if two nodes belong to the same ring. rings_dict = compute_pairwise_ring_info(mol) # Featurizer feats = [] src = [] dest = [] for i in range(n_atoms): for j in range(n_atoms): src.append(i) dest.append(j) if (i, j) not in paths_dict: feats.append(np.zeros(7 * self.max_length + 7)) continue ring_info = rings_dict.get(self.ordered_pair(i, j), []) feats.append( self._edge_features(mol, paths_dict[(i, j)], ring_info)) return np.array([src, dest], dtype=np.int), np.array(feats, dtype=np.float)
def compute_all_ecfp(mol: RDKitMol, indices: Optional[Set[int]] = None, degree: int = 2) -> Dict[int, str]: """Obtain molecular fragment for all atoms emanating outward to given degree. For each fragment, compute SMILES string (for now) and hash to an int. Return a dictionary mapping atom index to hashed SMILES. Parameters ---------- mol: rdkit Molecule Molecule to compute ecfp fragments on indices: Optional[Set[int]] List of atom indices for molecule. Default is all indices. If specified will only compute fragments for specified atoms. degree: int Graph degree to use when computing ECFP fingerprints Returns ---------- dict Dictionary mapping atom index to hashed smiles. """ ecfp_dict = {} from rdkit import Chem for i in range(mol.GetNumAtoms()): if indices is not None and i not in indices: continue env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True) submol = Chem.PathToSubmol(mol, env) smile = Chem.MolToSmiles(submol) ecfp_dict[i] = "%s,%s" % (mol.GetAtoms()[i].GetAtomicNum(), smile) return ecfp_dict
def pair_features(mol: RDKitMol, bond_features_map: dict, bond_adj_list: List, bt_len: int = 6, graph_distance: bool = True, max_pair_distance: Optional[int] = None) -> np.ndarray: """Helper method used to compute atom pair feature vectors. Many different featurization methods compute atom pair features such as WeaveFeaturizer. Note that atom pair features could be for pairs of atoms which aren't necessarily bonded to one another. Parameters ---------- mol: RDKit Mol Molecule to compute features on. bond_features_map: dict Dictionary that maps pairs of atom ids (say `(2, 3)` for a bond between atoms 2 and 3) to the features for the bond between them. bond_adj_list: list of lists `bond_adj_list[i]` is a list of the atom indices that atom `i` shares a bond with . This list is symmetrical so if `j in bond_adj_list[i]` then `i in bond_adj_list[j]`. bt_len: int, optional (default 6) The number of different bond types to consider. graph_distance: bool, optional (default True) If true, use graph distance between molecules. Else use euclidean distance. The specified `mol` must have a conformer. Atomic positions will be retrieved by calling `mol.getConformer(0)`. max_pair_distance: Optional[int], (default None) This value can be a positive integer or None. This parameter determines the maximum graph distance at which pair features are computed. For example, if `max_pair_distance==2`, then pair features are computed only for atoms at most graph distance 2 apart. If `max_pair_distance` is `None`, all pairs are considered (effectively infinite `max_pair_distance`) Note ---- This method requires RDKit to be installed. Returns ------- features: np.ndarray Of shape `(N_edges, bt_len + max_distance + 1)`. This is the array of pairwise features for all atom pairs, where N_edges is the number of edges within max_pair_distance of one another in this molecules. pair_edges: np.ndarray Of shape `(2, num_pairs)` where `num_pairs` is the total number of pairs within `max_pair_distance` of one another. """ if graph_distance: max_distance = 7 else: max_distance = 1 N = mol.GetNumAtoms() pair_edges = max_pair_distance_pairs(mol, max_pair_distance) num_pairs = pair_edges.shape[1] N_edges = pair_edges.shape[1] features = np.zeros((N_edges, bt_len + max_distance + 1)) # Get mapping mapping = {} for n in range(N_edges): a1, a2 = pair_edges[:, n] mapping[(int(a1), int(a2))] = n num_atoms = mol.GetNumAtoms() rings = mol.GetRingInfo().AtomRings() for a1 in range(num_atoms): for a2 in bond_adj_list[a1]: # first `bt_len` features are bond features(if applicable) if (int(a1), int(a2)) not in mapping: raise ValueError( "Malformed molecule with bonds not in specified graph distance.") else: n = mapping[(int(a1), int(a2))] features[n, :bt_len] = np.asarray( bond_features_map[tuple(sorted((a1, a2)))], dtype=float) for ring in rings: if a1 in ring: for a2 in ring: if (int(a1), int(a2)) not in mapping: # For ring pairs outside max pairs distance continue continue else: n = mapping[(int(a1), int(a2))] # `bt_len`-th feature is if the pair of atoms are in the same ring if a2 == a1: features[n, bt_len] = 0 else: features[n, bt_len] = 1 # graph distance between two atoms if graph_distance: # distance is a matrix of 1-hot encoded distances for all atoms distance = find_distance( a1, num_atoms, bond_adj_list, max_distance=max_distance) for a2 in range(num_atoms): if (int(a1), int(a2)) not in mapping: # For ring pairs outside max pairs distance continue continue else: n = mapping[(int(a1), int(a2))] features[n, bt_len + 1:] = distance[a2] # Euclidean distance between atoms if not graph_distance: coords = np.zeros((N, 3)) for atom in range(N): pos = mol.GetConformer(0).GetAtomPosition(atom) coords[atom, :] = [pos.x, pos.y, pos.z] features[:, :, -1] = np.sqrt(np.sum(np.square( np.stack([coords] * N, axis=1) - \ np.stack([coords] * N, axis=0)), axis=2)) return features, pair_edges
def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData: """Calculate molecule graph features from RDKit mol object. Parameters ---------- datapoint: rdkit.Chem.rdchem.Mol RDKit mol object. Returns ------- graph: GraphData A molecule graph with some features. """ assert datapoint.GetNumAtoms( ) > 1, "More than one atom should be present in the molecule for this featurizer to work." if 'mol' in kwargs: datapoint = kwargs.get("mol") raise DeprecationWarning( 'Mol is being phased out as a parameter, please pass "datapoint" instead.' ) if self.use_partial_charge: try: datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge') except: # If partial charges were not computed try: from rdkit.Chem import AllChem AllChem.ComputeGasteigerCharges(datapoint) except ModuleNotFoundError: raise ImportError( "This class requires RDKit to be installed.") # construct atom (node) feature h_bond_infos = construct_hydrogen_bonding_info(datapoint) atom_features = np.asarray( [ _construct_atom_feature(atom, h_bond_infos, self.use_chirality, self.use_partial_charge) for atom in datapoint.GetAtoms() ], dtype=float, ) # construct edge (bond) index src, dest = [], [] for bond in datapoint.GetBonds(): # add edge list considering a directed graph start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx() src += [start, end] dest += [end, start] # construct edge (bond) feature bond_features = None # deafult None if self.use_edges: features = [] for bond in datapoint.GetBonds(): features += 2 * [_construct_bond_feature(bond)] bond_features = np.asarray(features, dtype=float) return GraphData(node_features=atom_features, edge_index=np.asarray([src, dest], dtype=int), edge_features=bond_features)