def smiles2adjoin(smiles, explicit_hydrogens=True, canonical_atom_order=False):
    """Turn a SMILES string into atom symbols plus an adjacency matrix.

    Parameters
    ----------
    smiles : str
        SMILES string. When RDKit cannot parse it, ``'error'`` is printed and
        the string is passed through ``obsmitosmile`` before a second attempt.
    explicit_hydrogens : bool
        If True ``Chem.AddHs`` is applied, otherwise ``Chem.RemoveHs``.
    canonical_atom_order : bool
        If True the atoms are renumbered with the output of
        ``rdmolfiles.CanonicalRankAtoms``.

    Returns
    -------
    atoms_list : list of str
        Element symbol of each atom, indexed by atom index.
    adjoin_matrix : numpy.ndarray
        Square matrix with ones on the diagonal and at ``[u, v]`` and
        ``[v, u]`` for every bond between atoms ``u`` and ``v``.

    Raises
    ------
    AssertionError
        If the SMILES still cannot be parsed after the fallback conversion.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print('error')
        mol = Chem.MolFromSmiles(obsmitosmile(smiles))
        assert mol is not None, smiles + ' is not valid '

    mol = Chem.AddHs(mol) if explicit_hydrogens else Chem.RemoveHs(mol)

    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom while
        # RenumberAtoms takes, per new position, the old atom index; passing
        # ranks directly may apply the inverse permutation -- confirm.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    num_atoms = mol.GetNumAtoms()
    atoms_list = [
        mol.GetAtomWithIdx(atom_idx).GetSymbol() for atom_idx in range(num_atoms)
    ]

    # Identity start gives every atom a self connection.
    adjoin_matrix = np.eye(num_atoms)
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        adjoin_matrix[begin, end] = 1.0
        adjoin_matrix[end, begin] = 1.0

    return atoms_list, adjoin_matrix
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize every SMILES string stored in a numpy array.

    Each element is parsed with RDKit, renumbered (when parsing succeeded)
    and handed to ``featurizer.featurize`` as a one-element list. Results
    whose ``size`` is zero are dropped.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings; iterated via ``arr.tolist()``.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    log_every_N : int
        A progress message is logged for every index divisible by this value.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    numpy.ndarray
        The retained features, squeezed and flattened to one dimension.
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    features = []
    for ind, elem in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            ranks = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, ranks)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        # An unparsable SMILES is still passed on (as None) so that the
        # featurizer decides what an invalid entry looks like.
        features.append(featurizer.featurize([mol]))

    kept = [elt for elt in features if elt.size > 0]
    stacked = np.squeeze(np.array(kept))
    return stacked.reshape(-1, )
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings held in one column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``field`` holds SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the SMILES column.
    log_every_N : int
        A progress message is logged for every index divisible by this value.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Features of the valid rows with axis 1 squeezed out.
    valid_inds : numpy.ndarray of bool
        One flag per input row; True where the featurizer output was
        non-empty.
    """
    smiles_column = df[field].tolist()

    features = []
    for ind, elem in enumerate(smiles_column):
        mol = Chem.MolFromSmiles(elem)
        # Bandage solution (see upstream TODO by ytz): renumber atoms so that
        # graph featurizers always see the same atom order.
        if mol:
            ranks = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, ranks)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
    kept = [elt for elt, keep in zip(features, valid_inds) if keep]
    return np.squeeze(np.array(kept), axis=1), valid_inds
def smile2graph(smile, add_self_loop=False, atom_featurizer=CanonicalAtomFeaturizer(),
                bond_featurizer=None):
    """Convert SMILES into a DGLGraph.

    The **i** th atom of the renumbered molecule, i.e.
    ``mol.GetAtomWithIdx(i)``, becomes the **i** th node. The **i** th bond,
    i.e. ``mol.GetBondWithIdx(i)``, becomes the **(2i)**-th and
    **(2i+1)**-th edges, going from **u** to **v** and from **v** to **u**
    respectively, where **u** is ``bond.GetBeginAtomIdx()`` and **v** is
    ``bond.GetEndAtomIdx()``. If self loops are requested they are appended
    after all bond edges, one per node.

    Parameters
    ----------
    smile : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs.
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to CanonicalAtomFeaturizer().
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for bonds in a molecule, which can be used to update
        edata for a DGLGraph.

    Returns
    -------
    g : DGLGraph
        Graph built from the molecule, featurized when featurizers are given.
    """
    mol = Chem.MolFromSmiles(smile)
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = DGLGraph()
    g.add_nodes(mol.GetNumAtoms())

    # Two directed edges per bond, kept adjacent: (u -> v) then (v -> u).
    src_list, dst_list = [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src_list += [begin, end]
        dst_list += [end, begin]
    g.add_edges(src_list, dst_list)

    if add_self_loop:
        all_nodes = g.nodes()
        g.add_edges(all_nodes, all_nodes)

    # Featurization
    if atom_featurizer is not None:
        g.ndata.update(atom_featurizer(mol))
    if bond_featurizer is not None:
        g.edata.update(bond_featurizer(mol))

    return g
def mol_to_graph(mol, graph_constructor, atom_featurizer, bond_featurizer):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces atom features used to update ``ndata``; skipped when None.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces bond features used to update ``edata``; skipped when None.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    # NOTE(review): CanonicalRankAtoms yields one rank per atom while
    # RenumberAtoms takes, per new position, the old atom index; passing
    # ranks directly may apply the inverse permutation -- confirm.
    ranks = rdmolfiles.CanonicalRankAtoms(mol)
    renumbered = rdmolops.RenumberAtoms(mol, ranks)

    g = graph_constructor(renumbered)

    # Both featurizers see the renumbered molecule so that feature rows line
    # up with the node/edge ids of the constructed graph.
    if atom_featurizer is not None:
        g.ndata.update(atom_featurizer(renumbered))
    if bond_featurizer is not None:
        g.edata.update(bond_featurizer(renumbered))

    return g
def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
    """Calculate features for molecules.

    Parameters
    ----------
    molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
        RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
        strings.
    log_every_n: int, default 1000
        Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
        A numpy array containing a featurized representation of `datapoints`.
        A datapoint that fails to featurize contributes an empty array; when
        that makes the per-datapoint shapes differ, the result is an object
        array holding one array per datapoint.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdmolfiles
        from rdkit.Chem import rdmolops
        from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    # Special case handling of single molecule
    if isinstance(molecules, (str, Mol)):
        molecules = [molecules]
    else:
        # Convert iterables to list
        molecules = list(molecules)

    features = []
    for i, mol in enumerate(molecules):
        if i % log_every_n == 0:
            logger.info("Featurizing datapoint %i", i)
        try:
            if isinstance(mol, str):
                # mol must be a RDKit Mol object, so parse a SMILES
                mol = Chem.MolFromSmiles(mol)
                # SMILES is unique, so set a canonical order of atoms
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)

            features.append(self._featurize(mol))
        except Exception as e:
            # Deliberate best effort: one bad datapoint must not abort the
            # whole batch, so it is logged and replaced by an empty array.
            if isinstance(mol, Chem.rdchem.Mol):
                mol = Chem.MolToSmiles(mol)
            logger.warning(
                "Failed to featurize datapoint %d, %s. Appending empty array",
                i, mol)
            logger.warning("Exception message: %s", e)
            features.append(np.array([]))

    try:
        features = np.asarray(features)
    except ValueError:
        # NumPy >= 1.24 refuses to build ragged arrays implicitly, which is
        # exactly what happens once a failed datapoint adds an empty array
        # next to real features. Fall back to an explicit object array.
        features = np.asarray(features, dtype=object)
    return features
def fingerprint_features(smile_string, radius=2, size=2048):
    """Return the Morgan fingerprint bit vector of a SMILES string.

    Parameters
    ----------
    smile_string : str
        SMILES string to parse.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    size : int
        Number of bits in the fingerprint (``nBits``).

    Returns
    -------
    The bit vector returned by
    ``rdMolDescriptors.GetMorganFingerprintAsBitVect``, computed with
    chirality and bond types enabled and feature invariants disabled.
    """
    molecule = MolFromSmiles(smile_string)
    canonical_ranks = rdmolfiles.CanonicalRankAtoms(molecule)
    molecule = rdmolops.RenumberAtoms(molecule, canonical_ranks)

    fingerprint_options = dict(
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        molecule, radius, **fingerprint_options)
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Any graph topology can be produced, since the topology is delegated to
    ``graph_constructor``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces node features used to update ``ndata``; skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces edge features used to update ``edata``; skipped when None.
    canonical_atom_order : bool
        Whether to renumber atoms with the ranking returned by RDKit. Setting
        it to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        If True ``Chem.AddHs`` is applied, otherwise ``Chem.RemoveHs``.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    # Hydrogens are normalised first so the ranking below covers the final
    # atom set.
    mol = Chem.AddHs(mol) if explicit_hydrogens else Chem.RemoveHs(mol)

    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    if node_featurizer is not None:
        node_feats = node_featurizer(mol)
        g.ndata.update(node_feats)

    if edge_featurizer is not None:
        edge_feats = edge_featurizer(mol)
        g.edata.update(edge_feats)

    return g
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute & add features for that compound to the
    features dataframe.

    Two paths exist. When the featurizer's class qualified name contains
    ``'Comet'``, the SMILES are preprocessed with ``preprocess_df`` and fed
    to ``featurizer._featurize`` chunk by chunk. Otherwise every SMILES is
    parsed with RDKit, renumbered and passed to ``featurizer.featurize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``field`` holds SMILES strings.
    featurizer : object
        Featurizer; must provide ``_featurize`` (Comet path) or
        ``featurize`` (default path).
    field : str
        Name of the SMILES column.
    log_every_N : int
        Default path only: a progress message is logged for every index
        divisible by this value.
    verbose : bool
        Default path only: forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Features of the entries whose featurization is non-empty. On the
        default path axis 1 is squeezed out.
    valid_inds : numpy.ndarray of bool
        True where the corresponding featurization had ``size > 0``.
    """
    sample_elems = df[field].tolist()

    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops
    if 'Comet' in str(featurizer.__class__.__qualname__):
        # Batched path. NUM_WORKERS and BATCH_SIZE are module-level names
        # defined outside this function.
        mols = preprocess_df(sample_elems, NUM_WORKERS)
        # "+ 1" guarantees at least one chunk even for short inputs.
        mols_chunks = np.array_split(mols, len(mols) // BATCH_SIZE + 1)
        for chunk in mols_chunks:
            # presumably every element of ``mols`` is an (X, A, L) triple
            # produced by preprocess_df -- TODO confirm against that helper
            X, A, L = list(zip(*chunk))
            X = np.array(X, dtype=np.uint8)
            A = np.array(A, dtype=np.float32)
            L = np.array(L, dtype=np.uint8)
            # NOTE(review): uses the LAST entry of L as the crop length, which
            # assumes the chunk is ordered so that the last sample is the
            # longest -- verify against preprocess_df
            max_len = L[-1]
            # Crop the second axis of X and the last two axes of A to max_len.
            X = X[:, :max_len, :]
            A = A[:, :max_len, :max_len]
            temp = featurizer._featurize((X, A))
            features += list(temp)
        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.array(features), valid_inds
    else:
        for ind, elem in enumerate(sample_elems):
            mol = Chem.MolFromSmiles(elem)
            # TODO (ytz) this is a bandage solution to reorder the atoms so
            # that they're always in the same canonical order. Presumably this
            # should be correctly implemented in the future for graph mols.
            if mol:
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)
            if ind % log_every_N == 0:
                log("Featurizing sample %d" % ind, verbose)
            # An unparsable SMILES is still passed on (as None); entries whose
            # featurization comes back empty are filtered out below.
            features.append(featurizer.featurize([mol]))
        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.squeeze(np.array(features), axis=1), valid_inds
def fingerprint_features(smile_string, radius=2, size=256):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    Parameters
    ----------
    smile_string : str
        SMILES string to parse.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    size : int
        Number of bits in the fingerprint (``nBits``).

    Returns
    -------
    numpy.ndarray
        ``int8`` array filled by ``DataStructs.ConvertToNumpyArray`` from the
        fingerprint bit vector.
    """
    molecule = MolFromSmiles(smile_string)
    canonical_ranks = rdmolfiles.CanonicalRankAtoms(molecule)
    molecule = rdmolops.RenumberAtoms(molecule, canonical_ranks)

    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        molecule,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )

    # The zero-length array is only a typed target that
    # ConvertToNumpyArray fills in place.
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vector, arr)
    return arr
def build_graph_from_molecule(mol, use_master_atom=False):
    """Encode a molecule as a node-feature matrix plus an adjacency list.

    Param:
        mol - rdkit.Chem.rdchem.Mol
        use_master_atom - bool. When True an extra "master" node is appended
            whose features are the mean of all atom features (introduced in
            https://arxiv.org/pdf/1704.01212.pdf) and every atom lists it as
            a neighbour.

    Output:
        nodes - np.ndarray with one row per node (atoms first, master atom
            last when requested)
        canon_adj_list - list. Entry ``i`` holds the indices of the nodes
            that node ``i`` is connected to. The master atom's own entry is
            left empty.

    Raises:
        TypeError - if ``mol`` is not an ``rdkit.Chem.rdchem.Mol``.
    """
    if not isinstance(mol, Chem.rdchem.Mol):
        raise TypeError("'mol' must be rdkit.Chem.rdchem.Mol obj")

    # Renumbering idea taken from deepchem.data.data_loader
    # featurize_smiles_df (TODO by ytz): a bandage solution so that atoms
    # always come in the same order.
    # NOTE(review): CanonicalRankAtoms yields one rank per atom while
    # RenumberAtoms takes, per new position, the old atom index; passing
    # ranks directly may apply the inverse permutation -- confirm.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    # Sort by atom index so that row i of ``nodes`` describes atom i.
    indexed_features = sorted(
        (atom.GetIdx(), encode_atom(atom)) for atom in mol.GetAtoms())
    _, per_atom_features = zip(*indexed_features)
    nodes = np.vstack(per_atom_features)

    if use_master_atom:
        mean_features = np.mean(nodes, axis=0)
        master_row = np.expand_dims(mean_features, axis=0)
        nodes = np.concatenate([nodes, master_row], axis=0)

    node_count = len(nodes)
    canon_adj_list = [[] for _ in range(node_count)]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if use_master_atom:
        master_index = node_count - 1
        for neighbours in canon_adj_list[:master_index]:
            neighbours.append(master_index)

    return (nodes, canon_adj_list)
def featurize(self, molecules, log_every_n=1000):
    """Calculate features for molecules.

    Parameters
    ----------
    molecules: RDKit Mol / SMILES string /iterable
        RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
        strings.
    log_every_n: int, default 1000
        Logging messages reported every `log_every_n` samples.

    Returns
    -------
    A numpy array containing a featurized representation of `datapoints`.
    A datapoint that fails to featurize contributes an empty array; when that
    makes the per-datapoint shapes differ, the result is an object array
    holding one array per datapoint.

    Raises
    ------
    ValueError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdmolfiles
        from rdkit.Chem import rdmolops
        from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
        raise ValueError("This class requires RDKit to be installed.")

    # Special case handling of single molecule
    if isinstance(molecules, (str, Mol)):
        molecules = [molecules]
    else:
        # Convert iterables to list
        molecules = list(molecules)

    features = []
    for i, mol in enumerate(molecules):
        if i % log_every_n == 0:
            logger.info("Featurizing datapoint %i", i)
        try:
            # Process only case of SMILES strings.
            if isinstance(mol, str):
                # mol must be a SMILES string so parse
                mol = Chem.MolFromSmiles(mol)
                # TODO (ytz) this is a bandage solution to reorder the atoms
                # so that they're always in the same canonical order.
                # Presumably this should be correctly implemented in the
                # future for graph mols.
                if mol:
                    new_order = rdmolfiles.CanonicalRankAtoms(mol)
                    mol = rdmolops.RenumberAtoms(mol, new_order)
            features.append(self._featurize(mol))
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still stop the run. The failure
            # itself stays best effort: log it and keep going.
            logger.warning(
                "Failed to featurize datapoint %d. Appending empty array", i)
            features.append(np.array([]))

    try:
        features = np.asarray(features)
    except ValueError:
        # NumPy >= 1.24 refuses to build ragged arrays implicitly, which is
        # exactly what happens once a failed datapoint adds an empty array
        # next to real features. Fall back to an explicit object array.
        features = np.asarray(features, dtype=object)
    return features
def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    """Featurize the SMILES strings held in one column of a dataframe.

    Private helper: every SMILES is parsed with RDKit, renumbered when
    parsing succeeded, and handed to ``featurizer.featurize`` as a
    one-element list. Entries whose featurization is empty are dropped.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame that holds SMILES strings
    featurizer: Featurizer
        A featurizer object
    field: str
        The name of a column in `df` that holds SMILES strings
    log_every_n: int, optional (default 1000)
        Emit a logging statement every `log_every_n` rows.

    Returns
    -------
    features: np.ndarray
        Features of the valid rows with axis 1 squeezed out.
    valid_inds: np.ndarray of bool
        One flag per input row; True where the featurization was non-empty.

    Note
    ----
    This function requires RDKit to be installed
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    smiles_column = df[field].tolist()

    features = []
    for ind, elem in enumerate(smiles_column):
        mol = Chem.MolFromSmiles(elem)
        # Bandage solution (see upstream TODO by ytz): renumber atoms so that
        # graph featurizers always see the same atom order.
        if mol:
            ranks = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, ranks)
        if ind % log_every_n == 0:
            logger.info("Featurizing sample %d" % ind)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
    kept = [elt for elt, keep in zip(features, valid_inds) if keep]
    return np.squeeze(np.array(kept), axis=1), valid_inds
def featurize_smiles(arr):
    """Featurize an array of SMILES strings with ``dc.feat.ConvMolFeaturizer``.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings; iterated via ``arr.tolist()``.

    Returns
    -------
    features : numpy.ndarray
        Non-empty featurizations, squeezed and flattened to one dimension.
    valid_inds : numpy.ndarray of bool
        One flag per input entry; True where the featurization was non-empty.
    """
    featurizer = dc.feat.ConvMolFeaturizer()

    features = []
    for elem in arr.tolist():
        mol = Chem.MolFromSmiles(elem)
        if mol:
            ranks = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, ranks)
        # The featurizer is invoked through its __call__ with a one-element
        # list, even when parsing failed and ``mol`` is None.
        features.append(featurizer([mol]))

    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
    kept = [elt for elt, keep in zip(features, valid_inds) if keep]
    flattened = np.squeeze(np.array(kept)).reshape(-1, )
    return flattened, valid_inds
def get_statistics(molset_fname):
    """Summarise a pickled set of RDKit molecules.

    The file is unpickled and its first element is taken as the iterable of
    molecules.

    NOTE: unpickling executes arbitrary code embedded in the file; only call
    this on molsets from a trusted source.

    Parameters
    ----------
    molset_fname : str
        Path of the pickle file.

    Returns
    -------
    list
        ``[num_distinct_atoms_dataset, num_distinct_atoms, num_bonds,
        num_rotatable_bonds, molecular_mass, contains_symmetric_pair]`` where
        the first entry is the number of distinct element symbols in the
        whole set, the next four are ``(mean, std)`` tuples over molecules,
        and the last is the fraction of molecules in which at least two atoms
        share a canonical rank when ties are not broken.
    """
    with open(molset_fname, 'rb') as f:
        mols = pickle.load(f)[0]

    def _mean_std(values):
        # Mean and standard deviation of one per-molecule statistic.
        as_array = np.array(values)
        return (as_array.mean(), as_array.std())

    dataset_distinct_atoms = set()
    distinct_counts = []
    bond_counts = []
    rotatable_counts = []
    masses = []
    symmetric_flags = []

    for mol in mols:
        symbols = {atom.GetSymbol() for atom in mol.GetAtoms()}
        dataset_distinct_atoms |= symbols
        distinct_counts.append(len(symbols))

        bond_counts.append(len(mol.GetBonds()))
        rotatable_counts.append(Descriptors.NumRotatableBonds(mol))
        masses.append(Descriptors.HeavyAtomMolWt(mol))

        # Without tie breaking, equivalent atoms share a rank, so any
        # duplicate rank signals a symmetric pair.
        ranking = rdmolfiles.CanonicalRankAtoms(mol, breakTies=False)
        has_duplicate_rank = len(ranking) != len(set(ranking))
        symmetric_flags.append(int(has_duplicate_rank))

    return [
        len(dataset_distinct_atoms),
        _mean_std(distinct_counts),
        _mean_std(bond_counts),
        _mean_std(rotatable_counts),
        _mean_std(masses),
        np.array(symmetric_flags).mean(),
    ]
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe and return torch tensors.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute features for every SMILES string in
    ``df[field]``. Entries whose featurization is empty are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``field`` holds SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the SMILES column.
    log_every_N : int
        A progress message is logged for every index divisible by this value.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : torch.Tensor
        Features of the valid rows with dimension 1 squeezed out; an empty
        tensor when no row is valid.
    valid_inds : torch.Tensor of dtype ``torch.bool``
        One flag per input row; True where the featurization was non-empty.
    """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    valid_flags = [elt.size > 0 for elt in features]
    # The legacy ``torch.Tensor(...)`` constructor takes no ``dtype`` keyword
    # (it raised TypeError here); ``torch.tensor`` is the dtype-aware factory.
    valid_inds = torch.tensor(valid_flags, dtype=torch.bool)
    features = [
        elt for (is_valid, elt) in zip(valid_flags, features) if is_valid
    ]

    if not features:
        # Nothing valid: there is no dimension 1 to squeeze.
        return torch.Tensor(features), valid_inds
    return torch.squeeze(torch.Tensor(features), dim=1), valid_inds
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces node features used to update ``ndata``; skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces edge features used to update ``edata``; skipped when None.
    canonical_atom_order : bool
        Whether to renumber atoms with the ranking returned by RDKit. Setting
        it to true might change the order of atoms in the graph constructed.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom while
        # RenumberAtoms takes, per new position, the old atom index; passing
        # ranks directly may apply the inverse permutation -- confirm.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    if node_featurizer is not None:
        node_feats = node_featurizer(mol)
        g.ndata.update(node_feats)

    if edge_featurizer is not None:
        edge_feats = edge_featurizer(mol)
        g.edata.update(edge_feats)

    return g
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and
        None otherwise.

    Raises
    ------
    AssertionError
        If the number of atoms differs from ``coordinates.shape[0]``, or if
        ``keep_dists`` is set and ``dist_field`` is already present in the
        edge data.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): ``coordinates`` is not permuted together with ``mol``,
        # so the edges below follow the input atom order while the featurizers
        # see the renumbered molecule -- confirm this is intended.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The message used to be emitted with a literal "{}" because it was
        # never formatted.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology: connect every virtual node with every real node in
        # both directions.
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        # Virtual nodes/edges were appended last, so they occupy the trailing
        # rows. The extra column must be 1 for THEM (and 0 for real ones), as
        # the docstring states and as mol_to_graph does; the previous slices
        # marked the real rows instead.
        for nk, nv in list(g.ndata.items()):
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[-num_virtual_nodes:, -1] = 1
            g.ndata[nk] = nv

        num_virtual_edges = num_virtual_nodes * num_real_nodes * 2
        for ek, ev in list(g.edata.items()):
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[-num_virtual_edges:, -1] = 1
            g.edata[ek] = ev

    return g
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens, num_virtual_nodes=0):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Any graph topology can be produced, since the topology is delegated to
    ``graph_constructor``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces node features used to update ``ndata``; skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Produces edge features used to update ``edata``; skipped when None.
    canonical_atom_order : bool
        Whether to renumber atoms with the ranking returned by RDKit. Setting
        it to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If
        True, it will call rdkit.Chem.AddHs(mol).
    num_virtual_nodes : int
        The number of virtual nodes to add. Every virtual node is connected
        to every real node in both directions. Each existing node/edge
        feature gains one extra column that is 1 for virtual nodes/edges and
        0 otherwise. Default to 0.

    Returns
    -------
    DGLGraph or None
        Converted DGLGraph for the molecule if :attr:`mol` is valid and None
        otherwise.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Whether to have hydrogen atoms as explicit nodes
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom while
        # RenumberAtoms takes, per new position, the old atom index; passing
        # ranks directly may apply the inverse permutation -- confirm.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if num_virtual_nodes <= 0:
        return g

    num_real_nodes = g.num_nodes()
    real_nodes = list(range(num_real_nodes))
    g.add_nodes(num_virtual_nodes)

    # Topology change: per virtual node, first the real -> virtual edges,
    # then the virtual -> real edges.
    virtual_src = []
    virtual_dst = []
    for virtual_node in range(num_real_nodes, num_real_nodes + num_virtual_nodes):
        repeated_virtual = [virtual_node] * num_real_nodes
        virtual_src += real_nodes + repeated_virtual
        virtual_dst += repeated_virtual + real_nodes
    g.add_edges(virtual_src, virtual_dst)

    # Virtual nodes and edges were appended last, so they are the trailing
    # rows of every feature tensor.
    for nk, nv in g.ndata.items():
        indicator = torch.zeros(g.num_nodes(), 1)
        nv = torch.cat([nv, indicator], dim=1)
        nv[-num_virtual_nodes:, -1] = 1
        g.ndata[nk] = nv

    for ek, ev in g.edata.items():
        indicator = torch.zeros(g.num_edges(), 1)
        ev = torch.cat([ev, indicator], dim=1)
        ev[-num_virtual_nodes * num_real_nodes * 2:, -1] = 1
        g.edata[ek] = ev

    return g
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist'):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Raises
    ------
    AssertionError
        If ``keep_dists`` is set and ``dist_field`` is already present in the
        edge data.
    """
    if canonical_atom_order:
        # NOTE(review): ``coordinates`` is not permuted together with ``mol``,
        # so the edges below follow the input atom order while the featurizers
        # see the renumbered molecule -- confirm this is intended.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    num_atoms = mol.GetNumAtoms()
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The message used to be emitted with a literal "{}" because it was
        # never formatted.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
        Default to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. Default to False.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    # Normalize the hydrogen representation before counting atoms so that the
    # atom count checked below is the one the graph will actually use.
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is
        # used exactly as passed in, so node features and neighbor edges may
        # refer to different atom orderings -- confirm against callers.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The edge featurizer must not have claimed the distance field already.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        # One distance per edge, stored as a float column vector.
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
def is_symmetric_chem(mol):
    """Tell whether any two atoms of a molecule share a canonical rank.

    Atoms are ranked with ``rdmolfiles.CanonicalRankAtoms`` with tie-breaking
    disabled, so atoms that RDKit cannot distinguish receive the same rank.
    A repeated rank is presumably a sign of topological symmetry in the
    molecule (confirm against the RDKit documentation).

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol or str
        Molecule to inspect. A string is parsed as SMILES with
        ``Chem.MolFromSmiles`` first.

    Returns
    -------
    bool
        True if at least two atoms have the same rank, False otherwise.
    """
    # NOTE(review): no check is made that SMILES parsing succeeded; an
    # unparsable string presumably reaches the ranking call as None.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    ranks = list(rdmolfiles.CanonicalRankAtoms(mol, breakTies=False))
    # Duplicated ranks collapse in the set, making it shorter than the list.
    return len(ranks) != len(set(ranks))