def smile2graph(smile, add_self_loop=False,
                atom_featurizer=CanonicalAtomFeaturizer(),
                bond_featurizer=None):
    """Build a DGLGraph from a SMILES string.

    The molecule is parsed with RDKit and its atoms are renumbered using the
    output of ``rdmolfiles.CanonicalRankAtoms`` before the graph is built.
    Atom ``i`` of the renumbered molecule becomes node ``i``. Bond ``i``
    (``mol.GetBondWithIdx(i)``) becomes edges ``2i`` and ``2i + 1``: the first
    goes from ``bond.GetBeginAtomIdx()`` to ``bond.GetEndAtomIdx()`` and the
    second goes the other way. When self loops are requested they are appended
    after all bond edges, one per node.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    add_self_loop : bool
        Whether to append a self loop for every node. Default to False.
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Called on the molecule; the returned dict is merged into ``g.ndata``.
        The default ``CanonicalAtomFeaturizer()`` instance is created once,
        when this function is defined. Pass None to skip atom featurization.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Called on the molecule; the returned dict is merged into ``g.edata``.
        Default to None (no bond featurization).

    Returns
    -------
    DGLGraph
        Graph built for the molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    # NOTE(review): CanonicalRankAtoms yields one rank per atom, whereas
    # RenumberAtoms takes newOrder[new_idx] = old_idx -- confirm that passing
    # the ranks directly (rather than their inverse permutation) is intended.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = DGLGraph()
    g.add_nodes(mol.GetNumAtoms())

    # Every bond contributes one edge per direction, in bond-index order.
    sources, targets = [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        sources += [begin, end]
        targets += [end, begin]
    g.add_edges(sources, targets)

    if add_self_loop:
        all_nodes = g.nodes()
        g.add_edges(all_nodes, all_nodes)

    # Featurization
    if atom_featurizer is not None:
        g.ndata.update(atom_featurizer(mol))
    if bond_featurizer is not None:
        g.edata.update(bond_featurizer(mol))

    return g
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings held in a numpy array.

    Each element of ``arr`` is parsed with RDKit; successfully parsed
    molecules are renumbered with the output of ``CanonicalRankAtoms``. The
    (possibly unparsed) molecule is then handed to ``featurizer.featurize``
    as a one-element list. Results whose ``size`` is zero are dropped, and
    the remaining ones are stacked, squeezed and flattened to one dimension.

    Parameters
    ----------
    arr : numpy.ndarray
        Array whose ``tolist()`` yields SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    log_every_N : int
        A progress message is emitted every ``log_every_N`` samples.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of the retained features.
    """
    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    def _parse(smiles):
        # Falsy parse results are passed through untouched.
        parsed = Chem.MolFromSmiles(smiles)
        if not parsed:
            return parsed
        return rdmolops.RenumberAtoms(parsed,
                                      rdmolfiles.CanonicalRankAtoms(parsed))

    for position, smiles in enumerate(arr.tolist()):
        mol = _parse(smiles)
        if position % log_every_N == 0:
            log("Featurizing sample %d" % position, verbose)
        features.append(featurizer.featurize([mol]))

    # Keep only the non-empty featurization results.
    keep_mask = np.array([1 if item.size > 0 else 0 for item in features],
                         dtype=bool)
    retained = [item for keep, item in zip(keep_mask, features) if keep]
    stacked = np.squeeze(np.array(retained))
    return stacked.reshape(-1, )
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Every entry of ``df[field]`` is parsed with RDKit. Parsed molecules are
    renumbered with the output of ``CanonicalRankAtoms`` and then passed, as
    a one-element list, to ``featurizer.featurize``. Entries whose result has
    ``size == 0`` are treated as invalid and removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the column in ``df`` that holds the SMILES strings.
    log_every_N : int
        A progress message is emitted every ``log_every_N`` samples.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Retained features with axis 1 squeezed out.
    valid_inds : numpy.ndarray of bool
        One flag per input row, True where the featurization was non-empty.
    """
    features = []
    for row, smiles in enumerate(df[field].tolist()):
        mol = Chem.MolFromSmiles(smiles)
        # TODO (ytz) this is a bandage solution to reorder the atoms so
        # that they're always in the same canonical order. Presumably this
        # should be correctly implemented in the future for graph mols.
        if mol:
            mol = rdmolops.RenumberAtoms(mol,
                                         rdmolfiles.CanonicalRankAtoms(mol))
        if row % log_every_N == 0:
            log("Featurizing sample %d" % row, verbose)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([1 if item.size > 0 else 0 for item in features],
                          dtype=bool)
    retained = [item for keep, item in zip(valid_inds, features) if keep]
    return np.squeeze(np.array(retained), axis=1), valid_inds
def smiles2adjoin(smiles, explicit_hydrogens=True, canonical_atom_order=False):
    """Return atom symbols and an adjacency matrix for a SMILES string.

    If RDKit cannot parse ``smiles`` directly, ``'error'`` is printed and the
    string is converted with ``obsmitosmile`` and parsed again; an
    ``AssertionError`` is raised when that second attempt also fails.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    explicit_hydrogens : bool
        If True ``Chem.AddHs`` is applied, otherwise ``Chem.RemoveHs``.
    canonical_atom_order : bool
        If True the atoms are renumbered with the output of
        ``CanonicalRankAtoms`` before anything is extracted.

    Returns
    -------
    atoms_list : list of str
        Atom symbols, in atom-index order.
    adjoin_matrix : numpy.ndarray
        Square matrix with ones on the diagonal and at ``[u, v]`` / ``[v, u]``
        for every bond between atoms ``u`` and ``v``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Second chance: convert the string with obsmitosmile and re-parse.
        print('error')
        mol = Chem.MolFromSmiles(obsmitosmile(smiles))
        assert mol is not None, smiles + ' is not valid '

    mol = Chem.AddHs(mol) if explicit_hydrogens else Chem.RemoveHs(mol)

    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    num_atoms = mol.GetNumAtoms()
    atoms_list = [mol.GetAtomWithIdx(idx).GetSymbol()
                  for idx in range(num_atoms)]

    # Start from the identity so that every atom is adjacent to itself.
    adjoin_matrix = np.eye(num_atoms)
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        adjoin_matrix[begin, end] = 1.0
        adjoin_matrix[end, begin] = 1.0

    return atoms_list, adjoin_matrix
def mol_to_graph(mol, graph_constructor, atom_featurizer, bond_featurizer):
    """Turn an RDKit molecule into a featurized DGLGraph.

    The atoms of ``mol`` are first renumbered with the output of
    ``rdmolfiles.CanonicalRankAtoms``; the renumbered molecule is what the
    constructor and both featurizers receive.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.ndata``. Skipped when None.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.edata``. Skipped when None.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    # NOTE(review): CanonicalRankAtoms yields one rank per atom, whereas
    # RenumberAtoms takes newOrder[new_idx] = old_idx -- confirm that passing
    # the ranks directly (rather than their inverse permutation) is intended.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))
    g = graph_constructor(mol)

    # Apply whichever featurizers were supplied to their feature stores.
    for featurizer, store in ((atom_featurizer, g.ndata),
                              (bond_featurizer, g.edata)):
        if featurizer is not None:
            store.update(featurizer(mol))

    return g
def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
    """Calculate features for molecules.

    Parameters
    ----------
    molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
        RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
        strings.
    log_every_n: int, default 1000
        Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
        A numpy array containing a featurized representation of `datapoints`.
        A datapoint that fails to featurize contributes an empty array. When
        the per-datapoint results cannot be stacked into a regular array, a
        one-dimensional object array holding each result is returned instead.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdmolfiles
        from rdkit.Chem import rdmolops
        from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError as err:
        raise ImportError("This class requires RDKit to be installed.") from err

    # Special case handling of single molecule
    if isinstance(molecules, (str, Mol)):
        molecules = [molecules]
    else:
        # Convert iterables to list
        molecules = list(molecules)

    features = []
    for i, datapoint in enumerate(molecules):
        if i % log_every_n == 0:
            logger.info("Featurizing datapoint %i" % i)

        # Keep `datapoint` untouched so a failed parse can still be reported
        # with the original SMILES string instead of `None`.
        mol = datapoint
        try:
            if isinstance(mol, str):
                # mol must be a RDKit Mol object, so parse a SMILES
                mol = Chem.MolFromSmiles(mol)
                if mol is None:
                    raise ValueError("RDKit could not parse the SMILES string")
                # SMILES is unique, so set a canonical order of atoms
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)

            features.append(self._featurize(mol))
        except Exception as e:
            if isinstance(mol, Mol):
                described = Chem.MolToSmiles(mol)
            else:
                described = datapoint
            logger.warning(
                "Failed to featurize datapoint %d, %s. Appending empty array",
                i, described)
            logger.warning("Exception message: {}".format(e))
            features.append(np.array([]))

    try:
        return np.asarray(features)
    except ValueError:
        # Recent NumPy versions refuse to build an array from results of
        # different shapes (e.g. one failed datapoint among valid ones).
        # Fall back to an object array with one entry per datapoint.
        ragged = np.empty(len(features), dtype=object)
        for k, feat in enumerate(features):
            ragged[k] = feat
        return ragged
def fingerprint_features(smile_string, radius=2, size=2048):
    """Return the Morgan fingerprint bit vector of a SMILES string.

    The molecule is parsed, its atoms are renumbered with the output of
    ``CanonicalRankAtoms``, and the fingerprint is computed with chirality
    and bond types enabled and feature invariants disabled.

    Parameters
    ----------
    smile_string : str
        SMILES string of the molecule.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``. Default to 2.
    size : int
        Number of bits (``nBits``) of the fingerprint. Default to 2048.

    Returns
    -------
    The object returned by ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
    """
    parsed = MolFromSmiles(smile_string)
    ranks = rdmolfiles.CanonicalRankAtoms(parsed)
    reordered = rdmolops.RenumberAtoms(parsed, ranks)

    fingerprint_options = {
        'nBits': size,
        'useChirality': True,
        'useBondTypes': True,
        'useFeatures': False,
    }
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        reordered, radius, **fingerprint_options)
def load_reaction_data(self, file_path):
    """Load reaction data from the raw file.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the atom-mapped reaction and its graph edits. Blank lines are skipped,
    and so are lines whose reactants RDKit cannot parse.

    Parameters
    ----------
    file_path : str
        Path to read the file.

    Returns
    -------
    all_mols : list of rdkit.Chem.rdchem.Mol
        RDKit molecule instances for the reactants, with atoms reordered so
        that the atom with map number ``k`` sits at index ``k - 1``.
    all_reactions : list of str
        Reactions
    all_graph_edits : list of str
        Graph edits in the reactions.
    """
    all_mols = []
    all_reactions = []
    all_graph_edits = []
    with open(file_path, 'r') as f:
        for line_idx, line in enumerate(f):
            if line_idx % 10000 == 0:
                print('Processing line {:d}'.format(line_idx))
            # Each line represents a reaction and the corresponding graph edits
            #
            # reaction example:
            # [CH3:14][OH:15].[NH2:12][NH2:13].[OH2:11].[n:1]1[n:2][cH:3][c:4]
            # ([C:7]([O:9][CH3:8])=[O:10])[cH:5][cH:6]1>>[n:1]1[n:2][cH:3][c:4]
            # ([C:7](=[O:9])[NH:12][NH2:13])[cH:5][cH:6]1
            # The reactants are on the left-hand-side of the reaction and the
            # product is on the right-hand-side of the reaction. The numbers
            # represent atom mapping.
            #
            # graph_edits example:
            # 23-33-1.0;23-25-0.0
            # For a triplet a-b-c, a and b are the atoms that form or lose the
            # bond. c specifies the particular change, 0.0 for losing a bond,
            # 1.0, 2.0, 3.0 and 1.5 separately for forming a single, double,
            # triple or aromatic bond.
            fields = line.strip("\r\n ").split()
            if not fields:
                # A blank (e.g. trailing) line carries no reaction.
                continue
            reaction, graph_edits = fields
            reactants = reaction.split('>')[0]
            mol = Chem.MolFromSmiles(reactants)
            if mol is None:
                continue

            # Reorder atoms according to the order specified in the atom map.
            # atom_map_order[new_idx] holds the current index of the atom that
            # should end up at new_idx.
            num_atoms = mol.GetNumAtoms()
            atom_map_order = [-1] * num_atoms
            for atom_idx in range(num_atoms):
                atom = mol.GetAtomWithIdx(atom_idx)
                atom_map_order[atom.GetIntProp('molAtomMapNumber') - 1] = atom_idx
            mol = rdmolops.RenumberAtoms(mol, atom_map_order)

            all_mols.append(mol)
            all_reactions.append(reaction)
            all_graph_edits.append(graph_edits)

    return all_mols, all_reactions, all_graph_edits
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens):
    """Turn an RDKit molecule into a featurized DGLGraph.

    Any kind of ``DGLGraph`` can be produced, depending on the
    ``graph_constructor`` supplied.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.ndata``. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.edata``. Skipped when None.
    canonical_atom_order : bool
        Whether to renumber the atoms with the output of RDKit's
        ``CanonicalRankAtoms``. Setting it to true might change the order of
        atoms in the graph constructed.
    explicit_hydrogens : bool
        If True ``Chem.AddHs`` is applied to the molecule, otherwise
        ``Chem.RemoveHs`` is applied.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    # Whether to have hydrogen atoms as explicit nodes
    hydrogen_op = Chem.AddHs if explicit_hydrogens else Chem.RemoveHs
    mol = hydrogen_op(mol)

    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom, whereas
        # RenumberAtoms takes newOrder[new_idx] = old_idx -- confirm that
        # passing the ranks directly is intended.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    for featurizer, store in ((node_featurizer, g.ndata),
                              (edge_featurizer, g.edata)):
        if featurizer is not None:
            store.update(featurizer(mol))

    return g
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Two paths exist. When the featurizer's class qualname contains
    ``'Comet'``, the strings are preprocessed with ``preprocess_df``, split
    into ``len(mols) // BATCH_SIZE + 1`` chunks and featurized chunk by chunk
    through ``featurizer._featurize``. Otherwise each string is parsed with
    RDKit, renumbered with the output of ``CanonicalRankAtoms`` and passed to
    ``featurizer.featurize`` as a one-element list.

    In both paths results with ``size == 0`` are treated as invalid and
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings.
    featurizer : object
        Featurizer object.
    field : str
        Name of the column in ``df`` that holds the SMILES strings.
    log_every_N : int
        A progress message is emitted every ``log_every_N`` samples (generic
        path only).
    verbose : bool
        Forwarded to ``log`` (generic path only).

    Returns
    -------
    features : numpy.ndarray
        Retained features; the generic path squeezes out axis 1.
    valid_inds : numpy.ndarray of bool
        One flag per featurization result, True where it was non-empty.
    """
    sample_elems = df[field].tolist()
    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    def _drop_empty(results):
        # Flag non-empty results and return them together with the mask.
        mask = np.array([1 if item.size > 0 else 0 for item in results],
                        dtype=bool)
        retained = [item for keep, item in zip(mask, results) if keep]
        return retained, mask

    if 'Comet' not in str(featurizer.__class__.__qualname__):
        for row, smiles in enumerate(sample_elems):
            mol = Chem.MolFromSmiles(smiles)
            # TODO (ytz) this is a bandage solution to reorder the atoms so
            # that they're always in the same canonical order. Presumably this
            # should be correctly implemented in the future for graph mols.
            if mol:
                mol = rdmolops.RenumberAtoms(
                    mol, rdmolfiles.CanonicalRankAtoms(mol))
            if row % log_every_N == 0:
                log("Featurizing sample %d" % row, verbose)
            features.append(featurizer.featurize([mol]))
        retained, valid_inds = _drop_empty(features)
        return np.squeeze(np.array(retained), axis=1), valid_inds

    mols = preprocess_df(sample_elems, NUM_WORKERS)
    num_chunks = len(mols) // BATCH_SIZE + 1
    for chunk in np.array_split(mols, num_chunks):
        X, A, L = zip(*chunk)
        X = np.array(X, dtype=np.uint8)
        A = np.array(A, dtype=np.float32)
        L = np.array(L, dtype=np.uint8)
        # The chunk is truncated to the last entry of L.
        # NOTE(review): presumably L holds lengths sorted ascending, so the
        # last one is the maximum -- verify against preprocess_df.
        max_len = L[-1]
        trimmed = (X[:, :max_len, :], A[:, :max_len, :max_len])
        features.extend(featurizer._featurize(trimmed))

    retained, valid_inds = _drop_empty(features)
    return np.array(retained), valid_inds
def fingerprint_features(smile_string, radius=2, size=256):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    The molecule is parsed, its atoms are renumbered with the output of
    ``CanonicalRankAtoms``, and the fingerprint (chirality and bond types
    enabled, feature invariants disabled) is written into a numpy array
    through ``DataStructs.ConvertToNumpyArray``.

    Parameters
    ----------
    smile_string : str
        SMILES string of the molecule.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``. Default to 2.
    size : int
        Number of bits (``nBits``) of the fingerprint. Default to 256.

    Returns
    -------
    numpy.ndarray
        The array filled by ``ConvertToNumpyArray``; it is created with dtype
        ``int8`` and zero length before being handed over.
    """
    parsed = MolFromSmiles(smile_string)
    ranks = rdmolfiles.CanonicalRankAtoms(parsed)
    reordered = rdmolops.RenumberAtoms(parsed, ranks)

    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        reordered, radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False)

    # Destination buffer filled in place by the conversion below.
    out = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vector, out)
    return out
def build_graph_from_molecule(mol, use_master_atom=False):
    """Extract node features and an adjacency list from an RDKit molecule.

    Param:
        mol - rdkit.Chem.rdchem.Mol
        use_master_atom - bool. When True an extra node is appended whose
            features are the mean of all atom features, and its index is
            added to the adjacency list of every real atom.

    Output:
        nodes - np.ndarray with one row per atom (plus one row for the
            master atom when requested), built from ``encode_atom``.
        canon_adj_list - list. index corresponds to the index of node and
            canon_adj_list[index] corresponds to indices of the nodes that
            node i is connected to.

    Raises:
        TypeError - if ``mol`` is not an rdkit.Chem.rdchem.Mol.
    """
    if not isinstance(mol, Chem.rdchem.Mol):
        raise TypeError("'mol' must be rdkit.Chem.rdchem.Mol obj")

    # Same reordering as deepchem.data.data_loader featurize_smiles_df:
    # TODO (ytz) this is a bandage solution to reorder the atoms so
    # that they're always in the same canonical order. Presumably this
    # should be correctly implemented in the future for graph mols.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    # Encode every atom and stack the encodings in atom-index order.
    indexed = sorted((atom.GetIdx(), encode_atom(atom))
                     for atom in mol.GetAtoms())
    _, encodings = zip(*indexed)
    nodes = np.vstack(encodings)

    # Master atom is the "average" of all atoms that is connected to all atom
    # Introduced in https://arxiv.org/pdf/1704.01212.pdf
    if use_master_atom:
        nodes = np.concatenate(
            [nodes, np.mean(nodes, axis=0, keepdims=True)], axis=0)

    num_nodes = len(nodes)
    canon_adj_list = [[] for _ in range(num_nodes)]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if use_master_atom:
        master_idx = num_nodes - 1
        # NOTE(review): only the real atoms list the master atom; the master
        # atom's own adjacency list stays empty -- confirm this is intended.
        for neighbours in canon_adj_list[:master_idx]:
            neighbours.append(master_idx)

    return (nodes, canon_adj_list)
def featurize(self, molecules, log_every_n=1000):
    """Calculate features for molecules.

    Parameters
    ----------
    molecules: RDKit Mol / SMILES string /iterable
        RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
        strings.
    log_every_n: int, default 1000
        Logging messages reported every `log_every_n` samples.

    Returns
    -------
    A numpy array containing a featurized representation of `datapoints`.
    A datapoint that fails to featurize contributes an empty array. When the
    per-datapoint results cannot be stacked into a regular array, a
    one-dimensional object array holding each result is returned instead.

    Raises
    ------
    ValueError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdmolfiles
        from rdkit.Chem import rdmolops
        from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError as err:
        # ValueError is kept for backward compatibility with existing callers.
        raise ValueError("This class requires RDKit to be installed.") from err

    # Special case handling of single molecule
    if isinstance(molecules, (str, Mol)):
        molecules = [molecules]
    else:
        # Convert iterables to list
        molecules = list(molecules)

    features = []
    for i, mol in enumerate(molecules):
        if i % log_every_n == 0:
            logger.info("Featurizing datapoint %i" % i)
        try:
            # Process only case of SMILES strings.
            if isinstance(mol, str):
                # mol must be a SMILES string so parse
                mol = Chem.MolFromSmiles(mol)
                # TODO (ytz) this is a bandage solution to reorder the atoms
                # so that they're always in the same canonical order.
                # Presumably this should be correctly implemented in the
                # future for graph mols.
                if mol:
                    new_order = rdmolfiles.CanonicalRankAtoms(mol)
                    mol = rdmolops.RenumberAtoms(mol, new_order)
            features.append(self._featurize(mol))
        except Exception:
            # `except Exception` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate. The index is passed explicitly;
            # without it the "%d" placeholder was logged verbatim.
            logger.warning(
                "Failed to featurize datapoint %d. Appending empty array", i)
            features.append(np.array([]))

    try:
        return np.asarray(features)
    except ValueError:
        # Recent NumPy versions refuse to build an array from results of
        # different shapes (e.g. one failed datapoint among valid ones).
        # Fall back to an object array with one entry per datapoint.
        ragged = np.empty(len(features), dtype=object)
        for k, feat in enumerate(features):
            ragged[k] = feat
        return ragged
def featurize_smiles(arr):
    """Featurize an array of SMILES strings with a ``ConvMolFeaturizer``.

    Each string is parsed with RDKit; parsed molecules are renumbered with
    the output of ``CanonicalRankAtoms``. The molecule is then passed, as a
    one-element list, to a ``dc.feat.ConvMolFeaturizer`` instance. Results
    with ``size == 0`` are treated as invalid and dropped.

    Parameters
    ----------
    arr : numpy.ndarray
        Array whose ``tolist()`` yields SMILES strings.

    Returns
    -------
    features : numpy.ndarray
        Retained features, squeezed and flattened to one dimension.
    valid_inds : numpy.ndarray of bool
        One flag per input, True where the featurization was non-empty.
    """
    featurizer = dc.feat.ConvMolFeaturizer()

    features = []
    for smiles in arr.tolist():
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            mol = rdmolops.RenumberAtoms(mol,
                                         rdmolfiles.CanonicalRankAtoms(mol))
        features.append(featurizer([mol]))

    valid_inds = np.array([1 if item.size > 0 else 0 for item in features],
                          dtype=bool)
    retained = [item for keep, item in zip(valid_inds, features) if keep]
    flattened = np.squeeze(np.array(retained)).reshape(-1, )
    return flattened, valid_inds
def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Private helper. Every entry of ``df[field]`` is parsed with RDKit, parsed
    molecules are renumbered with the output of ``CanonicalRankAtoms``, and
    the molecule is passed to ``featurizer.featurize`` as a one-element list.
    Results with ``size == 0`` are treated as invalid and dropped.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame that holds SMILES strings
    featurizer: Featurizer
        A featurizer object
    field: str
        The name of a column in `df` that holds SMILES strings
    log_every_n: int, optional (default 1000)
        Emit a logging statement every `log_every_n` rows.

    Returns
    -------
    features: np.ndarray
        Retained features with axis 1 squeezed out.
    valid_inds: np.ndarray of bool
        One flag per row, True where the featurization was non-empty.

    Note
    ----
    This function requires RDKit to be installed
    """
    sample_elems = df[field].tolist()

    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    for row, smiles in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(smiles)
        # TODO (ytz) this is a bandage solution to reorder the atoms
        # so that they're always in the same canonical order.
        # Presumably this should be correctly implemented in the
        # future for graph mols.
        if mol:
            mol = rdmolops.RenumberAtoms(mol,
                                         rdmolfiles.CanonicalRankAtoms(mol))
        if row % log_every_n == 0:
            logger.info("Featurizing sample %d" % row)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([1 if item.size > 0 else 0 for item in features],
                          dtype=bool)
    retained = [item for keep, item in zip(valid_inds, features) if keep]
    return np.squeeze(np.array(retained), axis=1), valid_inds
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe, returning torch tensors.

    Every entry of ``df[field]`` is parsed with RDKit; parsed molecules are
    renumbered with the output of ``CanonicalRankAtoms``. The molecule is
    then passed, as a one-element list, to ``featurizer.featurize``. Results
    with ``size == 0`` are treated as invalid and dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the column in ``df`` that holds the SMILES strings.
    log_every_N : int
        A progress message is emitted every ``log_every_N`` samples.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : torch.Tensor
        Retained features with dimension 1 squeezed out.
    valid_inds : torch.Tensor of bool
        One flag per input row, True where the featurization was non-empty.
    """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    # Filter with plain Python booleans, then build the mask tensor.
    valid_flags = [elt.size > 0 for elt in features]
    features = [elt for is_valid, elt in zip(valid_flags, features)
                if is_valid]

    # The legacy `torch.Tensor(...)` constructor takes no `dtype` argument;
    # `torch.tensor` is the factory that does.
    valid_inds = torch.tensor(valid_flags, dtype=torch.bool)
    return torch.squeeze(torch.Tensor(features), dim=1), valid_inds
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order):
    """Turn an RDKit molecule into a featurized DGLGraph.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.ndata``. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.edata``. Skipped when None.
    canonical_atom_order : bool
        Whether to renumber the atoms with the output of RDKit's
        ``CanonicalRankAtoms`` before constructing the graph. Setting it to
        true might change the order of atoms in the graph constructed.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom, whereas
        # RenumberAtoms takes newOrder[new_idx] = old_idx -- confirm that
        # passing the ranks directly is intended.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    for featurizer, store in ((node_featurizer, g.ndata),
                              (edge_featurizer, g.edata)):
        if featurizer is not None:
            store.update(featurizer(mol))

    return g
#Calculate Boltzmann averaged k_B = 0.0019872041 #in kcal mol-1 K-1 as MOE gives energies in kcal mol-1 T = 298.15 #Room temperature in K. k_BT = k_B * T sum_exp_EkBT = 0 for i in range(0, len(conformers), 1): sum_exp_EkBT += math.exp(-energies[i] / k_BT) writer = Chem.SDWriter(args.output_file) #Loop over the conformer molecules and renumber them for i in range(0, len(conformers)): m = rdmolops.RenumberAtoms( conformers[i], ssm) #renumber molecules to match atom numbering in RefMol #calculate distances of restraints and indicate if both average restraints =<5Angstrom in the conformer restraint_1_dist = (Chem.rdMolTransforms.GetBondLength( m.GetConformer(), atom_dict[2], atom_dict[38]) + Chem.rdMolTransforms.GetBondLength( m.GetConformer(), atom_dict[4], atom_dict[38])) / 2 restraint_2_dist = (Chem.rdMolTransforms.GetBondLength( m.GetConformer(), atom_dict[1], atom_dict[28]) + Chem.rdMolTransforms.GetBondLength( m.GetConformer(), atom_dict[5], atom_dict[28])) / 2 m.SetProp("restraint_1_dist", str(restraint_1_dist)) m.SetProp("restraint_2_dist", str(restraint_2_dist)) if ((restraint_1_dist <= args.cutoff)
def update_ti_atoms(mol_list, off_list):
    """Reorder and rename the atoms of a molecule pair around their common core.

    ``compare_mols`` is run on the two entries of ``off_list``; the atoms of
    the first entry selected by its result serve as the common-core
    reference. For each molecule, atoms matching a reference atom (according
    to ``compare_atom``) form the core and all others follow. Core atoms get
    ``core = True`` and the rest ``core = False``. Every atom is renamed to
    its element symbol followed by a per-element counter starting at 1, core
    atoms first. Finally the RDKit molecule and the atom list are both
    reordered as core atoms followed by the remaining atoms.

    The atom objects inside ``off_list`` are modified in place (``core`` and
    ``name`` attributes).

    Parameters
    ----------
    mol_list : sequence of length 2
        RDKit molecules.
    off_list : sequence of length 2
        Per-molecule atom sequences matching ``mol_list``; each atom exposes
        ``element`` and accepts ``core`` / ``name`` assignments.

    Returns
    -------
    out_mols : list
        Copies of the input molecules with renumbered atoms.
    out_off : list of list
        The atom objects of each molecule in the same new order.
    """
    assert len(mol_list) == 2
    assert len(off_list) == 2

    periodic = {
        '6': 'C',
        '1': 'H',
        '8': 'O',
        '7': 'N',
        '17': 'Cl',
        '9': 'F',
        '16': 'S',
        '35': 'Br',
        '15': 'P',
        '53': 'I',
    }

    reference = off_list[0]
    MCS_atoms_amber = [reference[idx]
                       for idx in compare_mols(off_list[0], off_list[1])]

    out_mols = []
    out_off = []
    for mol, mol_amber in zip(mol_list, off_list):
        # Per-element running counters used to build the new atom names.
        ele_count = {6: 1, 1: 1, 8: 1, 7: 1, 17: 1,
                     9: 1, 16: 1, 35: 1, 15: 1, 53: 1}

        def rename(atom):
            # Name = element symbol + current counter, then bump the counter.
            symbol = periodic[str(atom.element)]
            number = int(atom.element)
            atom.name = symbol + str(ele_count[number])
            ele_count[number] += 1

        mol_copy = Chem.Mol(mol)
        num_atoms = len(mol.GetAtoms())

        # Indices of atoms matching a common-core atom, in discovery order.
        write_core = []
        in_core = set()
        for core_atom in MCS_atoms_amber:
            for j in range(num_atoms):
                if compare_atom(core_atom, mol_amber[j]) and j not in in_core:
                    write_core.append(j)
                    in_core.add(j)

        # Everything else, in ascending index order.
        write_last = [j for j in range(num_atoms) if j not in in_core]

        for j in range(num_atoms):
            mol_amber[j].core = j in in_core

        for j in write_core:
            rename(mol_amber[j])
        for j in range(num_atoms):
            if mol_amber[j].core:
                continue
            rename(mol_amber[j])

        new_order = write_core + write_last
        # return a re-ordered mol
        out_mols.append(rdmolops.RenumberAtoms(mol_copy, new_order))
        # return matching re-ordered amber off
        out_off.append([mol_amber[j] for j in new_order])

    return out_mols, out_off
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True, it will
        call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and
        None otherwise.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): only `mol` is renumbered here; `coordinates` is used
        # below in its original order, so node features follow the renumbered
        # atoms while the edges follow the coordinate order -- confirm that
        # callers account for this.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology: connect each virtual node with every real node,
        # in both directions.
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        # Append the indicator column. As documented above, it is the virtual
        # nodes/edges (the trailing rows) that carry the 1, so that their
        # features are zero everywhere except in this column.
        for nk, nv in g.ndata.items():
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[-num_virtual_nodes:, -1] = 1
            g.ndata[nk] = nv

        num_virtual_edges = num_virtual_nodes * num_real_nodes * 2
        for ek, ev in g.edata.items():
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[-num_virtual_edges:, -1] = 1
            g.edata[ek] = ev

    return g
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens, num_virtual_nodes=0):
    """Turn an RDKit molecule into a featurized DGLGraph.

    Any kind of ``DGLGraph`` can be produced, depending on the
    ``graph_constructor`` supplied.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.ndata``. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Its result is merged into ``g.edata``. Skipped when None.
    canonical_atom_order : bool
        Whether to renumber the atoms with the output of RDKit's
        ``CanonicalRankAtoms``. Setting it to true might change the order of
        atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If
        True, ``rdkit.Chem.AddHs(mol)`` is called.
    num_virtual_nodes : int
        Number of virtual nodes to append. Each one is connected to every
        real node in both directions. Every node/edge feature present on the
        graph gains one extra column that is 1 for virtual nodes/edges and 0
        for real ones. Default to 0.

    Returns
    -------
    DGLGraph or None
        Converted DGLGraph for the molecule if :attr:`mol` is valid and None
        otherwise.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Whether to have hydrogen atoms as explicit nodes
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    for featurizer, store in ((node_featurizer, g.ndata),
                              (edge_featurizer, g.edata)):
        if featurizer is not None:
            store.update(featurizer(mol))

    if num_virtual_nodes <= 0:
        return g

    num_real_nodes = g.num_nodes()
    real_nodes = list(range(num_real_nodes))
    g.add_nodes(num_virtual_nodes)

    # Change Topology: for each virtual node add real -> virtual edges
    # followed by virtual -> real edges.
    virtual_src, virtual_dst = [], []
    for virtual_node in range(num_real_nodes,
                              num_real_nodes + num_virtual_nodes):
        repeated = [virtual_node] * num_real_nodes
        virtual_src += real_nodes + repeated
        virtual_dst += repeated + real_nodes
    g.add_edges(virtual_src, virtual_dst)

    # Append the indicator column; the trailing (virtual) rows are set to 1.
    for nk, nv in g.ndata.items():
        indicator = torch.zeros(g.num_nodes(), 1)
        nv = torch.cat([nv, indicator], dim=1)
        nv[-num_virtual_nodes:, -1] = 1
        g.ndata[nk] = nv

    num_virtual_edges = num_virtual_nodes * num_real_nodes * 2
    for ek, ev in g.edata.items():
        indicator = torch.zeros(g.num_edges(), 1)
        ev = torch.cat([ev, indicator], dim=1)
        ev[-num_virtual_edges:, -1] = 1
        g.edata[ek] = ev

    return g
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist'):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is not None and
        None otherwise.
    """
    # Same handling of invalid molecules as ``mol_to_graph``.
    if mol is None:
        print('Invalid mol found')
        return None

    if canonical_atom_order:
        # NOTE(review): the atoms are renumbered here while ``coordinates`` is
        # used as given below -- confirm that node features and the
        # coordinate-derived edges refer to the same atom order.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    num_atoms = mol.GetNumAtoms()
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The edge featurizer must not have claimed the distance field already.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        ``Chem.AddHs`` is applied to the molecule, otherwise ``Chem.RemoveHs``.
        Default to False.

    Returns
    -------
    g : DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is not None and
        None otherwise.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    # Same handling of invalid molecules as ``mol_to_graph``.
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    # The atom count is taken after hydrogens were added/removed, so
    # ``coordinates`` has to describe exactly that set of atoms.
    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): the atoms are renumbered here while ``coordinates`` is
        # used as given below -- confirm that node features and the
        # coordinate-derived edges refer to the same atom order.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The edge featurizer must not have claimed the distance field already.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g