def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
    """Standardize a molecule and return it together with its canonical SMILES.

    This method should be called before `get_input_feats`.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    Returns:
    --------
    canonical_smiles: str
        Canonical SMILES written from the input molecule.

    mol: rdkit.Chem.rdchem.Mol
        Molecule rebuilt from the canonical SMILES, with Hs added when
        `self.add_Hs` is set and kekulized when `self.kekulize` is set.
    """
    smiles = rdmolfiles.MolToSmiles(mol, canonical=True)

    # Round-trip through the canonical SMILES so the returned molecule is
    # rebuilt from the standardized string rather than from the input object.
    standardized = rdmolfiles.MolFromSmiles(smiles)

    if self.add_Hs:
        standardized = rdmolops.AddHs(standardized)

    if self.kekulize:
        # The return value is not used: the molecule is modified in place.
        rdmolops.Kekulize(standardized)

    return smiles, standardized
def process_molecule(self, pdb_file, use_esp=False):
    """
    Reads a molecule from a PDB file and extracts its centered coordinates
    and van der Waals radii (plus partial charges on request).

    :param pdb_file: path to the PDB file to process the molecule from.
    :param use_esp: when True, hydrogens are added and Gasteiger partial
        charges are computed and returned under the 'charges' key.
    :return: a dict with "coords" (atom positions minus the centroid) and
        "vdwradii", plus 'charges' when use_esp is set; None when the file
        cannot be read or parsed, or when the charge step raises ValueError.
    """
    # NOTE: Gasteiger is an inappropriate algorithm for ESP calculation of proteins!
    try:
        molecule = Chem.MolFromPDBFile(molFileName=pdb_file,
                                       removeHs=False,
                                       sanitize=True)
    except IOError:
        log.warning("Could not read PDB file.")
        return None

    if molecule is None:
        log.warning("Bad pdb file found.")
        return None

    if use_esp:
        try:
            # Hydrogens have to be present before partial charges are assigned.
            molecule = rdMO.AddHs(molecule, addCoords=True)
            rdPC.ComputeGasteigerCharges(molecule, throwOnParamFailure=True)
        except ValueError:
            log.warning("Bad Gasteiger charge evaluation.")
            return None

    conformer = molecule.GetConformer()
    centroid = rdMT.ComputeCentroid(conformer, ignoreHs=False)
    n_atoms = molecule.GetNumAtoms()
    atom_seq = molecule.GetAtoms()

    # One XYZ row per atom, in atom-index order.
    positions = []
    for atom_idx in range(n_atoms):
        point = conformer.GetAtomPosition(atom_idx)
        positions.append(np.asarray([point.x, point.y, point.z]))
    origin = np.asarray([centroid.x, centroid.y, centroid.z])

    radii = [
        self.periodic_table.GetRvdw(atom.GetAtomicNum()) for atom in atom_seq
    ]

    res = {
        "coords": np.asarray(positions) - origin,
        "vdwradii": np.asarray(radii),
    }

    if use_esp:
        res['charges'] = np.asarray(
            [float(atom.GetProp("_GasteigerCharge")) for atom in atom_seq])

    return res
def get_all_interactions(self, pdbfile, sdffile):
    """Compute the interactions of every compound in `sdffile` with the residues of `pdbfile`.

    Each compound gets explicit hydrogens before being passed to
    `self.get_interactions`.

    Returns a dict mapping the position of each compound (as yielded by
    `get_compounds`) to the result of `self.get_interactions` for it.
    """
    residue_names, residue_atoms = get_residues_and_atoms(pdbfile)
    ligands = get_compounds(sdffile)

    return {
        position: self.get_interactions(rdmolops.AddHs(ligand),
                                        residue_atoms, residue_names)
        for position, ligand in enumerate(ligands)
    }
def test_mols():
    """Return a few embedded molecules, each tagged with a random 'Fitness' property.

    Every molecule is built from SMILES, embedded in 3D with hydrogens
    present, and then stripped of hydrogens again before being tagged.
    """

    def _embedded(smiles):
        # Hydrogens are added for the embedding step only.
        with_hs = rdmolops.AddHs(rdmolfiles.MolFromSmiles(smiles),
                                 addCoords=True)
        rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
        heavy_only = rdmolops.RemoveHs(with_hs)
        heavy_only.SetProp('Fitness', str(np.random.rand(1)[0]))
        return heavy_only

    smiles_set = ('CN=C=O', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2',
                  'CCCCCCCCCCCCCCCC')
    return [_embedded(smiles) for smiles in smiles_set]
def apply_retrorules(self, smile, rxns, explicit_hydrogens=False):
    '''Apply every reaction in `rxns` to the substrate given as a SMILES string.

    :param smile: SMILES of the substrate.
    :param rxns: dict mapping a reaction name to a reaction object exposing
        `RunReactants`.
    :param explicit_hydrogens: when set, hydrogens are added to the substrate
        before the reactions are run and removed again from every product.
    :return: dict mapping a reaction name to a list of product sets, each
        product set being a list of SMILES strings. Reactions that yield no
        valid product are left out. An empty dict is returned when the
        substrate SMILES cannot be parsed.
    '''
    try:
        substrate_molecule = AllChem.MolFromSmiles(smile)
    except Exception:
        return {}

    # RDKit reports an unparsable SMILES by returning None rather than by
    # raising, so the except clause above does not cover that case.
    if substrate_molecule is None:
        return {}

    if explicit_hydrogens:
        substrate_molecule = rdmolops.AddHs(substrate_molecule)

    rxn_product_dict = {}
    for rxn_name, rxn in rxns.items():
        try:
            products = rxn.RunReactants((substrate_molecule, ))
        except Exception:
            # Best effort: a failing reaction is reported and skipped.
            products = []
            print('Error running reactants for: ' + str(smile))

        smiles_products = []
        for product in products:
            sub_list = []
            for product_mol in product:
                # Fallback when the product cannot be split into fragments.
                fragments = [product_mol]

                if explicit_hydrogens:
                    product_mol = rdmolops.RemoveHs(product_mol)

                try:
                    fragments = rdmolops.GetMolFrags(product_mol, asMols=True)
                except Exception:
                    pass

                for fragment in fragments:
                    try:
                        p_smile = AllChem.MolToSmiles(fragment)
                        p_smile = rdkit_smile(p_smile)
                        # The validator's return type is not visible here, so
                        # the explicit comparison is kept as written.
                        if self._check_valid_smile(
                                p_smile, rxn_name=rxn_name) == True:
                            sub_list.append(p_smile)
                    except Exception:
                        # Products that cannot be converted are dropped.
                        pass

            # Keep each distinct, non-empty product set once.
            if sub_list and sub_list not in smiles_products:
                smiles_products.append(sub_list)

        if smiles_products:
            rxn_product_dict[rxn_name] = smiles_products

    return rxn_product_dict
def opt_geometry(mol, max_iter, mmffvariant, seed, max_attempts):
    """Add hydrogens, embed a 3D conformer and relax it with the MMFF force field.

    :param mol: molecule to embed and optimize.
    :param max_iter: passed to `MMFFOptimizeMolecule` as `maxIters`.
    :param mmffvariant: passed to `MMFFOptimizeMolecule` as `mmffVariant`.
    :param seed: passed to `EmbedMolecule` as `randomSeed`.
    :param max_attempts: passed to `EmbedMolecule` as `maxAttempts`.
    :return: tuple `(mol, err)`. `mol` is the molecule as far as processing
        got (the input object if adding hydrogens raised). `err` is 1 when
        embedding failed or a ValueError/TypeError was raised, 0 otherwise.
    """
    err = 0
    try:
        mol = rdmolops.AddHs(mol)
        conf_id = AllChem.EmbedMolecule(mol,
                                        useRandomCoords=True,
                                        useBasicKnowledge=True,
                                        randomSeed=seed,
                                        clearConfs=True,
                                        maxAttempts=max_attempts)
        if conf_id == -1:
            # Embedding failed, so there is no conformer to optimize: report
            # the failure instead of relying on the optimizer to raise.
            err = 1
        else:
            AllChem.MMFFOptimizeMolecule(mol,
                                         maxIters=max_iter,
                                         mmffVariant=mmffvariant)
    except (ValueError, TypeError):
        err = 1
    return mol, err
def construct_pos_matrix(mol: rdchem.Mol,
                         out_size: Optional[int] = -1) -> np.ndarray:
    """Construct relative positions from each atom within the molecule.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest. If it has no conformer, one is generated by
        embedding a copy with hydrogens and removing the hydrogens again.

    out_size: int, optional, default=-1
        The size of the returned array. If this option is negative (or None),
        it does not take any effect. Otherwise, it must be larger than or
        equal to the number of atoms in the input molecule. If so, the end of
        the array is padded with zeros.

    Returns:
    --------
    pos_matrix: np.ndarray, shape=(n,n,3)
        Relative position (XYZ) coordinates from one atom the others in the
        mol: entry [i, j] is the position of atom i minus that of atom j.

    Raises:
    -------
    ValueError
        If `out_size` is non-negative but smaller than the number of atoms,
        or if no conformer could be generated for the molecule.

    Examples:
    ---------
    ```python
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem
    >>> smiles = ('N[C@@]([H])([C@]([H])(O2)C)C(=O)N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])([C@]([H])'
    ...           '(O)C)C(=O)N[C@@]([H])(Cc1ccc(O)cc1)C(=O)2')
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> mol = Chem.AddHs(mol, addCoords=True)
    >>> _ = AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    >>> mol = Chem.RemoveHs(mol)
    >>> pos_matrix = construct_pos_matrix(mol, out_size=-1)
    >>> pos_matrix.shape
    (34, 34, 3)
    >>> pos_matrix = construct_pos_matrix(mol, out_size=49)
    >>> pos_matrix.shape
    (49, 49, 3)
    ```
    """
    # Obtain initial distance geometry between atoms, if unavailable.
    if mol.GetNumConformers() == 0:
        mol = rdmolops.AddHs(mol, addCoords=True)
        if rdDistGeom.EmbedMolecule(mol, rdDistGeom.ETKDG()) < 0:
            raise ValueError(
                'Could not generate a conformer for the given molecule.')
        mol = rdmolops.RemoveHs(mol)

    coords = np.asarray(mol.GetConformer().GetPositions(),
                        dtype=float)  # shape=(N,3)
    N = mol.GetNumAtoms()

    # Determine appropriate output size to generate feature matrix of same
    # size for all mols.
    if out_size is None or out_size < 0:
        size = N
    elif out_size >= N:
        size = out_size
    else:
        raise ValueError(
            '`out_size` (N={}) is smaller than number of atoms in mol (N={})'.
            format(out_size, N))

    # `np.float` was removed from NumPy; the builtin float is the same dtype.
    pos_matrix = np.zeros(shape=(size, size, 3), dtype=float)

    # Broadcasting gives every pairwise difference at once:
    # [i, j] = coords[i] - coords[j]; the padded region stays zero.
    pos_matrix[:N, :N] = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]

    return pos_matrix
def sample_mol():
    """Return the molecule for SMILES 'CN=C=O', embedded in 3D and without hydrogens.

    Hydrogens are added only for the embedding step and removed afterwards.
    """
    with_hs = rdmolops.AddHs(rdmolfiles.MolFromSmiles('CN=C=O'),
                             addCoords=True)
    rdDistGeom.EmbedMolecule(with_hs, rdDistGeom.ETKDG())
    return rdmolops.RemoveHs(with_hs)
def _all_aromatic(flags):
    """Return True when `flags` is non-empty and every entry is truthy."""
    flags = list(flags)
    return bool(flags) and all(flags)


def _cut_policy(mol, atom, group):
    """Return `(skip_hydrogens, flag_aromatic)` for cutting the bonds of `atom`.

    `skip_hydrogens` tells whether bonds to hydrogens are kept (not cut);
    `flag_aromatic` tells whether cuts towards aromatic neighbors get the
    (1, 1) dummy label instead of (0, 0).
    """
    atomic_num = atom.GetAtomicNum()
    lone = len(group) == 1

    if atomic_num == 6:
        # Carbonyl carbons keep their hydrogens so that aldehydes and
        # ketones can be told apart.
        idx = atom.GetIdx()
        has_carbonyl = any(
            nbr.GetAtomicNum() == 8 and str(
                mol.GetBondBetweenAtoms(idx, nbr.GetIdx()).GetBondType()) ==
            "DOUBLE" for nbr in atom.GetNeighbors())
        return has_carbonyl, False

    if atomic_num == 7:
        # To discriminate between anilines and amines (primary, secondary, etc).
        if not lone:
            return False, False
        heavy_nbrs = [
            x.GetAtomicNum() for x in atom.GetNeighbors()
            if x.GetAtomicNum() != 1
        ]
        return True, heavy_nbrs.count(6) == 1

    if atomic_num == 8:
        # To discriminate alcohols from phenols and ethers from carboxylic
        # acids. Oxygen keeps its hydrogens in multi-atom groups as well.
        if not lone:
            return True, False
        heavy_nbrs = [
            x.GetAtomicNum() for x in atom.GetNeighbors()
            if x.GetAtomicNum() != 1
        ]
        return True, len(heavy_nbrs) == 1 and heavy_nbrs.count(6) == 1

    if atomic_num == 16:
        return lone, False

    return False, False


def _bonds_leaving_group(mol, atom, group, skip_hydrogens, flag_aromatic):
    """Return the bond indices (and dummy labels) from `atom` to atoms outside `group`."""
    idx = atom.GetIdx()
    bond_ids = []
    bond_labels = []
    for nbr in atom.GetNeighbors():
        jdx = nbr.GetIdx()
        if jdx in group:
            continue
        if skip_hydrogens and nbr.GetAtomicNum() == 1:
            continue
        bond_ids.append(mol.GetBondBetweenAtoms(idx, jdx).GetIdx())
        if flag_aromatic and nbr.GetIsAromatic():
            bond_labels.append((1, 1))
        else:
            bond_labels.append((0, 0))
    return bond_ids, bond_labels


def identify_functional_groups(smi):
    """Return the SMILES of the functional groups (with environment) found in `smi`.

    The molecule is parsed from SMILES and given explicit hydrogens. Atoms
    matching any pattern in `PATT_TUPLE` are marked, connected marked atoms
    are merged into groups, and each group is cut out of the molecule along
    bonds chosen per atom type. Groups and fragments made only of aromatic
    atoms are discarded.

    :param smi: SMILES string of the molecule.
    :return: list of unique fragment SMILES (explicit hydrogens), sorted so
        the result is deterministic. If any step raises, the SMILES of the
        whole molecule is returned as the only element.
    """
    # Start from a SMILES and add explicit hydrogens inside the function.
    mol = Chem.MolFromSmiles(smi)
    mol = rdmolops.AddHs(mol)
    try:
        # Heteroatoms are included in PATT_TUPLE, so pattern matching alone
        # marks every atom of interest.
        marked = set()
        for patt in PATT_TUPLE:
            for path in mol.GetSubstructMatches(patt):
                marked.update(path)

        # Merge all connected marked atoms to a single FG.
        groups = []
        while marked:
            grp = set([marked.pop()])
            merge(mol, marked, grp)
            groups.append(list(grp))

        # Drop groups made only of aromatic atoms. A new list is built:
        # removing from a list while iterating over it skips elements.
        groups = [
            g for g in groups if not _all_aromatic(
                mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in g)
        ]

        # Identify bonds to break and hydrogens to keep for every FG, then
        # cut the group out of the molecule.
        fragment_smiles = []
        for g in groups:
            group_bonds = []
            group_labels = []
            for idx in g:
                atom = mol.GetAtomWithIdx(idx)
                skip_hydrogens, flag_aromatic = _cut_policy(mol, atom, g)
                atom_bonds, atom_labels = _bonds_leaving_group(
                    mol, atom, g, skip_hydrogens, flag_aromatic)
                group_bonds.extend(atom_bonds)
                group_labels.extend(atom_labels)

            frag = Chem.FragmentOnBonds(mol,
                                        group_bonds,
                                        dummyLabels=group_labels)
            for atom_ids in rdmolops.GetMolFrags(frag):
                # Keep the piece that contains the group itself.
                if g[0] in atom_ids:
                    fragment_smiles.append(
                        Chem.MolFragmentToSmiles(frag,
                                                 atom_ids,
                                                 canonical=True,
                                                 allHsExplicit=True))

        # Remove fragments whose real atoms (not dummies or hydrogens) are
        # all aromatic; again by filtering into a new list.
        FGS_ENVS = []
        for env in sorted(set(fragment_smiles)):
            fg = Chem.MolFromSmiles(env)
            if fg is None:
                fg = Chem.MolFromSmarts(env)
            if not _all_aromatic(atom.GetIsAromatic()
                                 for atom in fg.GetAtoms()
                                 if atom.GetSymbol() not in ["*", "H"]):
                FGS_ENVS.append(env)
        return FGS_ENVS
    except Exception:
        # When the molecule is as small as a single FG (or any step above
        # fails), fall back to the whole molecule.
        FGS_ENVS = [Chem.MolToSmiles(mol, canonical=True, allHsExplicit=True)]
        return FGS_ENVS
def process_molecule(self, pdb_file):
    """
    Splits the molecules into separate channels.

    The PDB file is read with RDKit, hydrogens are added, and the result is
    written next to the input as a "_hydrogenized" PDB file. That file is
    then parsed again (via `pd.parsePDB`) to select the per-channel atoms.

    :param pdb_file: the pdb file to be processed
    :return: a dictionary of the coordinates and vdwradii for each channel
        (all atoms, each of the 20 standard amino acids, backbone, heavy
        atoms and hydrogens), with coordinates relative to the centroid;
        None if the file cannot be read or parsed. Channels with no atoms
        hold empty arrays.
    """
    # NOTE(review): a file name without any '.' maps to itself here, so the
    # hydrogenized output would overwrite the input - confirm with callers.
    hydro_file_name = '_hydrogenized.'.join(
        os.path.basename(pdb_file).split('.'))
    hydrogenized_pdb_file = os.path.join(os.path.dirname(pdb_file),
                                         hydro_file_name)

    try:
        mol_rdkit = Chem.MolFromPDBFile(molFileName=pdb_file,
                                        removeHs=False,
                                        sanitize=True)
        if mol_rdkit is None:
            raise ValueError
        mol_rdkit = rdMO.AddHs(mol_rdkit, addCoords=True)

        # get the conformation of the molecule
        conformer = mol_rdkit.GetConformer()

        # calculate the center of the molecule
        center = rdMT.ComputeCentroid(conformer, ignoreHs=False)
        mol_center = np.asarray([center.x, center.y, center.z])

        pdbw = Chem.rdmolfiles.PDBWriter(fileName=hydrogenized_pdb_file)
        try:
            pdbw.write(mol_rdkit)
            pdbw.flush()
        finally:
            # Close the writer even when writing fails.
            pdbw.close()
        del mol_rdkit, pdbw
    except (IOError, ValueError):
        log.warning("Bad PDB file.")
        return None

    try:
        mol = pd.parsePDB(hydrogenized_pdb_file)
    except IOError:
        log.warning("Could not read PDB file.")
        return None

    if mol is None:
        log.warning("Bad pdb file found.")
        return None

    std_amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU',
                       'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE',
                       'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']

    def canonical_notation(symbol):
        # Element symbols are looked up as e.g. 'Fe', not 'FE'.
        if len(symbol) > 1:
            return symbol[0].upper() + symbol[1:].lower()
        return symbol

    def indices_of(selection):
        # An empty selection comes back as None; map it to an empty index
        # array so that the channel is simply empty.
        if selection is None:
            return np.array([], dtype=np.int32)
        return selection.getIndices()

    res = {
        'coords': mol.getCoords() - mol_center,
        'vdwradii': np.asarray([
            self.periodic_table.GetRvdw(
                self.periodic_table.GetAtomicNumber(canonical_notation(atom)))
            for atom in mol.getElements()
        ]),
    }

    def add_channel(name, selection):
        mask = indices_of(selection)
        res['coords_' + name] = res['coords'][mask, :]
        res['vdwradii_' + name] = res['vdwradii'][mask]

    # find the data for all the 20 amino acids
    for aa in std_amino_acids:
        add_channel(aa, mol.select('resname ' + aa))

    # find the data for the backbones
    add_channel('backbone', mol.backbone)

    # find the data for the heavy atoms (i.e. no H atoms)
    add_channel('heavy', mol.heavy)

    # find the data for the hydrogen atoms
    add_channel('hydro', mol.hydrogen)

    return res