def get_multiring_atoms_bonds(self, rdk_mol: Mol, smiles):
    """Count the atoms and bonds that sit in more than one ring (not used).

    Ring membership is tallied from ``rdk_mol.GetRingInfo()``.  The SMILES is
    also parsed with pybel; when RDKit's ring count differs from the length
    of pybel's ``sssr``, a warning is printed and the counts are replaced by
    the number of ``[R2]`` SMARTS matches (atoms) and that number minus one
    (bonds).

    :param rdk_mol: RDKit molecule whose ring info is inspected.
    :param smiles: SMILES string handed to ``pybel.readstring``.
    :return: tuple ``(n_atoms_multiring, n_bonds_multiring)``.
    """
    atom_hits = [0] * rdk_mol.GetNumAtoms()
    bond_hits = [0] * rdk_mol.GetNumBonds()

    # TODO GetRingInfo gives SymmetricSSSR, not TRUE SSSR
    ring_info = rdk_mol.GetRingInfo()
    for ring_atoms in ring_info.AtomRings():
        for atom_idx in ring_atoms:
            atom_hits[atom_idx] += 1
    for ring_bonds in ring_info.BondRings():
        for bond_idx in ring_bonds:
            bond_hits[bond_idx] += 1

    # An index counted more than once belongs to several rings.
    n_atoms_multiring = sum(1 for hits in atom_hits if hits > 1)
    n_bonds_multiring = sum(1 for hits in bond_hits if hits > 1)

    py_mol = pybel.readstring('smi', smiles)
    if ri_mismatch := (ring_info.NumRings() != len(py_mol.sssr)):
        print(
            'WARNING: SymmetricSSSR not equal to TRUE SSSR in rdkit. Use Openbabel instead:',
            smiles)
        n_atoms_multiring = len(pybel.Smarts('[R2]').findall(py_mol))
        n_bonds_multiring = n_atoms_multiring - 1

    return n_atoms_multiring, n_bonds_multiring
def build_position_matrix(molecule: Mol) -> np.ndarray:
    """Return the atom coordinates of the molecule's conformer as an array.

    Each atom position is fetched once from ``molecule.GetConformer()`` and
    unpacked into its ``x``, ``y`` and ``z`` components.

    :param molecule: molecule providing ``GetConformer()`` and
        ``GetNumAtoms()``.
    :return: float array of shape ``(num_atoms, 3)``; a molecule without
        atoms yields shape ``(0, 3)``.
    """
    conf = molecule.GetConformer()
    positions = (conf.GetAtomPosition(k) for k in range(molecule.GetNumAtoms()))
    rows = [(p.x, p.y, p.z) for p in positions]
    # reshape keeps the (N, 3) contract even when there are no atoms at all.
    return np.array(rows, dtype=float).reshape(-1, 3)
def generate_conformers(lig_file, init='generate_conformers_init'):
    """Convert one ligand PDB to SDF and write its optimized conformers.

    Steps performed:
      * read ``lig_file`` with RDKit and save the molecule to an SDF file
        under ``init.mol_path``;
      * add hydrogens, embed ``init.num_conformers`` conformers and
        MMFF-optimize each one;
      * strip hydrogens again and write every conformer to a PDB file under
        ``init.out_lig_path``.

    :param lig_file: path to the ligand PDB; the first
        ``len(init.lig_path) + 1`` characters are dropped to obtain the
        output file name.
    :param init: name of an object reachable from this module that exposes
        ``lig_path``, ``out_lig_path``, ``mol_path`` and ``num_conformers``.
    :return: ``[[pdb_file, mol_file, num_atoms]]`` where ``num_atoms`` is the
        atom count of the last molecule written (or of the parsed input when
        no conformer was embedded).
    """
    # SECURITY NOTE(review): `init` is resolved with eval(); this is only safe
    # while the string comes from trusted code, never from external input.
    init = eval(init)
    lig_name = lig_file[len(init.lig_path)+1:]

    pdb_file = os.path.join(init.out_lig_path, lig_name)
    mol_file = os.path.join(init.mol_path, lig_name).replace('.pdb', '.sdf')

    # use rdkit to get a mol object from the PDB and keep it as SDF for later
    mol = Chem.MolFromPDBFile(lig_file)
    writer = SDWriter(mol_file)
    try:
        writer.write(mol)
    finally:
        # close explicitly so the SDF is flushed to disk
        writer.close()

    # generate conformers on the hydrogen-complete molecule
    mol2 = Chem.AddHs(mol)
    pdb_writer = PDBWriter(pdb_file)
    try:
        conf_ids = AllChem.EmbedMultipleConfs(mol2, init.num_conformers)
        for cid in conf_ids:
            AllChem.MMFFOptimizeMolecule(mol2, confId=cid)
            mol = Chem.RemoveHs(mol2)
            # Pass the conformer id: without it the writer emits the default
            # conformer on every iteration instead of the one just optimized.
            pdb_writer.write(mol, confId=cid)
    finally:
        pdb_writer.close()

    num_atoms = mol.GetNumAtoms()
    print('Generated conformers for one ligand')
    return [[pdb_file, mol_file, num_atoms]]
def build_adjacency_matrix(molecule: Mol) -> np.ndarray:
    """Build the symmetric adjacency matrix of a molecule, self-loops included.

    :param molecule: molecule providing ``GetNumAtoms()`` and ``GetBonds()``.
    :return: ``(num_atoms, num_atoms)`` float array with ones on the diagonal
        and at ``[i, j]`` / ``[j, i]`` for every bond between atoms i and j.
    """
    # The identity matrix supplies the self-connections on the diagonal.
    adjacency = np.eye(molecule.GetNumAtoms())
    for bond in molecule.GetBonds():
        i = bond.GetBeginAtom().GetIdx()
        j = bond.GetEndAtom().GetIdx()
        adjacency[i, j] = 1
        adjacency[j, i] = 1
    return adjacency
def add_map_numbers(mol: Mol) -> Mol:
    """Return a SMILES round-tripped copy of ``mol`` with random atom map numbers.

    The numbers ``1..num_atoms`` are shuffled with ``np.random.shuffle`` and
    assigned to the atoms in iteration order, one number per atom.

    :param mol: input molecule; it is re-created from its SMILES rather than
        modified in place.
    :return: the re-created molecule carrying the map numbers.
    """
    # Per the original author's note, the mol -> SMILES -> mol round trip is
    # what makes the atom order canonical.
    canonical = Chem.MolFromSmiles(Chem.MolToSmiles(mol))

    shuffled = np.arange(1, canonical.GetNumAtoms() + 1)
    np.random.shuffle(shuffled)

    for atom, map_num in zip(canonical.GetAtoms(), shuffled):
        atom.SetAtomMapNum(int(map_num))
    return canonical
def add_benzene_ring(mol: Mol, start_atom_ind: int, ring_atom_maps: List[int]):
    """Attach an aromatic carbon ring to ``mol`` in place and return it.

    ``ring_atom_maps`` lists the atom map numbers of the ring members in ring
    order.  A map number equal to the start atom's refers to the start atom,
    a map number already present in ``mol`` reuses that atom, and any other
    map number creates a new aromatic carbon that inherits ``in_reactant``
    and ``mol_id`` from the start atom (defaults ``False`` and ``1``).
    Consecutive ring members, plus the first and last, are then joined by
    aromatic bonds where no bond exists yet; every ring bond is flagged
    ``is_edited``.

    :param mol: editable molecule (must support ``AddAtom`` / ``AddBond``).
    :param start_atom_ind: index of the atom the ring is anchored on.
    :param ring_atom_maps: atom map numbers of the ring atoms, in ring order.
    :return: the same ``mol`` object, modified.
    """

    def flag_ring_bond(first: int, second: int) -> None:
        """Make sure a bond joins the two atoms and mark it as edited."""
        ring_bond = mol.GetBondBetweenAtoms(first, second)
        if ring_bond is None:
            # AddBond's return value minus one is used as the new bond's index.
            created_idx = mol.AddBond(first, second, order=Chem.rdchem.BondType.AROMATIC) - 1
            ring_bond = mol.GetBondWithIdx(created_idx)
        ring_bond.SetBoolProp('is_edited', True)

    # Snapshot of map number -> atom index taken before any atom is added.
    index_by_map = {atom.GetAtomMapNum(): idx for idx, atom in enumerate(mol.GetAtoms())}

    start_atom = mol.GetAtomWithIdx(start_atom_ind)
    start_atom.SetBoolProp('is_edited', True)
    start_atom.SetIsAromatic(True)
    start_atom_map = start_atom.GetAtomMapNum()

    in_reactant = start_atom.GetBoolProp('in_reactant') if start_atom.HasProp('in_reactant') else False
    mol_id = start_atom.GetIntProp('mol_id') if start_atom.HasProp('mol_id') else 1

    ring_indices = []
    for atom_map in ring_atom_maps:
        if atom_map == start_atom_map:
            ring_indices.append(start_atom_ind)
            continue
        if atom_map in index_by_map:
            ring_indices.append(index_by_map[atom_map])
            continue

        # The new atom is appended, so its index is the current atom count.
        next_index = mol.GetNumAtoms()
        carbon = Chem.Atom(6)  # benzene has only carbon atoms
        carbon.SetAtomMapNum(atom_map)
        carbon.SetIsAromatic(True)
        carbon.SetBoolProp('is_edited', True)
        carbon.SetBoolProp('in_reactant', in_reactant)
        carbon.SetIntProp('mol_id', mol_id)
        mol.AddAtom(carbon)
        ring_indices.append(next_index)

    # Bonds between neighbours along the ring ...
    for first, second in zip(ring_indices, ring_indices[1:]):
        flag_ring_bond(first, second)
    # ... and the bond that closes the ring.
    flag_ring_bond(ring_indices[0], ring_indices[-1])

    return mol
def mol_to_extended_graph(molecule: Mol, seed: int = 0) -> Graph:
    """Convert a molecule into a ``Graph`` by embedding its nodes one by one.

    A start atom index is drawn with a NumPy generator seeded by ``seed``; a
    node sequence is obtained from ``get_random_bf_sequence`` on the
    molecule's bond graph, and each node of that sequence is embedded into a
    fresh ``Graph`` with ``embed_node_in_graph``.  The same generator is
    passed to every step.

    :param molecule: molecule to convert.
    :param seed: seed for ``np.random.default_rng``.
    :return: the populated ``Graph``.
    """
    rng = np.random.default_rng(seed=seed)

    # Draw the start index before anything else consumes the generator.
    atom_count = molecule.GetNumAtoms()
    start = rng.integers(low=0, high=atom_count, size=1).item()

    bond_graph = build_bond_graph(molecule)
    node_order = get_random_bf_sequence(graph=bond_graph, start=start, rng=rng)

    result = Graph()
    for node in node_order:
        embed_node_in_graph(result, new_node=node, bond_graph=bond_graph, rng=rng)
    return result
def rdmol_to_data(mol: Mol):
    """Convert an RDKit molecule with exactly one conformer into a ``Data`` object.

    The returned object carries:
      * ``node_type``: atomic numbers (long tensor);
      * ``pos``: conformer 0 coordinates (float tensor);
      * ``edge_index``: both directions of every bond, ordered by
        ``row * N + col``;
      * ``edge_type``: ``BOND_TYPES`` code of each edge, in the same order;
      * ``rdmol``: a deep copy of ``mol``;
      * ``smiles``: SMILES of ``mol``;
      * ``nx``: the undirected networkx view produced by ``to_networkx``.

    Aromaticity / hybridization lists and a per-atom hydrogen count that an
    earlier version computed here were never stored on the result, so that
    dead work has been removed.

    :param mol: RDKit molecule; must have exactly one conformer.
    :return: the populated ``Data`` instance.
    :raises AssertionError: if ``mol`` does not have exactly one conformer.
    """
    assert mol.GetNumConformers() == 1
    N = mol.GetNumAtoms()

    pos = torch.tensor(mol.GetConformer(0).GetPositions(), dtype=torch.float)
    z = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.long)

    # Each bond contributes two directed edges: start -> end and end -> start.
    row, col, edge_type = [], [], []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [start, end]
        col += [end, start]
        edge_type += 2 * [BOND_TYPES[bond.GetBondType()]]

    edge_index = torch.tensor([row, col], dtype=torch.long)
    edge_type = torch.tensor(edge_type)

    # Sort edges by (source, target) so the edge order is deterministic.
    perm = (edge_index[0] * N + edge_index[1]).argsort()
    edge_index = edge_index[:, perm]
    edge_type = edge_type[perm]

    smiles = Chem.MolToSmiles(mol)

    data = Data(node_type=z, pos=pos, edge_index=edge_index, edge_type=edge_type,
                rdmol=copy.deepcopy(mol), smiles=smiles)
    data.nx = to_networkx(data, to_undirected=True)

    return data
def check_num_atoms(mol: rdchem.Mol, max_num_atoms: Optional[int] = -1) -> None:
    """Check number of atoms in `mol` does not exceed `max_num_atoms`.

    If number of atoms in `mol` exceeds the number `max_num_atoms`, it will
    raise `MolFeatureExtractionError` exception.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        The molecule to check.

    max_num_atoms: int, optional, default=-1
        Maximum allowed number of atoms in a molecule. If negative or `None`,
        check passes unconditionally.

    Raises:
    -------
    MolFeatureExtractionError
        If `mol` has more than `max_num_atoms` atoms.
    """
    num_atoms = mol.GetNumAtoms()
    # `None` is accepted by the annotation; treat it like a negative limit
    # instead of failing on the `None >= 0` comparison.
    if max_num_atoms is None or max_num_atoms < 0:
        return
    if num_atoms > max_num_atoms:
        raise MolFeatureExtractionError("Atoms in mol (N={}) exceeds " \
            "num_max_atoms (N={}).".format(num_atoms, max_num_atoms))
def construct_mol_features(mol: rdchem.Mol, out_size: Optional[int] = -1) -> np.ndarray:
    """Returns the atom features of all the atoms in the molecule.

    Per atom, in this order: one-hot symbol, degree, hybridization, implicit
    valence and formal charge; Gasteiger charge (0 when NaN); aromaticity;
    membership in rings of size 3..8; one-hot total hydrogen count; CIP code
    (R/S) and the `_ChiralityPossible` flag; hydrogen donor / acceptor flags;
    acidic / basic flags.

    Note that Gasteiger charges and stereochemistry are computed on `mol`
    itself, so the atoms of the input molecule gain the corresponding
    properties.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    out_size: int, optional, default=-1
        The size of the returned array. If this option is negative (or
        `None`), it does not take any effect. Otherwise, it must be larger
        than or equal to the number of atoms in the input molecule. If so,
        the end of the array is padded with zeros.

    Returns:
    --------
    mol_feats: np.ndarray, shape=(n,m)
        Where `n` is the total number of atoms within the molecule (or
        `out_size` when padding is requested), and `m` is the number of feats.

    Raises:
    -------
    ValueError
        If `out_size` is non-negative but smaller than the number of atoms.
    """
    if out_size is None:
        out_size = -1

    # Calculate charges and chirality of atoms within molecule
    rdPartialCharges.ComputeGasteigerCharges(mol)  # stored under _GasteigerCharge
    rdmolops.AssignStereochemistry(mol)  # stored under _CIPCode, see doc for more info

    # SMARTS patterns for the pharmacophore-like flags
    hydrogen_donor = rdmolfiles.MolFromSmarts(
        "[$([N;!H0;v3,v4&+1]),$([O,S;H1;+0])" + ",n&H1&+0]")
    hydrogen_acceptor = rdmolfiles.MolFromSmarts(
        "[$([O,S;H1;v2;!$(*-*=[O,N,P,S])])" +
        ",$([O,S;H0;v2]),$([O,S;-]),$([N;v3;!$(N-*=[O,N,P,S])]),n&H0&+0," +
        "$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]")
    acidic = rdmolfiles.MolFromSmarts("[$([C,S](=[O,S,P])-[O;H1,-1])]")
    basic = rdmolfiles.MolFromSmarts(
        "[#7;+,$([N;H2&+0][$([C,a]);!$([C,a](=O))])" +
        ",$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))])," +
        "$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]")

    def matched_atoms(pattern) -> set:
        """Collect every atom index that takes part in a match of `pattern`."""
        return {idx for match in mol.GetSubstructMatches(pattern) for idx in match}

    # Sets give O(1) membership tests in the per-atom loop below.
    hydrogen_donor_match = matched_atoms(hydrogen_donor)
    hydrogen_acceptor_match = matched_atoms(hydrogen_acceptor)
    acidic_match = matched_atoms(acidic)
    basic_match = matched_atoms(basic)

    # Get ring info
    ring = mol.GetRingInfo()

    # Loop-invariant vocabularies for the one-hot encodings
    symbols = [
        'C', 'O', 'N', 'S', 'Cl', 'F', 'Br', 'P', 'I', 'Si', 'B', 'Na',
        'Sn', 'Se', 'other'
    ]
    hybridizations = list(rdchem.HybridizationType.names.values())

    mol_feats = []
    n_atoms = mol.GetNumAtoms()
    for atom_idx in range(n_atoms):
        atom = mol.GetAtomWithIdx(atom_idx)
        atom_feats = []
        atom_feats += one_hot(atom.GetSymbol(), symbols)
        atom_feats += one_hot(atom.GetDegree(), [1, 2, 3, 4, 5, 6])
        atom_feats += one_hot(atom.GetHybridization(), hybridizations)
        atom_feats += one_hot(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6])
        atom_feats += one_hot(atom.GetFormalCharge(), [-3, -2, -1, 0, 1, 2, 3])

        g_charge = float(atom.GetProp("_GasteigerCharge"))
        atom_feats += [g_charge] if not np.isnan(g_charge) else [0.]

        atom_feats += [atom.GetIsAromatic()]
        atom_feats += [
            ring.IsAtomInRingOfSize(atom_idx, size) for size in range(3, 9)
        ]
        atom_feats += one_hot(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])

        # Chirality
        try:
            cip_feats = one_hot(atom.GetProp('_CIPCode'), ["R", "S"])
        except Exception:
            # Atoms without a CIP code make GetProp fail; how `one_hot` reacts
            # to an unexpected code is not visible here, so the fallback stays
            # broad, but it no longer swallows KeyboardInterrupt/SystemExit
            # the way a bare `except:` did.
            cip_feats = [False, False]
        atom_feats += cip_feats + [atom.HasProp("_ChiralityPossible")]

        # Hydrogen bonding
        atom_feats += [atom_idx in hydrogen_donor_match]
        atom_feats += [atom_idx in hydrogen_acceptor_match]
        # Is Acidic/Basic
        atom_feats += [atom_idx in acidic_match]
        atom_feats += [atom_idx in basic_match]

        mol_feats.append(atom_feats)

    if 0 <= out_size < n_atoms:
        raise ValueError(
            '`out_size` (N={}) must be negative or larger than or '
            'equal to the number of atoms in the input molecules (N={}).'.
            format(out_size, n_atoms))

    # The `np.float` alias no longer exists in current NumPy; the builtin
    # `float` selects the same float64 dtype.
    if out_size < 0:
        return np.array(mol_feats, dtype=float)

    # 'empty' padding for `mol_feats`. Generate(s) feature matrix of same size for all mols
    # NOTE: len(mol_feats[0]) is the number of feats
    padded_mol_feats = np.zeros((out_size, len(mol_feats[0])), dtype=float)
    padded_mol_feats[:n_atoms] = np.array(mol_feats, dtype=float)
    return padded_mol_feats
def construct_pos_matrix(mol: rdchem.Mol, out_size: Optional[int] = -1) -> np.ndarray:
    """Construct relative positions from each atom within the molecule.

    Entry ``[i, j]`` holds ``coords[i] - coords[j]``. When `mol` has no
    conformer, hydrogens are added, a conformer is embedded with ETKDG and
    the hydrogens are removed again before the coordinates are read.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    out_size: int, optional, default=-1
        The size of the returned array. If this option is negative (or
        `None`), it does not take any effect. Otherwise, it must be larger
        than or equal to the number of atoms in the input molecule. If so,
        the end of the array is padded with zeros.

    Returns:
    --------
    pos_matrix: np.ndarray, shape=(n,n,3)
        Relative position (XYZ) coordinates from one atom the others in
        the mol.

    Raises:
    -------
    ValueError
        If `out_size` is non-negative but smaller than the number of atoms.

    Examples:
    ---------
    ```python
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem
    >>> smiles = ('N[C@@]([H])([C@]([H])(O2)C)C(=O)N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])([C@]([H])'
    ...           '(O)C)C(=O)N[C@@]([H])(Cc1ccc(O)cc1)C(=O)2')
    >>> mol = Chem.MolFromSmiles(smiles)
    >>> mol = Chem.AddHs(mol, addCoords=True)
    >>> AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    >>> mol = Chem.RemoveHs(mol)
    >>> pos_matrix = construct_pos_matrix(mol, out_size=-1)
    >>> pos_matrix.shape
    (34,34,3)
    >>> pos_matrix = construct_pos_matrix(mol, out_size=49)
    >>> pos_matrix.shape
    (49,49,3)
    ```
    """
    if out_size is None:
        out_size = -1

    # Obtain initial distance geometry between atoms, if unavailable
    if mol.GetNumConformers() == 0:
        mol = rdmolops.AddHs(mol, addCoords=True)
        rdDistGeom.EmbedMolecule(mol, rdDistGeom.ETKDG())
        mol = rdmolops.RemoveHs(mol)
    coords = np.asarray(mol.GetConformer().GetPositions(), dtype=float)  # shape=(N,3)
    N = mol.GetNumAtoms()

    # Determine appropriate output size to generate feature matrix of same size for all mols.
    if out_size < 0:
        size = N
    elif out_size >= N:
        size = out_size
    else:
        raise ValueError(
            '`out_size` (N={}) is smaller than number of atoms in mol (N={})'.
            format(out_size, N))

    # The `np.float` alias no longer exists in current NumPy; builtin `float`
    # selects the same float64 dtype.
    pos_matrix = np.zeros(shape=(size, size, 3), dtype=float)
    # Broadcasting yields every pairwise difference at once:
    # [i, j] = position of atom i (center) minus position of atom j (neighbor).
    pos_matrix[:N, :N] = coords[:N, None, :] - coords[None, :N, :]
    return pos_matrix
merge_cols=['cryst_lig_file'], order=pair_idx)
# NOTE(review): the line above is the tail of a call that starts before this
# excerpt; it is left untouched.

# Extract filepaths to PDB and Mol files
# The table to read from is looked up by listing every table name stored in
# the SQLite database at `db_path`.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# NOTE(review): this unpacking only works when the query returns exactly five
# rows; the fifth row is the one used below.
_, _, _, _, out_table_generate = cursor.fetchall()
# Each fetched row is a sequence; its first element is the table name.
out_table_generate = out_table_generate[0]
conn.close()
# Fetch the 'bind_lig_files' and 'mol_files' columns of that table
# (presumably restricted to run_idx < 100000 -- confirm against db_master.retrieve).
table_data = db_master.retrieve(out_table_generate, ['bind_lig_files', 'mol_files'], {'run_idx': '{}<100000'})
# Encode every path as ASCII, dropping characters that cannot be encoded.
all_pdb_files = [table_data[0][i].encode('ascii','ignore') for i in range(len(table_data[0]))]
# NOTE(review): iterates over len(table_data[0]) while indexing table_data[1];
# this assumes both columns have the same length.
all_mol_files = [table_data[1][i].encode('ascii','ignore') for i in range(len(table_data[0]))]
# Load each mol file with RDKit and record its atom count.
all_mols = [Chem.MolFromMolFile(all_mol_files[i]) for i in range(len(all_mol_files))]
all_num_atoms = [Mol.GetNumAtoms(all_mols[i]) for i in range(len(all_mols))]
# Set up the decoy search with the collected ligands and its limits.
GetDecoysInit(all_pdb_files, all_mol_files, all_mols, all_num_atoms, max_atom_dif=2, max_substruct=4, max_num_decoys=10)
# Run 'get_decoys' over (pdb file, mol file, atom count) triples through
# afdb.run_multithread, with outputs named 'bind_lig_files' and 'decoy_ligs'.
afdb.run_multithread(func='get_decoys', arg_types=[str, str, int], arg_lists=[all_pdb_files, all_mol_files, all_num_atoms], out_types=[str, str], out_names=['bind_lig_files', 'decoy_ligs'], num_threads=100, commit_sec=1)
# Report the time elapsed since `start` (set before this excerpt).
print '\nGot decoys for each ligand in:', str(time.time()-start), 'seconds\n'
"""Generate conformers for all the decoy ligands---------------------------------"""
def get_smallest_root_match(self, mol: Mol) -> Mol: search_space: Set[Mol] = set(self.molecules.values()) all_idxs = set(range(0, mol.GetNumAtoms())) included_idxs = {0} bonds = {} def register_bond(from_idx: int, to_idx: int): entry = bonds.get(from_idx) if entry is None: entry = [] bonds[from_idx] = entry entry.append(to_idx) for bond in mol.GetBonds(): begin = bond.GetBeginAtomIdx() end = bond.GetEndAtomIdx() register_bond(begin, end) register_bond(end, begin) while len(included_idxs) < mol.GetNumAtoms(): frontier_permutations = reduce( lambda perms, from_idx: perms | set( map( lambda to_idx: frozenset([*included_idxs, to_idx]), filter(lambda idx: idx not in included_idxs, bonds[from_idx]) ) ), included_idxs, set() ) new_search_space = set() for perm in frontier_permutations: e_mol = Chem.EditableMol(mol) perm_idxs = list(all_idxs - perm) perm_idxs.sort(reverse=True) for idx in perm_idxs: e_mol.RemoveAtom(idx) display('mul') display_numbered(e_mol.GetMol()) new_search_space |= set(self.find_superstructures(e_mol.GetMol(), search_space)) included_idxs |= perm if len(new_search_space) == 0: return self.get_smallest_mol(list(search_space)) search_space = new_search_space if len(search_space) < 100: display("from mul") for s in search_space: display_numbered(s) if len(frontier_permutations) > 1: e_mol = Chem.EditableMol(mol) perm_idxs = list(all_idxs - included_idxs) perm_idxs.sort(reverse=True) for idx in perm_idxs: e_mol.RemoveAtom(idx) display('single') display_numbered(e_mol.GetMol()) new_search_space = set(self.find_superstructures(e_mol.GetMol(), search_space)) if len(new_search_space) == 0: return self.get_smallest_mol(list(search_space)) search_space = new_search_space if len(search_space) < 100: display("from mul") for s in search_space: display_numbered(s)