def write_vina_pdbqt(mol, directory, flexible=True, name_id=None):
    """Write a single molecule as a PDBQT file inside `directory`.

    The file name is built from the optional `name_id` and the molecule
    title, with every run of non alpha-numeric characters replaced by an
    underscore, e.g. ``0_ZINC123456.pdbqt`` or simply ``ZINC123456.pdbqt``
    when no `name_id` is given.

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Molecule to be written.
    directory : str
        Directory in which the file is created.
    flexible : bool (default=True)
        For proteins use `flexible=False` to avoid encoding torsions.
    name_id : object or None (default=None)
        Optional ID put in front of the name to avoid conflicts.

    Returns
    -------
    mol_file : str
        Path of the written PDBQT file.
    """
    prefix = '' if name_id is None else str(name_id)
    safe_title = re.sub('[^A-Za-z0-9]+', '_', mol.title)
    # empty parts are dropped so that no leading underscore is produced
    name_parts = [part for part in (prefix, safe_title) if part]
    mol_file = os.path.join(directory, '_'.join(name_parts) + '.pdbqt')

    if not is_openbabel_molecule(mol):
        kwargs = {'flexible': flexible}
    elif flexible:
        # auto bonding (b), preserve atom indices (p) and Hs (h)
        kwargs = {'opt': {'b': None, 'p': None, 'h': None}}
    else:
        # for proteins write rigid mol (r) and combine all frags in one (c)
        kwargs = {'opt': {'r': None, 'c': None, 'h': None}}

    mol.write('pdbqt', mol_file, overwrite=True, **kwargs)
    return mol_file
def shuffle_mol(mol):
    """Return a clone of `mol` whose atoms are renumbered in random order."""
    shuffled = mol.clone
    permutation = list(range(len(mol.atoms)))
    shuffle(permutation)

    if not is_openbabel_molecule(mol):
        shuffled.Mol = oddt.toolkits.rdk.Chem.RenumberAtoms(shuffled.Mol,
                                                            permutation)
        return shuffled

    # the Open Babel call is fed 1-based indices
    shuffled.OBMol.RenumberAtoms([atom_idx + 1 for atom_idx in permutation])
    return shuffled
def write_vina_pdbqt(mol, directory, flexible=True, name_id=None):
    """Write single PDBQT molecule to a given directory.

    For proteins use `flexible=False` to avoid encoding torsions.
    Additionally a name ID can be prepended to the name to avoid conflicts.

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Molecule to be written.
    directory : str
        Directory in which the file is created.
    flexible : bool (default=True)
        Whether torsions should be encoded (ligands) or not (proteins).
    name_id : object or None (default=None)
        Optional ID put in front of the file name.

    Returns
    -------
    mol_file : str
        Path of the written PDBQT file.
    """
    if name_id is None:
        name_id = ''

    # We expect name such as 0_ZINC123456.pdbqt or simply ZINC123456.pdbqt if
    # no name_id is specified. All non alpha-numeric signs are replaced with
    # underscore.
    mol_file = '_'.join(
        filter(None, [str(name_id),
                      re.sub('[^A-Za-z0-9]+', '_', mol.title)])) + '.pdbqt'
    # prepend path to filename
    mol_file = os.path.join(directory, mol_file)

    is_ob_mol = is_openbabel_molecule(mol)
    if not is_ob_mol:
        kwargs = {'flexible': flexible}
    elif flexible:
        # auto bonding (b), preserve atom indices (p) and Hs (h)
        kwargs = {'opt': {'b': None, 'p': None, 'h': None}}
    else:
        # for proteins write rigid mol (r) and combine all frags in one (c)
        kwargs = {'opt': {'r': None, 'c': None, 'h': None}}

    # Compare the Open Babel version numerically; comparing the raw strings
    # is lexicographic and would e.g. place '2.10.0' before '2.4.0'.
    # The version is only looked up when it is actually needed.
    legacy_ob = False
    if is_ob_mol and not flexible:
        version_parts = re.findall(r'\d+', oddt.toolkits.ob.__version__)[:2]
        legacy_ob = tuple(int(part) for part in version_parts) < (2, 4)

    if legacy_ob:
        # HACK: fix OB 2.3.2 PDBQT bugs
        with open(mol_file, 'w') as f:
            for line in mol.write('pdbqt', overwrite=True,
                                  **kwargs).split('\n'):
                # remove OB 2.3 ROOT/ENDROOT tags
                if line in ['ROOT', 'ENDROOT']:
                    continue
                elif line[:7] == 'TORSDOF':
                    f.write('TER\n')
                else:
                    f.write(line + '\n')
    else:
        mol.write('pdbqt', mol_file, overwrite=True, **kwargs)
    return mol_file
def shuffle_mol(mol):
    """Randomly reorder molecule atoms and return a shuffled copy of input."""
    n_atoms = len(mol.atoms)
    shuffled_mol = mol.clone
    random_order = list(range(n_atoms))
    shuffle(random_order)

    if is_openbabel_molecule(mol):
        # the Open Babel call is fed 1-based indices
        one_based_order = [position + 1 for position in random_order]
        shuffled_mol.OBMol.RenumberAtoms(one_based_order)
    else:
        renumbered = oddt.toolkits.rdk.Chem.RenumberAtoms(shuffled_mol.Mol,
                                                          random_order)
        shuffled_mol.Mol = renumbered
    return shuffled_mol
def write_vina_pdbqt(mol, directory, flexible=True, name_id=None):
    """Write single PDBQT molecule to a given directory.

    For proteins use `flexible=False` to avoid encoding torsions.
    Additionally a name ID can be prepended to the name to avoid conflicts.

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Molecule to be written.
    directory : str
        Directory in which the file is created.
    flexible : bool (default=True)
        Whether torsions should be encoded (ligands) or not (proteins).
    name_id : object or None (default=None)
        Optional ID put in front of the file name.

    Returns
    -------
    mol_file : str
        Path of the written PDBQT file.
    """
    if name_id is None:
        name_id = ''

    # We expect name such as 0_ZINC123456.pdbqt or simply ZINC123456.pdbqt if
    # no name_id is specified. All non alpha-numeric signs are replaced with
    # underscore.
    mol_file = '_'.join(
        filter(None, [str(name_id),
                      re.sub('[^A-Za-z0-9]+', '_', mol.title)])) + '.pdbqt'
    # prepend path to filename
    mol_file = os.path.join(directory, mol_file)

    is_ob_mol = is_openbabel_molecule(mol)
    if not is_ob_mol:
        kwargs = {'flexible': flexible}
    elif flexible:
        # auto bonding (b), preserve atom names (n) indices (p) and Hs (h)
        kwargs = {'opt': {'b': None, 'p': None, 'h': None, 'n': None}}
    else:
        # for proteins write rigid mol (r) and combine all frags in one (c)
        kwargs = {'opt': {'r': None, 'c': None, 'h': None}}

    # Compare the Open Babel version numerically; comparing the raw strings
    # is lexicographic and would e.g. place '2.10.0' before '2.4.0'.
    # The version is only looked up when it is actually needed.
    legacy_ob = False
    if is_ob_mol and not flexible:
        version_parts = re.findall(r'\d+', oddt.toolkits.ob.__version__)[:2]
        legacy_ob = tuple(int(part) for part in version_parts) < (2, 4)

    if legacy_ob:
        # HACK: fix OB 2.3.2 PDBQT bugs
        with open(mol_file, 'w') as f:
            for line in mol.write('pdbqt', overwrite=True,
                                  **kwargs).split('\n'):
                # remove OB 2.3 ROOT/ENDROOT tags
                if line in ['ROOT', 'ENDROOT']:
                    continue
                elif line[:7] == 'TORSDOF':
                    f.write('TER\n')
                else:
                    f.write(line + '\n')
    else:
        mol.write('pdbqt', mol_file, overwrite=True, **kwargs)
    return mol_file
def get_atom_environments(mol, root_atom_idx, depth):
    """Collect circular atom environments around a root atom.

    Heavy-atom indices are gathered layer by layer with a breadth-first
    search; atoms found at each bond distance are kept in separate lists.
    Hydrogens are skipped.

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Molecule object containing environments

    root_atom_idx : int
        0-based index of root atom for all environments

    depth : int
        Maximum depth of environments to return

    Returns
    -------
    envs: list (size = depth + 1)
        List of atoms at each respective environment depth
    """
    if is_openbabel_molecule(mol):
        layers = OrderedDict((level, []) for level in range(depth + 1))
        previous_depth = 0
        # the iterator is started from a 1-based atom index and reports
        # depths starting at 1 for the root atom
        bfs = oddt.toolkits.ob.ob.OBMolAtomBFSIter(mol.OBMol,
                                                   root_atom_idx + 1)
        for ob_atom, bfs_depth in bfs:
            # FIX for disconnected fragments in OB: stop as soon as the
            # reported depth exceeds the limit, decreases, or a second
            # depth-1 atom shows up
            beyond_limit = bfs_depth > depth + 1
            depth_dropped = previous_depth > bfs_depth
            second_root = previous_depth == 1 and bfs_depth == 1
            if beyond_limit or depth_dropped or second_root:
                break
            previous_depth = bfs_depth
            if ob_atom.GetAtomicNum() != 1:
                layers[bfs_depth - 1].append(ob_atom.GetIdx() - 1)
        return list(layers.values())

    layers = [[root_atom_idx]]
    seen = [root_atom_idx]
    for _ in range(depth):
        frontier = []
        for parent_idx in layers[-1]:
            parent = mol.Mol.GetAtomWithIdx(parent_idx)
            for neighbor in parent.GetNeighbors():
                if neighbor.GetAtomicNum() == 1:
                    continue
                neighbor_idx = neighbor.GetIdx()
                if neighbor_idx in seen or neighbor_idx in frontier:
                    continue
                frontier.append(neighbor_idx)
                seen.append(neighbor_idx)
        layers.append(frontier)
    return layers
def get_molecular_shingles(mol, depth=2, atom_idxs=None):
    """Get molecular shingles of given depth.

    They are equivalent to ECFP environments, but use SMILES as
    a representation for each environment.

    Parameters
    ----------
    mol: oddt.toolkit.Molecule instance
        Query molecule object

    depth: int (default=2)
        Bond depth of environment that is used for shingles generation

    atom_idxs: iterable of ints or None (default=None)
        Which atoms to use for shingles generation. `None` means all atoms;
        an empty iterable yields no shingles.

    Returns
    -------
    shingles: list
        List of molecular shingles (canonical SMILES)

    References
    ----------
    https://doi.org/10.1186/s13321-018-0321-8
    """
    shingles = []
    # Test against None explicitly: `atom_idxs or ...` raises for
    # multi-element numpy arrays and would silently replace an explicitly
    # empty selection with all atoms.
    if atom_idxs is None:
        atom_idxs = range(len(mol.atoms))

    for atom_idx in atom_idxs:
        env = list(chain.from_iterable(
            get_atom_environments(mol, root_atom_idx=atom_idx, depth=depth)))
        if is_openbabel_molecule(mol):
            # this is one-based
            atom_idx_string = ' '.join(str(i + 1) for i in env)
            # OB fragment smiles contains names and whitespaces
            fragment_smiles = mol.write(
                'smi',
                opt={'c': None, 'F': atom_idx_string}).strip().split()[0]
        else:
            # The branch is selected by molecule type, so address the RDKit
            # backend directly instead of whichever toolkit is active.
            fragment_smiles = oddt.toolkits.rdk.Chem.MolFragmentToSmiles(
                mol.Mol, atomsToUse=env, isomericSmiles=True)
        shingles.append(fragment_smiles)
    return shingles
def _ECFP_atom_repr(mol, idx, use_pharm_features=False): """Simple description of atoms used in ECFP/FCFP. Bonds are not described accounted for. Hydrogens are explicitly forbidden, they raise Exception. Reference: Rogers D, Hahn M. Extended-connectivity fingerprints. J Chem Inf Model. 2010;50: 742-754. http://dx.doi.org/10.1021/ci100050t Parameters ---------- mol : oddt.toolkit.Molecule object Input molecule for the FP calculations idx : int Root atom index (0-based). use_pharm_features : bool (default=False) Switch to use pharmacophoric features as atom representation instead of explicit atomic numbers etc. Returns ------- atom_repr : tuple (size=6 or 7) Atom type desctiption or pharmacophoric features of atom. """ if use_pharm_features: atom_dict = mol.atom_dict[idx] if atom_dict['atomicnum'] == 1: raise Exception('ECFP should not hash Hydrogens') return (int(atom_dict['isdonor']), int(atom_dict['isacceptor']), int(atom_dict['ishydrophobe']), int(atom_dict['isplus']), int(atom_dict['isminus']), int(atom_dict['isaromatic'])) else: max_ring_size = 10 # dont catch macromolecular rings if is_openbabel_molecule(mol): atom = mol.OBMol.GetAtom(idx + 1) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') # OB 3.0 compatibility if hasattr(atom, 'GetHvyValence'): heavy_degree = atom.GetHvyValence() else: heavy_degree = atom.GetHvyDegree() if hasattr(atom, 'ImplicitHydrogenCount'): hs_count = atom.ImplicitHydrogenCount() + atom.ExplicitHydrogenCount() else: hs_count = atom.GetTotalDegree() - heavy_degree return (atom.GetAtomicNum(), atom.GetIsotope(), heavy_degree, hs_count, atom.GetFormalCharge(), int(0 < atom.MemberOfRingSize() <= max_ring_size), int(atom.IsAromatic()),) else: atom = mol.Mol.GetAtomWithIdx(idx) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') n_hs = atom.GetTotalNumHs(includeNeighbors=True) # get ring info for atom and check rign size isring = False if atom.IsInRing(): # FIXME: this is not 
efficient, fixed by rdkit/rdkit#1859 isring = any(atom.IsInRingSize(size) for size in range(3, max_ring_size + 1)) return (atom.GetAtomicNum(), atom.GetIsotope(), atom.GetTotalDegree() - n_hs, n_hs, atom.GetFormalCharge(), int(isring), int(atom.GetIsAromatic()),)
def _ECFP_atom_hash(mol, idx, depth=2, use_pharm_features=False,
                    atom_repr_dict=None):
    """Hash the environments of a single atom up to a given bond depth.

    Hydrogens are ignored during neighbor lookup.

    Reference:
    Rogers D, Hahn M. Extended-connectivity fingerprints. J Chem Inf Model.
    2010;50: 742-754. http://dx.doi.org/10.1021/ci100050t

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Input molecule for the FP calculations

    idx : int
        Root atom index (0-based).

    depth : int (default = 2)
        The depth of the fingerprint, i.e. the number of bonds in Morgan
        algorithm. Note: For ECFP2: depth = 1, ECFP4: depth = 2, etc.

    use_pharm_features : bool (default=False)
        Switch to use pharmacophoric features as atom representation instead
        of explicit atomic numbers etc.

    atom_repr_dict : dict or None (default=None)
        Optional precomputed atom representations keyed by atom index. When
        None, representations are computed with `_ECFP_atom_repr`.

    Returns
    -------
    environment_hashes : list of ints
        Hashed environments for certain atom

    Raises
    ------
    ValueError
        If `atom_repr_dict` is neither None nor a dictionary.
    """
    if is_openbabel_molecule(mol):
        layers = OrderedDict((level, []) for level in range(depth + 1))
        previous_depth = 0
        bfs = oddt.toolkits.ob.ob.OBMolAtomBFSIter(mol.OBMol, idx + 1)
        for ob_atom, bfs_depth in bfs:
            # FIX for disconnected fragments in OB
            beyond_limit = bfs_depth > depth + 1
            depth_dropped = previous_depth > bfs_depth
            second_root = previous_depth == 1 and bfs_depth == 1
            if beyond_limit or depth_dropped or second_root:
                break
            previous_depth = bfs_depth
            if ob_atom.GetAtomicNum() != 1:
                layers[bfs_depth - 1].append(ob_atom.GetIdx() - 1)
        envs = list(layers.values())
    else:
        envs = [[idx]]
        seen = [idx]
        for _ in range(depth):
            frontier = []
            for parent_idx in envs[-1]:
                parent = mol.Mol.GetAtomWithIdx(parent_idx)
                for neighbor in parent.GetNeighbors():
                    if neighbor.GetAtomicNum() == 1:
                        continue
                    neighbor_idx = neighbor.GetIdx()
                    if neighbor_idx in seen or neighbor_idx in frontier:
                        continue
                    frontier.append(neighbor_idx)
                    seen.append(neighbor_idx)
            envs.append(frontier)

    # cumulative environments: root only, root + 1 bond, ...
    # there are depth + 1 elements, so +2
    atom_env = [list(chain.from_iterable(envs[:size]))
                for size in range(1, depth + 2)]

    if atom_repr_dict is not None and not isinstance(atom_repr_dict, dict):
        raise ValueError('`atom_repr_dict` must be a dictionary, as atom idxs '
                         'do not need to be continuous (eg. missing Hs).')

    # Get atom representation only once, pull indices from largest env
    largest_env = atom_env[-1]
    if atom_repr_dict is None:
        atom_repr = [
            _ECFP_atom_repr(mol, aidx, use_pharm_features=use_pharm_features)
            for aidx in largest_env
        ]
    else:
        atom_repr = [atom_repr_dict[aidx] for aidx in largest_env]

    # Get atom invariants; every smaller environment is a prefix of the
    # largest one, so slicing by its length selects its atoms
    return [hash32(tuple(sorted(atom_repr[:len(layer)])))
            for layer in atom_env]
def _ECFP_atom_repr(mol, idx, use_pharm_features=False): """Simple description of atoms used in ECFP/FCFP. Bonds are not described accounted for. Hydrogens are explicitly forbidden, they raise Exception. Reference: Rogers D, Hahn M. Extended-connectivity fingerprints. J Chem Inf Model. 2010;50: 742-754. http://dx.doi.org/10.1021/ci100050t Parameters ---------- mol : oddt.toolkit.Molecule object Input molecule for the FP calculations idx : int Root atom index (0-based). use_pharm_features : bool (default=False) Switch to use pharmacophoric features as atom representation instead of explicit atomic numbers etc. Returns ------- atom_repr : tuple (size=6 or 7) Atom type desctiption or pharmacophoric features of atom. """ if use_pharm_features: atom_dict = mol.atom_dict[idx] if atom_dict['atomicnum'] == 1: raise Exception('ECFP should not hash Hydrogens') return (int(atom_dict['isdonor']), int(atom_dict['isacceptor']), int(atom_dict['ishydrophobe']), int(atom_dict['isplus']), int(atom_dict['isminus']), int(atom_dict['isaromatic'])) else: max_ring_size = 10 # dont catch macromolecular rings if is_openbabel_molecule(mol): atom = mol.OBMol.GetAtom(idx + 1) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') return (atom.GetAtomicNum(), atom.GetIsotope(), atom.GetHvyValence(), atom.ImplicitHydrogenCount() + atom.ExplicitHydrogenCount(), atom.GetFormalCharge(), int(0 < atom.MemberOfRingSize() <= max_ring_size), int(atom.IsAromatic()),) else: atom = mol.Mol.GetAtomWithIdx(idx) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') n_hs = atom.GetTotalNumHs(includeNeighbors=True) # get ring info for atom and check rign size isring = False if atom.IsInRing(): # FIXME: this is not efficient, fixed by rdkit/rdkit#1859 isring = any(atom.IsInRingSize(size) for size in range(3, max_ring_size + 1)) return (atom.GetAtomicNum(), atom.GetIsotope(), atom.GetTotalDegree() - n_hs, n_hs, atom.GetFormalCharge(), int(isring), 
int(atom.GetIsAromatic()),)
def _ECFP_atom_hash(mol, idx, depth=2, use_pharm_features=False,
                    atom_repr_dict=None):
    """Generate hashed environments for a single atom (bond-wise depth).

    Hydrogens are ignored during neighbor lookup.

    Reference:
    Rogers D, Hahn M. Extended-connectivity fingerprints. J Chem Inf Model.
    2010;50: 742-754. http://dx.doi.org/10.1021/ci100050t

    Parameters
    ----------
    mol : oddt.toolkit.Molecule object
        Input molecule for the FP calculations

    idx : int
        Root atom index (0-based).

    depth : int (default = 2)
        The depth of the fingerprint, i.e. the number of bonds in Morgan
        algorithm. Note: For ECFP2: depth = 1, ECFP4: depth = 2, etc.

    use_pharm_features : bool (default=False)
        Switch to use pharmacophoric features as atom representation instead
        of explicit atomic numbers etc.

    atom_repr_dict : dict or None (default=None)
        Optional precomputed atom representations keyed by atom index. When
        None, representations are computed with `_ECFP_atom_repr`.

    Returns
    -------
    environment_hashes : list of ints
        Hashed environments for certain atom

    Raises
    ------
    ValueError
        If `atom_repr_dict` is neither None nor a dictionary.
    """
    if not is_openbabel_molecule(mol):
        envs = [[idx]]
        visited_atoms = [idx]
        for _ in range(depth):
            next_layer = []
            for source_idx in envs[-1]:
                source_atom = mol.Mol.GetAtomWithIdx(source_idx)
                for neighbor in source_atom.GetNeighbors():
                    if neighbor.GetAtomicNum() == 1:
                        continue
                    candidate = neighbor.GetIdx()
                    is_new = (candidate not in visited_atoms and
                              candidate not in next_layer)
                    if is_new:
                        next_layer.append(candidate)
                        visited_atoms.append(candidate)
            envs.append(next_layer)
    else:
        per_depth = OrderedDict((level, []) for level in range(depth + 1))
        depth_before = 0
        walker = oddt.toolkits.ob.ob.OBMolAtomBFSIter(mol.OBMol, idx + 1)
        for ob_atom, depth_now in walker:
            # FIX for disconnected fragments in OB
            stop = (depth_now > depth + 1 or
                    depth_before > depth_now or
                    (depth_before == 1 and depth_now == 1))
            if stop:
                break
            depth_before = depth_now
            if ob_atom.GetAtomicNum() == 1:
                continue
            per_depth[depth_now - 1].append(ob_atom.GetIdx() - 1)
        envs = list(per_depth.values())

    # cumulative environments; there are depth + 1 elements, so +2
    atom_env = [list(chain.from_iterable(envs[:upto]))
                for upto in range(1, depth + 2)]

    # Get atom representation only once, pull indices from largest env
    if atom_repr_dict is None:
        atom_repr = []
        for aidx in atom_env[-1]:
            atom_repr.append(_ECFP_atom_repr(
                mol, aidx, use_pharm_features=use_pharm_features))
    elif isinstance(atom_repr_dict, dict):
        atom_repr = [atom_repr_dict[aidx] for aidx in atom_env[-1]]
    else:
        raise ValueError('`atom_repr_dict` must be a dictionary, as atom idxs '
                         'do not need to be continuous (eg. missing Hs).')

    # Get atom invariants; each environment is a prefix of the largest one
    out_hash = [hash32(tuple(sorted(atom_repr[:len(layer)])))
                for layer in atom_env]
    return out_hash
def _ECFP_atom_repr(mol, idx, use_pharm_features=False): """Simple description of atoms used in ECFP/FCFP. Bonds are not described accounted for. Hydrogens are explicitly forbidden, they raise Exception. Reference: Rogers D, Hahn M. Extended-connectivity fingerprints. J Chem Inf Model. 2010;50: 742-754. http://dx.doi.org/10.1021/ci100050t Parameters ---------- mol : oddt.toolkit.Molecule object Input molecule for the FP calculations idx : int Root atom index (0-based). use_pharm_features : bool (default=False) Switch to use pharmacophoric features as atom representation instead of explicit atomic numbers etc. Returns ------- atom_repr : tuple (size=6 or 7) Atom type desctiption or pharmacophoric features of atom. """ if use_pharm_features: atom_dict = mol.atom_dict[idx] if atom_dict['atomicnum'] == 1: raise Exception('ECFP should not hash Hydrogens') return (int(atom_dict['isdonor']), int(atom_dict['isacceptor']), int(atom_dict['ishydrophobe']), int(atom_dict['isplus']), int(atom_dict['isminus']), int(atom_dict['isaromatic'])) else: if is_openbabel_molecule(mol): atom = mol.OBMol.GetAtom(idx + 1) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') return ( atom.GetAtomicNum(), atom.GetIsotope(), atom.GetHvyValence(), atom.ImplicitHydrogenCount() + atom.ExplicitHydrogenCount(), atom.GetFormalCharge(), int(atom.IsInRing()), int(atom.IsAromatic()), ) else: atom = mol.Mol.GetAtomWithIdx(idx) if atom.GetAtomicNum() == 1: raise Exception('ECFP should not hash Hydrogens') n_hs = atom.GetTotalNumHs(includeNeighbors=True) return ( atom.GetAtomicNum(), atom.GetIsotope(), atom.GetTotalDegree() - n_hs, n_hs, atom.GetFormalCharge(), int(atom.IsInRing()), int(atom.GetIsAromatic()), )
def rmsd(ref, mol, ignore_h=True, method=None, normalize=False):
    """Compute root mean square deviation (RMSD) between two molecules.

    Hydrogens may be included or excluded. Unless a matching `method` is
    chosen, atoms are compared in their stored order and no symmetry checks
    are performed.

    Parameters
    ----------
    ref : oddt.toolkit.Molecule object
        Reference molecule for the RMSD calculation

    mol : oddt.toolkit.Molecule object
        Query molecule for RMSD calculation

    ignore_h : bool (default=True)
        Flag indicating to ignore Hydrogen atoms while performing RMSD
        calculation. This toggle works only with 'hungarian' method and
        without sorting (method=None).

    method : str (default=None)
        The method to be used for atom assignment between ref and mol.
        None means that direct matching is applied, which is the default
        behavior.
        Available methods:
            - canonize - match heavy atoms using canonical ordering (it
            forces ignoring H's)
            - hungarian - minimize RMSD using Hungarian algorithm
            - min_symmetry - makes multiple molecule-molecule matches and
            finds minimal RMSD (the slowest). Hydrogens are ignored.

    normalize : bool (default=False)
        Normalize RMSD by square root of rot. bonds. Not applied by the
        'min_symmetry' method, which returns its minimum directly.

    Returns
    -------
    rmsd : float
        RMSD between two molecules

    Raises
    ------
    ValueError
        If the compared atom sets differ in size, or if no match between
        the molecules is found with 'min_symmetry'.
    """
    def _rms(query_xyz, target_xyz):
        # root of the mean squared per-atom displacement
        return np.sqrt(((query_xyz - target_xyz)**2).sum(axis=-1).mean())

    if method == 'canonize':
        ref_atoms = ref.coords[ref.canonic_order]
        mol_atoms = mol.coords[mol.canonic_order]
    elif method == 'hungarian':
        mol_map, ref_map = [], []
        for a_type in np.unique(mol.atom_dict['atomtype']):
            if a_type == 'H' and ignore_h:
                continue
            mol_idx = np.argwhere(
                mol.atom_dict['atomtype'] == a_type).flatten()
            ref_idx = np.argwhere(
                ref.atom_dict['atomtype'] == a_type).flatten()
            if len(mol_idx) != len(ref_idx):
                raise ValueError('Unequal number of atoms type: %s' % a_type)
            if len(mol_idx) == 1:
                # a single atom of this type needs no assignment
                mol_map.append(mol_idx)
                ref_map.append(ref_idx)
            else:
                cost = distance(mol.atom_dict['coords'][mol_idx],
                                ref.atom_dict['coords'][ref_idx])
                cost = (cost - cost.min(axis=0) -
                        cost.min(axis=1).reshape(-1, 1))
                picked_mol, picked_ref = linear_sum_assignment(cost)
                mol_map.append(mol_idx[picked_mol])
                ref_map.append(ref_idx[picked_ref])
        mol_atoms = mol.atom_dict['coords'][np.hstack(mol_map)]
        ref_atoms = ref.atom_dict['coords'][np.hstack(ref_map)]
    elif method == 'min_symmetry':
        ref_atoms = ref.atom_dict[ref.atom_dict['atomicnum'] != 1]['coords']
        mol_atoms = mol.atom_dict[mol.atom_dict['atomicnum'] != 1]['coords']
        # safety switch to check if number of heavy atoms match; on a
        # mismatch the common size check below raises
        if ref_atoms.shape == mol_atoms.shape:
            # match mol to ref, generate all matches to find best RMSD
            matches = oddt.toolkit.Smarts(ref).findall(mol, unique=False)
            if not matches:
                raise ValueError('Could not find any match between molecules.')
            # calculate RMSD between all matches and retain the smallest
            candidates = []
            for match in matches:
                match = np.array(match, dtype=int)
                if is_openbabel_molecule(mol):
                    match -= 1  # OB has 1-based indices
                matched = mol.atom_dict[match]
                mol_atoms = matched[matched['atomicnum'] != 1]['coords']
                # following should not happen, although safety check is left
                if mol_atoms.shape != ref_atoms.shape:
                    raise Exception(
                        'Molecular match got wrong number of atoms.')
                candidates.append(_rms(mol_atoms, ref_atoms))
            return min(candidates)
    elif ignore_h:
        mol_atoms = mol.coords[mol.atom_dict['atomicnum'] != 1]
        ref_atoms = ref.coords[ref.atom_dict['atomicnum'] != 1]
    else:
        mol_atoms = mol.coords
        ref_atoms = ref.coords

    if mol_atoms.shape != ref_atoms.shape:
        raise ValueError('Unequal number of atoms in molecules (%i and %i)'
                         % (len(mol_atoms), len(ref_atoms)))

    result = _rms(mol_atoms, ref_atoms)
    if normalize:
        result = result / np.sqrt(mol.num_rotors)
    return result
def rmsd(ref, mol, ignore_h=True, method=None, normalize=False):
    """Root mean square deviation (RMSD) between two molecules.

    Hydrogens may be included or excluded. Unless a matching `method` is
    chosen, atoms are compared in their stored order and no symmetry checks
    are performed.

    Parameters
    ----------
    ref : oddt.toolkit.Molecule object
        Reference molecule for the RMSD calculation

    mol : oddt.toolkit.Molecule object
        Query molecule for RMSD calculation

    ignore_h : bool (default=True)
        Flag indicating to ignore Hydrogen atoms while performing RMSD
        calculation. This toggle works only with 'hungarian' method and
        without sorting (method=None).

    method : str (default=None)
        The method to be used for atom assignment between ref and mol.
        None means that direct matching is applied, which is the default
        behavior.
        Available methods:
            - canonize - match heavy atoms using canonical ordering (it
            forces ignoring H's)
            - hungarian - minimize RMSD using Hungarian algorithm
            - min_symmetry - makes multiple molecule-molecule matches and
            finds minimal RMSD (the slowest). Hydrogens are ignored.

    normalize : bool (default=False)
        Normalize RMSD by square root of rot. bonds. Not applied by the
        'min_symmetry' method, which returns its minimum directly.

    Returns
    -------
    rmsd : float
        RMSD between two molecules

    Raises
    ------
    ValueError
        If the compared atom sets differ in size, or if no match between
        the molecules is found with 'min_symmetry'.
    """
    if method == 'canonize':
        ref_atoms = ref.coords[ref.canonic_order]
        mol_atoms = mol.coords[mol.canonic_order]

    elif method == 'hungarian':
        mol_types = mol.atom_dict['atomtype']
        mol_map = []
        ref_map = []
        for a_type in np.unique(mol_types):
            if ignore_h and a_type == 'H':
                continue
            mol_idx = np.argwhere(mol_types == a_type).flatten()
            ref_idx = np.argwhere(
                ref.atom_dict['atomtype'] == a_type).flatten()
            if len(mol_idx) != len(ref_idx):
                raise ValueError('Unequal number of atoms type: %s' % a_type)
            if len(mol_idx) != 1:
                # solve the assignment problem within one atom type
                pair_dist = distance(mol.atom_dict['coords'][mol_idx],
                                     ref.atom_dict['coords'][ref_idx])
                col_min = pair_dist.min(axis=0)
                row_min = pair_dist.min(axis=1).reshape(-1, 1)
                pair_dist = pair_dist - col_min - row_min
                rows, cols = linear_sum_assignment(pair_dist)
                mol_idx = mol_idx[rows]
                ref_idx = ref_idx[cols]
            mol_map.append(mol_idx)
            ref_map.append(ref_idx)
        mol_atoms = mol.atom_dict['coords'][np.hstack(mol_map)]
        ref_atoms = ref.atom_dict['coords'][np.hstack(ref_map)]

    elif method == 'min_symmetry':
        best = None
        ref_atoms = ref.atom_dict[ref.atom_dict['atomicnum'] != 1]['coords']
        mol_atoms = mol.atom_dict[mol.atom_dict['atomicnum'] != 1]['coords']
        # safety switch to check if number of heavy atoms match; on a
        # mismatch the common size check below raises
        if ref_atoms.shape == mol_atoms.shape:
            # match mol to ref, generate all matches to find best RMSD
            matches = oddt.toolkit.Smarts(ref).findall(mol, unique=False)
            if not matches:
                raise ValueError('Could not find any match between molecules.')
            shift = 1 if is_openbabel_molecule(mol) else None
            # calculate RMSD between all matches and retain the smallest
            for match in matches:
                match = np.array(match, dtype=int)
                if shift is not None:
                    match -= shift  # OB has 1-based indices
                subset = mol.atom_dict[match]
                mol_atoms = subset[subset['atomicnum'] != 1]['coords']
                # following should not happen, although safety check is left
                if mol_atoms.shape != ref_atoms.shape:
                    raise Exception(
                        'Molecular match got wrong number of atoms.')
                current = np.sqrt(
                    ((mol_atoms - ref_atoms)**2).sum(axis=-1).mean())
                if best is None or current < best:
                    best = current
            return best

    elif ignore_h:
        mol_atoms = mol.coords[mol.atom_dict['atomicnum'] != 1]
        ref_atoms = ref.coords[ref.atom_dict['atomicnum'] != 1]

    else:
        mol_atoms, ref_atoms = mol.coords, ref.coords

    if mol_atoms.shape == ref_atoms.shape:
        deviation = np.sqrt(((mol_atoms - ref_atoms)**2).sum(axis=-1).mean())
        return deviation / np.sqrt(mol.num_rotors) if normalize else deviation

    # at this point raise an exception
    raise ValueError('Unequal number of atoms in molecules (%i and %i)'
                     % (len(mol_atoms), len(ref_atoms)))
def dock(self, ligands, protein=None):
    """Automated docking procedure.

    Parameters
    ----------
    ligands: iterable of oddt.toolkit.Molecule objects
        Ligands to dock

    protein: oddt.toolkit.Molecule object or None
        Protein object to be used. If None, then the default one is used,
        else the protein is new default.

    Returns
    -------
    ligands : array of oddt.toolkit.Molecule objects
        Array of ligands (scores are stored in mol.data method)

    Raises
    ------
    IOError
        If no receptor file is set.
    Exception
        If Autodock Vina fails for a ligand and `skip_bad_mols` is not set.
    """
    import re  # local import, only used to parse the Open Babel version

    if protein:
        self.set_protein(protein)
    if not self.protein_file:
        raise IOError("No receptor.")
    if is_molecule(ligands):
        ligands = [ligands]

    ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
    output_array = []
    # try/finally guarantees that the temporary ligand directory is removed
    # even when docking of one of the ligands raises.
    try:
        for n, ligand in enumerate(ligands):
            check_molecule(ligand, force_coords=True)
            ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
            ligand_outfile = ligand_file[:-6] + '_out.pdbqt'
            try:
                scores = parse_vina_docking_output(
                    subprocess.check_output(
                        [self.executable,
                         '--receptor', self.protein_file,
                         '--ligand', ligand_file,
                         '--out', ligand_outfile]
                        + self.params
                        + ['--cpu', str(self.n_cpu)],
                        stderr=subprocess.STDOUT))
            except subprocess.CalledProcessError as e:
                # 'replace' keeps non-ASCII program output from raising a
                # decoding error that would hide the actual failure
                sys.stderr.write(e.output.decode('ascii', 'replace'))
                if self.skip_bad_mols:
                    continue  # TODO: print some warning message
                raise Exception('Autodock Vina failed. Command: "%s"' %
                                ' '.join(e.cmd))

            # docked conformations may have wrong connectivity - use source
            # ligand
            # new_order is reset for every ligand so that a stale (or
            # undefined) order is never applied to the docked poses
            new_order = None
            if is_openbabel_molecule(ligand):
                # compare versions numerically; plain string comparison is
                # lexicographic (e.g. '2.10.0' < '2.4.0')
                version_parts = re.findall(
                    r'\d+', oddt.toolkits.ob.__version__)[:2]
                if tuple(int(part) for part in version_parts) >= (2, 4):
                    # find the order of PDBQT atoms assigned by OpenBabel
                    with open(ligand_file) as f:
                        write_order = [int(line[7:12].strip())
                                       for line in f
                                       if line[:4] == 'ATOM']
                    new_order = sorted(range(len(write_order)),
                                       key=write_order.__getitem__)
                    # OBMol has 1 based idx
                    new_order = [i + 1 for i in new_order]
                    assert len(new_order) == len(ligand.atoms)
                else:
                    # Openbabel 2.3.2 does not support preserving atom
                    # order. We read back the PDBQT ligand to get "correct"
                    # bonding.
                    ligand = next(oddt.toolkit.readfile('pdbqt', ligand_file))
                    if 'REMARK' in ligand.data:
                        del ligand.data['REMARK']

            docked_ligands = oddt.toolkit.readfile('pdbqt', ligand_outfile)
            for docked_ligand, score in zip(docked_ligands, scores):
                # Renumber atoms to match the input ligand
                if (new_order is not None and
                        is_openbabel_molecule(docked_ligand)):
                    docked_ligand.OBMol.RenumberAtoms(new_order)
                # HACK: copy docked coordinates onto source ligand
                # We assume that the order of atoms match between ligands
                clone = ligand.clone
                clone.clone_coords(docked_ligand)
                clone.data.update(score)

                # Calculate RMSD to the input pose. This is best-effort:
                # a failed RMSD must not discard the docked pose.
                try:
                    clone.data['vina_rmsd_input'] = rmsd(ligand, clone)
                    clone.data['vina_rmsd_input_min'] = rmsd(
                        ligand, clone, method='min_symmetry')
                except Exception:
                    pass
                output_array.append(clone)
    finally:
        rmtree(ligand_dir)
    return output_array
def dock(self, ligands, protein=None):
    """Automated docking procedure.

    Parameters
    ----------
    ligands: iterable of oddt.toolkit.Molecule objects
        Ligands to dock

    protein: oddt.toolkit.Molecule object or None
        Protein object to be used. If None, then the default one is used,
        else the protein is new default.

    Returns
    -------
    ligands : array of oddt.toolkit.Molecule objects
        Array of ligands (scores are stored in mol.data method)

    Raises
    ------
    IOError
        If no receptor file is set.
    Exception
        If Autodock Vina fails for a ligand and `skip_bad_mols` is not set.
    """
    import re  # local import, only used to parse the Open Babel version

    if protein:
        self.set_protein(protein)
    if not self.protein_file:
        raise IOError("No receptor.")
    if is_molecule(ligands):
        ligands = [ligands]

    ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
    output_array = []
    # try/finally guarantees that the temporary ligand directory is removed
    # even when docking of one of the ligands raises.
    try:
        for n, ligand in enumerate(ligands):
            check_molecule(ligand, force_coords=True)
            ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
            ligand_outfile = ligand_file[:-6] + '_out.pdbqt'
            try:
                scores = parse_vina_docking_output(
                    subprocess.check_output(
                        [self.executable,
                         '--receptor', self.protein_file,
                         '--ligand', ligand_file,
                         '--out', ligand_outfile]
                        + self.params
                        + ['--cpu', str(self.n_cpu)],
                        stderr=subprocess.STDOUT))
            except subprocess.CalledProcessError as e:
                # 'replace' keeps non-ASCII program output from raising a
                # decoding error that would hide the actual failure
                sys.stderr.write(e.output.decode('ascii', 'replace'))
                if self.skip_bad_mols:
                    continue  # TODO: print some warning message
                raise Exception('Autodock Vina failed. Command: "%s"' %
                                ' '.join(e.cmd))

            # docked conformations may have wrong connectivity - use source
            # ligand
            # new_order is reset for every ligand so that a stale (or
            # undefined) order is never applied to the docked poses
            new_order = None
            if is_openbabel_molecule(ligand):
                # compare versions numerically; plain string comparison is
                # lexicographic (e.g. '2.10.0' < '2.4.0')
                version_parts = re.findall(
                    r'\d+', oddt.toolkits.ob.__version__)[:2]
                if tuple(int(part) for part in version_parts) >= (2, 4):
                    # find the order of PDBQT atoms assigned by OpenBabel
                    with open(ligand_file) as f:
                        write_order = [int(line[7:12].strip())
                                       for line in f
                                       if line[:4] == 'ATOM']
                    new_order = sorted(range(len(write_order)),
                                       key=write_order.__getitem__)
                    # OBMol has 1 based idx
                    new_order = [i + 1 for i in new_order]
                    assert len(new_order) == len(ligand.atoms)
                else:
                    # Openbabel 2.3.2 does not support preserving atom
                    # order. We read back the PDBQT ligand to get "correct"
                    # bonding.
                    ligand = next(oddt.toolkit.readfile('pdbqt', ligand_file))
                    if 'REMARK' in ligand.data:
                        del ligand.data['REMARK']

            docked_ligands = oddt.toolkit.readfile('pdbqt', ligand_outfile)
            for docked_ligand, score in zip(docked_ligands, scores):
                # Renumber atoms to match the input ligand
                if (new_order is not None and
                        is_openbabel_molecule(docked_ligand)):
                    docked_ligand.OBMol.RenumberAtoms(new_order)
                # HACK: copy docked coordinates onto source ligand
                # We assume that the order of atoms match between ligands
                clone = ligand.clone
                clone.clone_coords(docked_ligand)
                clone.data.update(score)

                # Calculate RMSD to the input pose
                clone.data['vina_rmsd_input'] = rmsd(ligand, clone)
                clone.data['vina_rmsd_input_min'] = rmsd(
                    ligand, clone, method='min_symmetry')
                output_array.append(clone)
    finally:
        rmtree(ligand_dir)
    return output_array