def test_pdbqt_to_pdb(self):
    """Check that a PDBQT file can be converted back into a PDB block.

    The protein is written as both PDB and PDBQT. The PDBQT file is then
    converted with `pdbqt_utils.pdbqt_to_pdb` and parsed by RDKit, and the
    result is compared (atom count and atomic numbers) with the same PDBQT
    file loaded through `rdkit_utils.load_molecule`.
    """
    _, protein = rdkit_utils.load_molecule(self.protein_file,
                                           calc_charges=False,
                                           add_hydrogens=False)
    with tempfile.TemporaryDirectory() as workdir:
        pdb_path = os.path.join(workdir, "mol.pdb")
        pdbqt_path = os.path.join(workdir, "mol.pdbqt")
        rdkit_utils.write_molecule(protein, pdb_path, is_protein=True)
        rdkit_utils.write_molecule(protein, pdbqt_path, is_protein=True)

        converted_block = pdbqt_utils.pdbqt_to_pdb(pdbqt_path)
        from rdkit import Chem
        from_block = Chem.MolFromPDBBlock(converted_block,
                                          sanitize=False,
                                          removeHs=False)
        _, from_loader = rdkit_utils.load_molecule(pdbqt_path,
                                                   add_hydrogens=False,
                                                   calc_charges=False)

    atom_total = from_block.GetNumAtoms()
    assert atom_total == from_loader.GetNumAtoms()
    block_atoms = from_block.GetAtoms()
    loader_atoms = from_loader.GetAtoms()
    for position in range(atom_total):
        expected = block_atoms[position].GetAtomicNum()
        assert expected == loader_atoms[position].GetAtomicNum()
def _featurize(self, mol_pdb_file, protein_pdb_file):
    """Featurize both fragments of a complex and the merged system.

    Parameters
    ----------
    mol_pdb_file: str
        File for the first fragment (loaded with `is_protein=False`).
    protein_pdb_file: str
        File for the second fragment (loaded with `is_protein=True`).

    Returns
    -------
    tuple or None
        `(frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords,
        frag2_neighbor_list, frag2_z, system_coords, system_neighbor_list,
        system_z)`. `None` is returned when a molecule cannot be loaded or
        when `featurize_mol` raises `ValueError`.
    """
    load_options = dict(sanitize=True, add_hydrogens=False)
    try:
        first_xyz, first_mol = load_molecule(mol_pdb_file,
                                             is_protein=False,
                                             **load_options)
        second_xyz, second_mol = load_molecule(protein_pdb_file,
                                               is_protein=True,
                                               **load_options)
    except MoleculeLoadException:
        # Loading failures are reported and the datapoint is skipped.
        # TODO: Is there a better handling procedure?
        logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
        return None

    # The merged system is assembled from the fragments as loaded, i.e.
    # before hydrogens are stripped from the individual fragments.
    merged_mol = merge_molecules([first_mol, second_mol])
    merged_xyz = get_xyz_from_mol(merged_mol)

    first_xyz, first_mol = self._strip_hydrogens(first_xyz, first_mol)
    second_xyz, second_mol = self._strip_hydrogens(second_xyz, second_mol)
    merged_xyz, merged_mol = self._strip_hydrogens(merged_xyz, merged_mol)

    try:
        first_xyz, first_nbrs, first_z = self.featurize_mol(
            first_xyz, first_mol, self.frag1_num_atoms)
        second_xyz, second_nbrs, second_z = self.featurize_mol(
            second_xyz, second_mol, self.frag2_num_atoms)
        merged_xyz, merged_nbrs, merged_z = self.featurize_mol(
            merged_xyz, merged_mol, self.complex_num_atoms)
    except ValueError:
        logging.warning(
            "max_atoms was set too low. Some complexes too large and skipped")
        return None

    return (first_xyz, first_nbrs, first_z, second_xyz, second_nbrs,
            second_z, merged_xyz, merged_nbrs, merged_z)
def test_merge_molecules(self):
    """Merging two ligands gives a molecule with the summed atom count."""
    here = os.path.dirname(os.path.realpath(__file__))
    first_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    _, first = rdkit_utils.load_molecule(first_path,
                                         calc_charges=False,
                                         add_hydrogens=False)
    # self.ligand_file is 3ws9_ligand.sdf according to the original note.
    _, second = rdkit_utils.load_molecule(self.ligand_file,
                                          calc_charges=False,
                                          add_hydrogens=False)
    expected_total = first.GetNumAtoms() + second.GetNumAtoms()

    combined = rdkit_utils.merge_molecules([first, second])
    assert combined.GetNumAtoms() == expected_total
def test_load_molecule2(self):
    """Loading the 1jld ligand returns both coordinates and a molecule."""
    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    loaded = rdkit_utils.load_molecule(sdf_path,
                                       calc_charges=False,
                                       add_hydrogens=False)
    coords, molecule = loaded
    assert coords is not None
    assert molecule is not None
def _featurize(self, complex: Tuple[str, str]):
    """Build the neighbor list of a ligand/protein complex.

    Parameters
    ----------
    complex: Tuple[str, str]
        `(molecule_file, protein_file)` filenames.

    Returns
    -------
    tuple
        `(system_coords, system_neighbor_list)` where `system_coords` is the
        result of `merge_molecules_xyz` on the two coordinate arrays.
    """
    ligand_path, protein_path = complex
    ligand_xyz, _ligand = load_molecule(ligand_path)
    protein_xyz, _protein = load_molecule(protein_path)

    merged_xyz = merge_molecules_xyz([ligand_xyz, protein_xyz])
    neighbors = compute_neighbor_list(merged_xyz, self.neighbor_cutoff,
                                      self.max_num_neighbors, None)
    return (merged_xyz, neighbors)
def test_write_molecule(self):
    """Writing a ligand to SDF and reloading it preserves its atoms.

    The reloaded molecule must have the same number of atoms as the
    original, and each atom must keep its atomic number.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(current_dir, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(ligand_file,
                                       calc_charges=False,
                                       add_hydrogens=False)

    with tempfile.TemporaryDirectory() as tmp:
        outfile = os.path.join(tmp, "mol.sdf")
        rdkit_utils.write_molecule(mol, outfile)
        _, mol2 = rdkit_utils.load_molecule(outfile,
                                            calc_charges=False,
                                            add_hydrogens=False)

    assert mol.GetNumAtoms() == mol2.GetNumAtoms()
    # Compare the original against the *reloaded* molecule; comparing `mol`
    # with itself would make this check vacuous.
    for atom1, atom2 in zip(mol.GetAtoms(), mol2.GetAtoms()):
        assert atom1.GetAtomicNum() == atom2.GetAtomicNum()
def test_get_xyz_from_mol(self):
    """`get_xyz_from_mol` reproduces the coordinates `load_molecule` returned."""
    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    loaded_xyz, molecule = rdkit_utils.load_molecule(sdf_path,
                                                     calc_charges=False,
                                                     add_hydrogens=False)
    extracted_xyz = rdkit_utils.get_xyz_from_mol(molecule)
    assert np.all(loaded_xyz == extracted_xyz)
def _featurize(self, mol_pdb_file, protein_pdb_file):
    """Build the neighbor list of a ligand/protein complex.

    Parameters
    ----------
    mol_pdb_file: str
        Filename for ligand pdb file.
    protein_pdb_file: str
        Filename for protein pdb file.

    Returns
    -------
    tuple
        `(system_coords, system_neighbor_list)` where `system_coords` is the
        result of `merge_molecules_xyz` on the two coordinate arrays.
    """
    ligand_xyz, _ligand = load_molecule(mol_pdb_file)
    protein_xyz, _protein = load_molecule(protein_pdb_file)

    merged_xyz = merge_molecules_xyz([ligand_xyz, protein_xyz])
    neighbors = compute_neighbor_list(merged_xyz, self.neighbor_cutoff,
                                      self.max_num_neighbors, None)
    return (merged_xyz, neighbors)
def test_convert_protein_to_pdbqt(self):
    """Test a protein in a PDB can be converted to PDBQT.

    The protein is written as PDB, converted in place with
    `pdbqt_utils.convert_protein_to_pdbqt`, and reloaded. The reloaded
    molecule is compared with the original protein by atom count and by
    atomic number.
    """
    from rdkit import Chem
    _, mol = rdkit_utils.load_molecule(self.protein_file,
                                       calc_charges=False,
                                       add_hydrogens=False)
    with tempfile.TemporaryDirectory() as tmp:
        outfile = os.path.join(tmp, "mol.pdbqt")
        writer = Chem.PDBWriter(outfile)
        writer.write(mol)
        writer.close()
        pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
        _, pdbqt_mol = rdkit_utils.load_molecule(outfile,
                                                 add_hydrogens=False,
                                                 calc_charges=False)

    # Compare the converted molecule against the source protein. Comparing
    # `pdbqt_mol` with itself could never fail.
    assert mol.GetNumAtoms() == pdbqt_mol.GetNumAtoms()
    for atom1, atom2 in zip(mol.GetAtoms(), pdbqt_mol.GetAtoms()):
        assert atom1.GetAtomicNum() == atom2.GetAtomicNum()
def test_get_face_boxes_for_protein(self):
    """`get_face_boxes` returns a list of `CoordinateBox` for a protein."""
    here = os.path.dirname(os.path.realpath(__file__))
    pdb_path = os.path.join(here, "1jld_protein.pdb")
    loaded = rdkit_utils.load_molecule(pdb_path)
    face_boxes = box_utils.get_face_boxes(loaded[0])

    assert isinstance(face_boxes, list)
    # Every entry is expected to be a CoordinateBox instance.
    for candidate in face_boxes:
        assert isinstance(candidate, box_utils.CoordinateBox)
def test_merge_molecules_xyz(self):
    """Merging a coordinate array with itself stacks two identical copies."""
    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    coords, _ = rdkit_utils.load_molecule(sdf_path,
                                          calc_charges=False,
                                          add_hydrogens=False)
    stacked = rdkit_utils.merge_molecules_xyz([coords, coords])

    n_atoms = len(coords)
    for row in range(n_atoms):
        # Row `row` must appear in both the first and the second copy.
        in_first_copy = np.all(coords[row] == stacked[row])
        in_second_copy = np.all(coords[row] == stacked[row + n_atoms])
        assert in_first_copy
        assert in_second_copy
def test_load_molecule(self):
    """`load_molecule` returns an `(N, 3)` array and an RDKit `Mol`.

    All four combinations of the two boolean flags passed positionally
    (hydrogens, charges) are exercised.
    """
    # adding hydrogens and charges is tested in dc.utils
    from rdkit.Chem.AllChem import Mol

    flag_pairs = [(hydrogens, charges)
                  for hydrogens in (True, False)
                  for charges in (True, False)]
    for add_hydrogens, calc_charges in flag_pairs:
        coords, molecule = rdkit_utils.load_molecule(self.ligand_file,
                                                     add_hydrogens,
                                                     calc_charges)
        atom_total = molecule.GetNumAtoms()
        self.assertIsInstance(coords, np.ndarray)
        self.assertIsInstance(molecule, Mol)
        self.assertEqual(coords.shape, (atom_total, 3))
def setUp(self):
    """Prepare the fixtures: a 2D cyclobutane plus a protein and a ligand.

    Sets `self.cycle4` (SMILES `C1CCC1` with 2D coordinates), `self.prot`
    and `self.lig` (both loaded sanitized, without hydrogens or charges).
    """
    from rdkit.Chem import MolFromSmiles
    from rdkit.Chem.rdDepictor import Compute2DCoords

    here = os.path.dirname(os.path.realpath(__file__))
    load_options = dict(add_hydrogens=False, calc_charges=False, sanitize=True)

    # simple flat ring
    self.cycle4 = MolFromSmiles('C1CCC1')
    Compute2DCoords(self.cycle4)

    # load and sanitize two real molecules
    protein_path = os.path.join(
        here, '../../feat/tests/data/3ws9_protein_fixer_rdkit.pdb')
    _, self.prot = load_molecule(protein_path, **load_options)

    ligand_path = os.path.join(here, '../../feat//tests/data/3ws9_ligand.sdf')
    _, self.lig = load_molecule(ligand_path, **load_options)
def test_compute_charges(self):
    """`compute_charges` leaves a non-zero Gasteiger charge on some atom.

    Every atom must carry the `_GasteigerCharge` property after the call,
    and at least one of the values must differ from zero.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(current_dir, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(ligand_file,
                                       calc_charges=False,
                                       add_hydrogens=True)
    rdkit_utils.compute_charges(mol)

    # `GetProp` returns the value as a string, and a string never equals the
    # integer 0. The charge has to be parsed before it is compared, otherwise
    # the assertion below passes for any molecule.
    charges = [
        float(atom.GetProp("_GasteigerCharge")) for atom in mol.GetAtoms()
    ]
    has_a_charge = any(charge != 0 for charge in charges)
    assert has_a_charge
def extract_active_site(
        protein_file: str,
        ligand_file: str,
        cutoff: float = 4.0) -> Tuple[CoordinateBox, np.ndarray]:
    """Extracts a box for the active site.

    The protein is loaded without hydrogens and the ligand with hydrogens
    and charges. The protein indices returned by `get_contact_atom_indices`
    select the pocket coordinates, and the box is the integer-rounded
    (floor of the minimum, ceil of the maximum) extent along each axis.

    Parameters
    ----------
    protein_file : str
        Location of protein PDB
    ligand_file : str
        Location of ligand input file
    cutoff : float, optional (default 4.0)
        Distance cutoff forwarded to `get_contact_atom_indices`.

    Returns
    -------
    Tuple[CoordinateBox, np.ndarray]
        A tuple of `(CoordinateBox, np.ndarray)` where the second entry holds
        the coordinates of the selected protein atoms.
    """
    protein = load_molecule(protein_file, add_hydrogens=False)
    ligand = load_molecule(ligand_file, add_hydrogens=True, calc_charges=True)
    protein_contacts, _ = get_contact_atom_indices([protein, ligand],
                                                   cutoff=cutoff)
    pocket_coords = protein[0][protein_contacts]

    axis_ranges = []
    for axis in range(3):
        column = pocket_coords[:, axis]
        lower = int(np.floor(np.amin(column)))
        upper = int(np.ceil(np.amax(column)))
        axis_ranges.append((lower, upper))

    x_range, y_range, z_range = axis_ranges
    return CoordinateBox(x_range, y_range, z_range), pocket_coords
def _featurize(self, datapoint, **kwargs):
    """
    Compute neighbor list for complex.

    Parameters
    ----------
    datapoint: Tuple[str, str]
        Filenames for molecule and protein.
    **kwargs
        Only the deprecated `complex` keyword is read. When present it
        replaces `datapoint` and a `DeprecationWarning` is issued.

    Returns
    -------
    tuple
        `(system_coords, system_neighbor_list)`.
    """
    if 'complex' in kwargs:
        import warnings

        datapoint = kwargs.get("complex")
        # Warn instead of raising: raising made the reassignment above
        # unreachable and turned the deprecated spelling into a hard error.
        warnings.warn(
            'Complex is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    mol_pdb_file, protein_pdb_file = datapoint
    mol_coords, _ = load_molecule(mol_pdb_file)
    protein_coords, _ = load_molecule(protein_pdb_file)
    system_coords = merge_molecules_xyz([mol_coords, protein_coords])

    system_neighbor_list = compute_neighbor_list(system_coords,
                                                 self.neighbor_cutoff,
                                                 self.max_num_neighbors, None)
    return (system_coords, system_neighbor_list)
def find_all_pockets(self, protein_file: str) -> List[CoordinateBox]:
    """Return the face boxes computed for a protein.

    Parameters
    ----------
    protein_file : str
        Protein to load in.

    Returns
    -------
    List[CoordinateBox]
        Result of `get_face_boxes` on the protein coordinates with padding
        `self.pad`. Each pocket is a `CoordinateBox`
    """
    protein_xyz, _unused_mol = load_molecule(protein_file)
    face_boxes = get_face_boxes(protein_xyz, self.pad)
    return face_boxes
def featurize(  # type: ignore[override]
        self, protein_file: str, pockets: List[CoordinateBox]) -> np.ndarray:
    """Count, per pocket, how many atoms belong to each known residue type.

    Parameters
    ----------
    protein_file: str
        Location of PDB file. Will be loaded by MDTraj
    pockets: List[CoordinateBox]
        List of `dc.utils.CoordinateBox` objects.

    Returns
    -------
    np.ndarray
        A numpy array of shape `(len(pockets), n_residues)`

    Raises
    ------
    ImportError
        If mdtraj is not installed.
    """
    try:
        import mdtraj
    except ModuleNotFoundError:
        raise ImportError("This class requires mdtraj to be installed.")

    coords = load_molecule(protein_file,
                           add_hydrogens=False,
                           calc_charges=False)[0]
    atoms_by_box = boxes_to_atoms(coords, pockets)
    trajectory = mdtraj.load(protein_file)

    known_residues = BindingPocketFeaturizer.residues
    column_of = {name: col for col, name in enumerate(known_residues)}
    counts = np.zeros((len(pockets), len(known_residues)))

    for row, box in enumerate(pockets):
        for atom_index in atoms_by_box[box]:
            # The atom label is of format RESX-ATOMTYPE where X is a 1 to 4
            # digit number, so the first three characters name the residue.
            label = str(trajectory.top.atom(atom_index))
            residue_code = label[:3]
            if residue_code in column_of:
                counts[row, column_of[residue_code]] += 1
            else:
                logger.info("Warning: Non-standard residue in PDB file")
    return counts
def test_apply_pdbfixer(self):
    """`apply_pdbfixer` with hydrogenation never reduces the hydrogen count."""

    def hydrogen_total(molecule):
        # Number of atoms whose atomic number is 1.
        atoms = molecule.GetAtoms()
        return sum(1 for position in range(molecule.GetNumAtoms())
                   if atoms[position].GetAtomicNum() == 1)

    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(sdf_path,
                                       calc_charges=False,
                                       add_hydrogens=False)

    hydrogens_before = hydrogen_total(mol)
    assert mol is not None

    mol = rdkit_utils.apply_pdbfixer(mol, hydrogenate=True, is_protein=False)
    assert mol is not None

    hydrogens_after = hydrogen_total(mol)
    assert hydrogens_after >= hydrogens_before
def featurize(self, protein_file, pockets):
    """Count, per pocket, how many atoms belong to each known residue type.

    Parameters
    ----------
    protein_file: str
        Location of PDB file. Will be loaded by MDTraj
    pockets: list[CoordinateBox]
        List of `dc.utils.CoordinateBox` objects.

    Returns
    -------
    A numpy array of shape `(len(pockets), n_residues)`

    Raises
    ------
    ImportError
        If mdtraj is not installed.
    """
    try:
        import mdtraj
    except ModuleNotFoundError:
        raise ImportError("This class requires mdtraj to be installed.")

    protein_coords = load_molecule(protein_file,
                                   add_hydrogens=False,
                                   calc_charges=False)[0]
    mapping = boxes_to_atoms(protein_coords, pockets)
    protein = mdtraj.load(protein_file)

    n_pockets = len(pockets)
    n_residues = len(BindingPocketFeaturizer.residues)
    res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))

    all_features = np.zeros((n_pockets, n_residues))
    for pocket_num, pocket in enumerate(pockets):
        for atom in mapping[pocket]:
            # atom_name is of format RESX-ATOMTYPE
            # where X is a 1 to 4 digit number
            atom_name = str(protein.top.atom(atom))
            residue = atom_name[:3]
            if residue not in res_map:
                logger.info("Warning: Non-standard residue in PDB file")
                continue
            # Only the residue is counted; the atom-type part of the name
            # was parsed but never used, so it is no longer extracted.
            all_features[pocket_num, res_map[residue]] += 1
    return all_features
def find_pockets(self, macromolecule_file: str) -> List[CoordinateBox]:
    """Find list of suitable binding pockets on protein.

    The macromolecule is loaded without hydrogens or charges. Its
    coordinates are turned into boxes by `get_face_boxes` (padded by
    `self.pad`), and the boxes are then passed through
    `merge_overlapping_boxes`.

    Parameters
    ----------
    macromolecule_file : str
        Location of the macromolecule file to load

    Returns
    -------
    List[CoordinateBox]
        List of pockets. Each pocket is a `CoordinateBox`
    """
    xyz, _unused_mol = load_molecule(macromolecule_file,
                                     add_hydrogens=False,
                                     calc_charges=False)
    face_boxes = get_face_boxes(xyz, self.pad)
    return merge_overlapping_boxes(face_boxes)
def test_strip_hydrogens(self):
    """Smoke test: building a fragment and stripping hydrogens do not raise."""
    coords, molecule = rdkit_utils.load_molecule(self.ligand_file)
    MolecularFragment(molecule.GetAtoms(), coords)

    # Test on RDKit
    strip_hydrogens(coords, molecule)
def generate_poses(
        self,
        molecular_complex: Tuple[str, str],
        centroid: Optional[np.ndarray] = None,
        box_dims: Optional[np.ndarray] = None,
        exhaustiveness: int = 10,
        num_modes: int = 9,
        num_pockets: Optional[int] = None,
        out_dir: Optional[str] = None,
        generate_scores: bool = True,
        **kwargs) -> Union[Tuple[DOCKED_POSES, np.ndarray], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

    Parameters
    ----------
    molecular_complex: Tuple[str, str]
        A representation of a molecular complex. This tuple is
        (protein_file, ligand_file). The protein must be a `.pdb` file and
        the ligand a `.sdf` file.
    centroid: np.ndarray, optional (default None)
        Accepted for interface compatibility; not used by this method.
    box_dims: np.ndarray, optional (default None)
        Accepted for interface compatibility; not used by this method.
    exhaustiveness: int (default 10)
        Tells GNINA how exhaustive it should be with pose generation.
    num_modes: int (default 9)
        Tells GNINA how many binding modes it should generate at
        each invocation.
    num_pockets: int, optional (default None)
        Accepted for interface compatibility; not used by this method.
    out_dir: str, optional
        If specified, write generated poses to this directory. The directory
        is created when missing; a temporary directory is used otherwise.
    generate_scores: bool, optional (default True)
        If `True`, the pose generator will return scores for complexes.
        This is used typically when invoking external docking programs
        that compute scores.
    kwargs:
        Any args supported by GNINA as documented
        https://github.com/gnina/gnina#usage
        They are forwarded to `write_gnina_conf`.

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
        Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
        is a list of docked molecular complexes. Each entry in this list
        contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
        `scores` is an array of binding affinities (kcal/mol),
        CNN pose scores, and CNN affinities predicted by GNINA.

    Raises
    ------
    ValueError
        If more than two files are given, the protein file does not end in
        `.pdb`, or the ligand file does not end in `.sdf`.
    """
    if out_dir is None:
        out_dir = tempfile.mkdtemp()
    os.makedirs(out_dir, exist_ok=True)

    # Parse complex
    if len(molecular_complex) > 2:
        raise ValueError(
            "GNINA can only dock protein-ligand complexes and not more general molecular complexes."
        )
    (protein_file, ligand_file) = molecular_complex

    # check filetypes
    if not protein_file.endswith('.pdb'):
        raise ValueError('Protein file must be in .pdb format.')
    if not ligand_file.endswith('.sdf'):
        raise ValueError('Ligand file must be in .sdf format.')

    # `load_molecule` returns (coordinates, molecule); only the molecule is
    # needed for the returned complexes.
    protein_mol = load_molecule(protein_file,
                                calc_charges=True,
                                add_hydrogens=True)
    ligand_name = os.path.basename(ligand_file).split(".")[0]

    # Define locations of log and output files
    log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
    out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
    logger.info("About to call GNINA.")

    # Write GNINA conf file
    conf_file = os.path.join(out_dir, "conf.txt")
    write_gnina_conf(protein_filename=protein_file,
                     ligand_filename=ligand_file,
                     conf_filename=conf_file,
                     num_modes=num_modes,
                     exhaustiveness=exhaustiveness,
                     **kwargs)

    # Run GNINA
    args = [
        self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
        out_file
    ]
    process = Popen(args, stdout=PIPE, stderr=PIPE)
    _, stderr = process.communicate()
    if process.returncode != 0:
        # Surface the failure instead of dropping it; reading the output
        # files below will still raise if GNINA produced nothing.
        logger.warning("GNINA exited with status %s: %s", process.returncode,
                       stderr.decode(errors="replace"))

    # read output and log
    ligands, _ = load_docked_ligands(out_file)
    docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
    scores = read_gnina_log(log_file)

    if generate_scores:
        return docked_complexes, scores
    else:
        return docked_complexes
def generate_poses(
        self,
        molecular_complex: Tuple[str, str],
        centroid: Optional[np.ndarray] = None,
        box_dims: Optional[np.ndarray] = None,
        exhaustiveness: int = 10,
        num_modes: int = 9,
        num_pockets: Optional[int] = None,
        out_dir: Optional[str] = None,
        generate_scores: Optional[bool] = False,
        **kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

    Parameters
    ----------
    molecular_complex: Tuple[str, str]
        A representation of a molecular complex. This tuple is
        (protein_file, ligand_file). The protein should be a pdb file
        and the ligand should be an sdf file.
    centroid: np.ndarray, optional
        The centroid to dock against. Is computed if not specified. Only
        used when `box_dims` is given as well.
    box_dims: np.ndarray, optional
        A numpy array of shape `(3,)` holding the size of the box to dock.
        If not specified is set to size of molecular complex plus 5 angstroms.
    exhaustiveness: int, optional (default 10)
        Tells Autodock Vina how exhaustive it should be with pose generation.
        A higher value of exhaustiveness implies more computation effort for
        the docking experiment.
    num_modes: int, optional (default 9)
        Tells Autodock Vina how many binding modes it should generate at
        each invocation.
    num_pockets: int, optional (default None)
        If specified, `self.pocket_finder` must be set. Will only
        generate poses for the first `num_pockets` returned by
        `self.pocket_finder`.
    out_dir: str, optional
        If specified, write generated poses to this directory. The directory
        is created when missing; a temporary directory is used otherwise.
    generate_scores: bool, optional (default False)
        If `True`, the pose generator will return scores for complexes.
        This is used typically when invoking external docking programs
        that compute scores.
    kwargs:
        The kwargs - cpu (default 0), min_rmsd (default 1.0), max_evals
        (default 0), energy_range (default 3.0) supported by VINA are as
        documented in https://autodock-vina.readthedocs.io/en/latest/vina.html

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
        Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
        is a list of docked molecular complexes. Each entry in this list
        contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
        `scores` is a list of binding free energies predicted by Vina.

    Raises
    ------
    `ValueError` if `num_pockets` is set but `self.pocket_finder is None`,
    or if `molecular_complex` has more than two entries.
    `ImportError` if the `vina` package is not installed.
    """
    cpu = kwargs.get("cpu", 0)
    min_rmsd = kwargs.get("min_rmsd", 1.0)
    max_evals = kwargs.get("max_evals", 0)
    energy_range = kwargs.get("energy_range", 3.0)

    try:
        from vina import Vina
    except ModuleNotFoundError:
        raise ImportError("This function requires vina to be installed")

    if num_pockets is not None and self.pocket_finder is None:
        raise ValueError(
            "If num_pockets is specified, pocket_finder must have been provided at construction time."
        )

    # Parse complex
    if len(molecular_complex) > 2:
        raise ValueError(
            "Autodock Vina can only dock protein-ligand complexes and not more general molecular complexes."
        )
    (protein_file, ligand_file) = molecular_complex

    # Every intermediate file is written below `out_dir`, so it has to exist.
    if out_dir is None:
        out_dir = tempfile.mkdtemp()
    else:
        os.makedirs(out_dir, exist_ok=True)

    # Prepare protein
    protein_name = os.path.basename(protein_file).split(".")[0]
    protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % protein_name)
    protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % protein_name)
    # `load_molecule` returns (coordinates, molecule).
    protein_mol = load_molecule(protein_file,
                                calc_charges=True,
                                add_hydrogens=True)
    write_molecule(protein_mol[1], protein_hyd, is_protein=True)
    write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)

    # Get protein centroid and range
    if centroid is not None and box_dims is not None:
        centroids = [centroid]
        dimensions = [box_dims]
    else:
        if self.pocket_finder is None:
            logger.info("Pockets not specified. Will use whole protein to dock")
            centroids = [compute_centroid(protein_mol[0])]
            dimensions = [compute_protein_range(protein_mol[0]) + 5.0]
        else:
            logger.info("About to find putative binding pockets")
            pockets = self.pocket_finder.find_pockets(protein_file)
            logger.info("%d pockets found in total" % len(pockets))
            logger.info("Computing centroid and size of proposed pockets.")
            centroids, dimensions = [], []
            for pocket in pockets:
                (x_min, x_max), (y_min, y_max), (
                    z_min,
                    z_max) = pocket.x_range, pocket.y_range, pocket.z_range
                # TODO(rbharath: Does vina divide box dimensions by 2?
                x_box = (x_max - x_min) / 2.
                y_box = (y_max - y_min) / 2.
                z_box = (z_max - z_min) / 2.
                centroids.append(pocket.center())
                dimensions.append(np.array((x_box, y_box, z_box)))

    if num_pockets is not None:
        logger.info(
            "num_pockets = %d so selecting this many pockets for docking." %
            num_pockets)
        centroids = centroids[:num_pockets]
        dimensions = dimensions[:num_pockets]

    # Prepare ligand
    ligand_name = os.path.basename(ligand_file).split(".")[0]
    ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)
    ligand_mol = load_molecule(ligand_file,
                               calc_charges=True,
                               add_hydrogens=True)
    write_molecule(ligand_mol[1], ligand_pdbqt)

    docked_complexes = []
    all_scores = []
    vpg = Vina(sf_name='vina',
               cpu=cpu,
               seed=0,
               no_refine=False,
               verbosity=1)
    # Distinct loop names keep the `centroid` / `box_dims` arguments intact.
    for i, (pocket_centroid, pocket_dims) in enumerate(
            zip(centroids, dimensions)):
        logger.info("Docking in pocket %d/%d" % (i + 1, len(centroids)))
        logger.info("Docking with center: %s" % str(pocket_centroid))
        logger.info("Box dimensions: %s" % str(pocket_dims))
        # Write Vina conf file
        conf_file = os.path.join(out_dir, "conf.txt")
        write_vina_conf(protein_pdbqt,
                        ligand_pdbqt,
                        pocket_centroid,
                        pocket_dims,
                        conf_file,
                        num_modes=num_modes,
                        exhaustiveness=exhaustiveness)

        # Define locations of output files
        out_pdbqt = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
        logger.info("About to call Vina")

        vpg.set_receptor(protein_pdbqt)
        vpg.set_ligand_from_file(ligand_pdbqt)

        vpg.compute_vina_maps(center=pocket_centroid, box_size=pocket_dims)
        vpg.dock(exhaustiveness=exhaustiveness,
                 n_poses=num_modes,
                 min_rmsd=min_rmsd,
                 max_evals=max_evals)
        vpg.write_poses(out_pdbqt,
                        n_poses=num_modes,
                        energy_range=energy_range,
                        overwrite=True)

        ligands, scores = load_docked_ligands(out_pdbqt)
        docked_complexes += [(protein_mol[1], ligand) for ligand in ligands]
        all_scores += scores

    if generate_scores:
        return docked_complexes, all_scores
    else:
        return docked_complexes
def _featurize(self, complex):
    """Computes grid featurization of protein/ligand complex.

    Takes as input filenames pdb of the protein, pdb of the ligand.

    The ligand centroid is computed and subtracted from the coordinates of
    both protein and ligand. Features are then computed for the centered
    system and for `self.nb_rotations` systems produced by
    `rotate_molecules`, using every entry of `self.feature_types`.

    Parameters
    ----------
    complex: Tuple[str, str]
        Filenames for molecule and protein.

    Returns
    -------
    np.ndarray or None
        Squeezed array holding the features of all transformed systems, or
        `None` when a molecule cannot be loaded.
    """
    try:
        mol_pdb_file, protein_pdb_file = complex
        time1 = time.time()
        protein_xyz, protein_rdk = load_molecule(protein_pdb_file,
                                                 calc_charges=True,
                                                 sanitize=self.sanitize)
        time2 = time.time()
        # No extra positional argument may follow the format values:
        # logging would try to interpolate it into the message and report
        # a formatting error when the record is emitted.
        logger.info("TIMING: Loading protein coordinates took %0.3f s",
                    time2 - time1)

        time1 = time.time()
        ligand_xyz, ligand_rdk = load_molecule(mol_pdb_file,
                                               calc_charges=True,
                                               sanitize=self.sanitize)
        time2 = time.time()
        logger.info("TIMING: Loading ligand coordinates took %0.3f s",
                    time2 - time1)
    except MoleculeLoadException:
        logger.warning("Some molecules cannot be loaded by Rdkit. Skipping")
        return None

    time1 = time.time()
    centroid = compute_centroid(ligand_xyz)
    ligand_xyz = subtract_centroid(ligand_xyz, centroid)
    protein_xyz = subtract_centroid(protein_xyz, centroid)
    time2 = time.time()
    logger.info("TIMING: Centroid processing took %0.3f s", time2 - time1)

    # Distances are computed once, on the centered (unrotated) system, and
    # reused for every transformed system below.
    pairwise_distances = compute_pairwise_distances(protein_xyz, ligand_xyz)

    transformed_systems = {}
    transformed_systems[(0, 0)] = [protein_xyz, ligand_xyz]
    for i in range(self.nb_rotations):
        rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
        transformed_systems[(i + 1, 0)] = rotated_system

    features_dict = {}
    for system_id, (system_protein_xyz,
                    system_ligand_xyz) in transformed_systems.items():
        feature_arrays = []
        for _is_flat, function_name in self.feature_types:
            result = self._compute_feature(
                function_name,
                system_protein_xyz,
                protein_rdk,
                system_ligand_xyz,
                ligand_rdk,
                pairwise_distances,
            )
            feature_arrays += result

            # Re-concatenated after each feature type; the entry written on
            # the last iteration is the one that is kept.
            if self.flatten:
                features_dict[system_id] = np.concatenate([
                    feature_array.flatten()
                    for feature_array in feature_arrays
                ])
            else:
                features_dict[system_id] = np.concatenate(feature_arrays,
                                                          axis=-1)

    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features
def generate_poses(
    self,
    molecular_complex: Tuple[str, str],
    centroid: Optional[np.ndarray] = None,
    box_dims: Optional[np.ndarray] = None,
    exhaustiveness: int = 10,
    num_modes: int = 9,
    num_pockets: Optional[int] = None,
    out_dir: Optional[str] = None,
    generate_scores: bool = False
) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

    TODO: How can this work on Windows? We need to install a .msi file and
    invoke it correctly from Python for this to work.

    Parameters
    ----------
    molecular_complex: Tuple[str, str]
        A representation of a molecular complex. This tuple is
        (protein_file, ligand_file).
    centroid: np.ndarray, optional
        The centroid to dock against. Is computed if not specified. Only
        used when `box_dims` is given as well.
    box_dims: np.ndarray, optional
        A numpy array of shape `(3,)` holding the size of the box to dock.
        If not specified is set to size of molecular complex plus 5 angstroms.
    exhaustiveness: int, optional (default 10)
        Tells Autodock Vina how exhaustive it should be with pose generation.
    num_modes: int, optional (default 9)
        Tells Autodock Vina how many binding modes it should generate at
        each invocation.
    num_pockets: int, optional (default None)
        If specified, `self.pocket_finder` must be set. Will only
        generate poses for the first `num_pockets` returned by
        `self.pocket_finder`.
    out_dir: str, optional
        If specified, write generated poses to this directory. The directory
        is created when missing; a temporary directory is used otherwise.
    generate_scores: bool, optional (default False)
        If `True`, the pose generator will return scores for complexes.
        This is used typically when invoking external docking programs
        that compute scores.

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
        Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
        is a list of docked molecular complexes. Each entry in this list
        contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
        `scores` is a list of binding free energies predicted by Vina.

    Raises
    ------
    `ValueError` if `num_pockets` is set but `self.pocket_finder is None`,
    or if `molecular_complex` has more than two entries.
    """
    if num_pockets is not None and self.pocket_finder is None:
        raise ValueError(
            "If num_pockets is specified, pocket_finder must have been provided at construction time."
        )

    # Parse complex
    if len(molecular_complex) > 2:
        raise ValueError(
            "Autodock Vina can only dock protein-ligand complexes and not more general molecular complexes."
        )
    (protein_file, ligand_file) = molecular_complex

    # Every intermediate file is written below `out_dir`, so it has to exist.
    if out_dir is None:
        out_dir = tempfile.mkdtemp()
    else:
        os.makedirs(out_dir, exist_ok=True)

    # Prepare protein
    protein_name = os.path.basename(protein_file).split(".")[0]
    protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % protein_name)
    protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % protein_name)
    # `load_molecule` returns (coordinates, molecule).
    protein_mol = load_molecule(protein_file,
                                calc_charges=True,
                                add_hydrogens=True)
    write_molecule(protein_mol[1], protein_hyd, is_protein=True)
    write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)

    # Get protein centroid and range
    if centroid is not None and box_dims is not None:
        centroids = [centroid]
        dimensions = [box_dims]
    else:
        if self.pocket_finder is None:
            logger.info("Pockets not specified. Will use whole protein to dock")
            centroids = [compute_centroid(protein_mol[0])]
            dimensions = [compute_protein_range(protein_mol[0]) + 5.0]
        else:
            logger.info("About to find putative binding pockets")
            pockets = self.pocket_finder.find_pockets(protein_file)
            logger.info("%d pockets found in total" % len(pockets))
            logger.info("Computing centroid and size of proposed pockets.")
            centroids, dimensions = [], []
            for pocket in pockets:
                (x_min, x_max), (y_min, y_max), (
                    z_min,
                    z_max) = pocket.x_range, pocket.y_range, pocket.z_range
                # TODO(rbharath: Does vina divide box dimensions by 2?
                x_box = (x_max - x_min) / 2.
                y_box = (y_max - y_min) / 2.
                z_box = (z_max - z_min) / 2.
                centroids.append(pocket.center())
                dimensions.append((x_box, y_box, z_box))

    if num_pockets is not None:
        logger.info(
            "num_pockets = %d so selecting this many pockets for docking." %
            num_pockets)
        centroids = centroids[:num_pockets]
        dimensions = dimensions[:num_pockets]

    # Prepare ligand
    ligand_name = os.path.basename(ligand_file).split(".")[0]
    ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)
    ligand_mol = load_molecule(ligand_file,
                               calc_charges=True,
                               add_hydrogens=True)
    write_molecule(ligand_mol[1], ligand_pdbqt)

    docked_complexes = []
    all_scores = []
    # Distinct loop names keep the `centroid` / `box_dims` arguments intact.
    for i, (pocket_centroid, pocket_dims) in enumerate(
            zip(centroids, dimensions)):
        logger.info("Docking in pocket %d/%d" % (i + 1, len(centroids)))
        logger.info("Docking with center: %s" % str(pocket_centroid))
        logger.info("Box dimensions: %s" % str(pocket_dims))
        # Write Vina conf file
        conf_file = os.path.join(out_dir, "conf.txt")
        write_vina_conf(protein_pdbqt,
                        ligand_pdbqt,
                        pocket_centroid,
                        pocket_dims,
                        conf_file,
                        num_modes=num_modes,
                        exhaustiveness=exhaustiveness)

        # Define locations of log and output files
        log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
        out_pdbqt = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
        logger.info("About to call Vina")
        # An argument list without a shell behaves the same on every
        # platform and keeps paths with spaces or shell metacharacters
        # intact. With `shell=True` a list only passes its first element
        # as the command on POSIX, which is why a string used to be needed.
        args = [
            self.vina_cmd, "--config", conf_file, "--log", log_file, "--out",
            out_pdbqt
        ]
        # FIXME: We should use `subprocess.run` instead of `call`
        call(args)
        ligands, scores = load_docked_ligands(out_pdbqt)
        docked_complexes += [(protein_mol[1], ligand) for ligand in ligands]
        all_scores += scores

    if generate_scores:
        return docked_complexes, all_scores
    else:
        return docked_complexes
def test_create_molecular_fragment(self):
    """A fragment built from a ligand keeps its atoms and its coordinates."""
    coords, molecule = rdkit_utils.load_molecule(self.ligand_file)
    built = MolecularFragment(molecule.GetAtoms(), coords)

    source_atom_total = len(molecule.GetAtoms())
    assert source_atom_total == len(built.GetAtoms())
    coords_match = built.GetCoords() == coords
    assert coords_match.all()
def test_merge_molecular_fragments(self):
    """Merging two copies of a fragment doubles the number of atoms."""
    coords, molecule = rdkit_utils.load_molecule(self.ligand_file)
    copies = [
        MolecularFragment(molecule.GetAtoms(), coords),
        MolecularFragment(molecule.GetAtoms(), coords),
    ]
    merged = merge_molecular_fragments(copies)

    doubled_total = len(molecule.GetAtoms()) * 2
    assert doubled_total == len(merged.GetAtoms())