Пример #1
0
def featurize_binding_pocket_sybyl(protein_xyz,
                                   protein,
                                   ligand_xyz,
                                   ligand,
                                   pairwise_distances=None,
                                   cutoff=7.0):
    """Build Sybyl dicts for the ligand and for the protein atoms near it.

    Parameters
    ----------
    protein_xyz: np.ndarray
        Of shape (N_protein_atoms, 3)
    protein: Rdkit Molecule
        Protein molecule, forwarded to `compute_all_sybyl`.
    ligand_xyz: np.ndarray
        Of shape (N_ligand_atoms, 3)
    ligand: Rdkit Molecule
        Ligand molecule, forwarded to `compute_all_sybyl`.
    pairwise_distances: np.ndarray, optional
        Array of pairwise protein-ligand distances (Angstroms). When omitted
        it is computed from `protein_xyz` and `ligand_xyz`.
    cutoff: float
        Cutoff distance for contact consideration.

    Returns
    -------
    tuple
        `(protein_sybyl_dict, ligand_sybyl_dict)`. The protein dict is
        computed only for atoms closer than `cutoff` to some ligand atom;
        the ligand dict is computed without an index restriction.
    """
    distances = pairwise_distances
    if distances is None:
        distances = compute_pairwise_distances(protein_xyz, ligand_xyz)

    # First axis of the distance matrix indexes protein atoms, so the row
    # indices of the below-cutoff entries are the contacting protein atoms.
    contact_rows = np.nonzero(distances < cutoff)[0]
    pocket_atoms = {int(row) for row in contact_rows.tolist()}

    pocket_sybyl = compute_all_sybyl(protein, indices=pocket_atoms)
    ligand_sybyl = compute_all_sybyl(ligand)
    return (pocket_sybyl, ligand_sybyl)
Пример #2
0
    def _featurize(self, complex: Tuple[str, str]):
        """Compute hashed ECFP contact fingerprints for a molecular complex.

        Parameters
        ----------
        complex: Tuple[str, str]
            Filenames for molecule and protein.

        Returns
        -------
        The concatenation of the fingerprint vectors produced for every
        pair of fragments, or None when the complex cannot be loaded.
        """
        try:
            fragments = load_complex(complex, add_hydrogens=False)
        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        fingerprints = []
        for first, second in itertools.combinations(fragments, 2):
            # Element 0 of a fragment holds its coordinates.
            pair_distances = compute_pairwise_distances(first[0], second[0])
            contact_dicts = featurize_contacts_ecfp(first,
                                                    second,
                                                    pair_distances,
                                                    cutoff=self.cutoff,
                                                    ecfp_degree=self.radius)
            # One hashed vector per ECFP dict returned for the pair.
            for contact_dict in contact_dicts:
                fingerprints.append(
                    vectorize(hash_ecfp,
                              feature_dict=contact_dict,
                              size=self.size))

        return np.concatenate(fingerprints)
    def _featurize(self, mol_pdb: str, complex_pdb: str):
        """Compute hashed SPLIF fingerprint vectors for a molecular complex.

        Parameters
        ----------
        mol_pdb: str
            Filename for ligand molecule
        complex_pdb: str
            Filename for protein molecule

        Returns
        -------
        The concatenation of the SPLIF vectors produced for every pair of
        fragments, or None when the complex cannot be loaded.
        """
        try:
            fragments = load_complex((mol_pdb, complex_pdb),
                                     add_hydrogens=False)
        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        splif_vectors = []
        for first, second in itertools.combinations(fragments, 2):
            # Element 0 of a fragment holds its coordinates.
            pair_distances = compute_pairwise_distances(first[0], second[0])
            splif_dicts = featurize_splif(first, second, self.contact_bins,
                                          pair_distances, self.radius)
            # One hashed vector per SPLIF dict returned for the pair.
            for splif_dict in splif_dicts:
                splif_vectors.append(
                    vectorize(hash_ecfp_pair,
                              feature_dict=splif_dict,
                              size=self.size))

        return np.concatenate(splif_vectors)
Пример #4
0
    def test_compute_pairwise_distances(self):
        """Check output shape, value bounds and metric of the distance helper."""
        n1 = 10
        n2 = 50
        coords1 = np.random.rand(n1, 3)
        coords2 = np.random.rand(n2, 3)

        distance = compute_pairwise_distances(coords1, coords2)
        self.assertEqual(distance.shape, (n1, n2))
        self.assertTrue((distance >= 0).all())
        # The random points lie in the 3-D unit cube, whose longest diagonal
        # is sqrt(3). sqrt(2) is only the bound in 2-D; using it here makes
        # the test fail intermittently for far-apart points.
        self.assertTrue((distance <= 3.0**0.5).all())

        # check if correct distance metric was used
        coords1 = np.array([[0, 0, 0], [1, 0, 0]])
        coords2 = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]])
        distance = compute_pairwise_distances(coords1, coords2)
        self.assertTrue((distance == [[1, 2, 3], [0, 1, 2]]).all())
Пример #5
0
    def _featurize(self, datapoint, **kwargs):  # -> Optional[np.ndarray]:
        """Voxelize hydrogen bonds for a single mol/protein complex.

        Parameters
        ----------
        datapoint: Tuple[str, str]
            Filenames for molecule and protein.
        **kwargs
            Only the deprecated `complex` keyword is inspected. When present
            it replaces `datapoint` and a `DeprecationWarning` is issued.

        Returns
        -------
        The hydrogen-bond voxel grids of every fragment pair concatenated
        along the last axis, or None when the complex cannot be loaded.
        """
        if 'complex' in kwargs:
            # Function-scope import so the block does not depend on a
            # file-level `import warnings`.
            import warnings

            datapoint = kwargs.get("complex")
            # Warn rather than raise: raising made the assignment above dead
            # code and turned a "being phased out" notice into a hard error.
            warnings.warn(
                'Complex is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)
        try:
            fragments = rdkit_utils.load_complex(datapoint,
                                                 add_hydrogens=False)

        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None
        pairwise_features = []
        # The centroid is taken from the full complex, before the optional
        # reduction to contact atoms below.
        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        if self.reduce_to_contacts:
            fragments = reduce_molecular_complex_to_contacts(
                fragments, self.cutoff)
        for (frag1_ind,
             frag2_ind) in itertools.combinations(range(len(fragments)), 2):
            frag1, frag2 = fragments[frag1_ind], fragments[frag2_ind]
            # Distances use the original coordinates; voxelization uses the
            # centroid-shifted ones.
            distances = compute_pairwise_distances(frag1[0], frag2[0])
            frag1_xyz = subtract_centroid(frag1[0], centroid)
            frag2_xyz = subtract_centroid(frag2[0], centroid)
            xyzs = [frag1_xyz, frag2_xyz]
            # One grid per hydrogen-bond list: the voxelizations for both
            # fragments' coordinates are summed, then the grids are stacked
            # along the last axis.
            pairwise_features.append(
                np.concatenate([
                    sum([
                        voxelize(convert_atom_pair_to_voxel,
                                 hash_function=None,
                                 box_width=self.box_width,
                                 voxel_width=self.voxel_width,
                                 coordinates=xyz,
                                 feature_list=hbond_list,
                                 nb_channel=1) for xyz in xyzs
                    ]) for hbond_list in compute_hydrogen_bonds(
                        frag1, frag2, distances, self.distance_bins,
                        self.angle_cutoffs)
                ],
                               axis=-1))
        # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
        return np.concatenate(pairwise_features, axis=-1)
Пример #6
0
    def _featurize(self, mol_pdb: str, protein_pdb: str):
        """Voxelize ECFP contact fingerprints for a molecular complex.

        Parameters
        ----------
        mol_pdb: str
            Filename for ligand molecule
        protein_pdb: str
            Filename for protein molecule

        Returns
        -------
        None when the complex cannot be loaded. Otherwise the per-pair voxel
        grids, either flattened and joined into one vector
        (`self.flatten` true) or concatenated along the last axis.
        """
        try:
            fragments = load_complex((mol_pdb, protein_pdb),
                                     add_hydrogens=False)
        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        pairwise_features: List[np.ndarray] = []
        for first, second in itertools.combinations(fragments, 2):
            # Distances use the original coordinates; voxelization uses the
            # centroid-shifted ones.
            pair_distances = compute_pairwise_distances(first[0], second[0])
            centered = [
                subtract_centroid(first[0], centroid),
                subtract_centroid(second[0], centroid),
            ]
            contact_dicts = featurize_contacts_ecfp(first,
                                                    second,
                                                    pair_distances,
                                                    cutoff=self.cutoff,
                                                    ecfp_degree=self.radius)
            # Each fragment's coordinates are voxelized with its own ECFP
            # dict; the resulting grids are summed into one per pair.
            grids = []
            for xyz, ecfp_dict in zip(centered, contact_dicts):
                grids.append(
                    voxelize(convert_atom_to_voxel,
                             xyz,
                             self.box_width,
                             self.voxel_width,
                             hash_function=hash_ecfp,
                             feature_dict=ecfp_dict,
                             nb_channel=self.size))
            pairwise_features.append(sum(grids))

        if not self.flatten:
            # Features are of shape (voxels_per_edge, voxels_per_edge,
            # voxels_per_edge, num_feat) so we concatenate on the last axis.
            return np.concatenate(pairwise_features, axis=-1)
        return np.concatenate([grid.flatten() for grid in pairwise_features])
Пример #7
0
  def _featurize(self, mol_pdb: str, protein_pdb: str) -> np.ndarray:
    """Voxelize hydrogen bonds for a single mol/protein complex.

    Parameters
    ----------
    mol_pdb: str
      Filename for ligand molecule
    protein_pdb: str
      Filename for protein molecule

    Returns
    -------
    The hydrogen-bond voxel grids of every fragment pair concatenated along
    the last axis. Note that None is returned when the complex cannot be
    loaded, despite the annotation.
    """
    try:
      fragments = rdkit_utils.load_complex(
          (mol_pdb, protein_pdb), add_hydrogens=False)
    except MoleculeLoadException:
      logger.warning("This molecule cannot be loaded by Rdkit. Returning None")
      return None

    # The centroid is taken from the full complex, before the optional
    # reduction to contact atoms.
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    if self.reduce_to_contacts:
      fragments = reduce_molecular_complex_to_contacts(fragments, self.cutoff)

    pairwise_features = []
    for i, j in itertools.combinations(range(len(fragments)), 2):
      first = fragments[i]
      second = fragments[j]
      pair_distances = compute_pairwise_distances(first[0], second[0])
      centered = [
          subtract_centroid(first[0], centroid),
          subtract_centroid(second[0], centroid),
      ]
      # One grid per hydrogen-bond list: the voxelizations for both
      # fragments' coordinates are summed into a single grid.
      channels = []
      for hbond_list in compute_hydrogen_bonds(first, second, pair_distances,
                                               self.distance_bins,
                                               self.angle_cutoffs):
        grids = [
            voxelize(
                convert_atom_pair_to_voxel,
                hash_function=None,
                box_width=self.box_width,
                voxel_width=self.voxel_width,
                coordinates=xyz,
                feature_list=hbond_list,
                nb_channel=1) for xyz in centered
        ]
        channels.append(sum(grids))
      pairwise_features.append(np.concatenate(channels, axis=-1))

    # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
    return np.concatenate(pairwise_features, axis=-1)
Пример #8
0
    def _featurize(self, complex: Tuple[str, str]) -> Optional[np.ndarray]:
        """Voxelize salt bridges for a single mol/protein complex.

        Parameters
        ----------
        complex: Tuple[str, str]
            Filenames for molecule and protein.

        Returns
        -------
        The salt-bridge voxel grids of every fragment pair concatenated
        along the last axis, or None when the complex cannot be loaded.
        """
        try:
            fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)

        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None
        pairwise_features = []
        # The centroid is taken from the full complex, before the optional
        # reduction to contact atoms below.
        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        if self.reduce_to_contacts:
            fragments = reduce_molecular_complex_to_contacts(
                fragments, self.cutoff)
        for (frag1_ind,
             frag2_ind) in itertools.combinations(range(len(fragments)), 2):
            frag1, frag2 = fragments[frag1_ind], fragments[frag2_ind]
            distances = compute_pairwise_distances(frag1[0], frag2[0])
            frag1_xyz = subtract_centroid(frag1[0], centroid)
            frag2_xyz = subtract_centroid(frag2[0], centroid)
            xyzs = [frag1_xyz, frag2_xyz]
            # The salt-bridge list depends only on the fragment pair, so it
            # is computed once here instead of once per coordinate set.
            # NOTE(review): assumes compute_salt_bridges has no side effects
            # and voxelize does not mutate feature_list - confirm.
            salt_bridges = compute_salt_bridges(frag1[1],
                                                frag2[1],
                                                distances,
                                                cutoff=self.cutoff)
            pairwise_features.append(
                sum([
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=None,
                             coordinates=xyz,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             feature_list=salt_bridges,
                             nb_channel=1) for xyz in xyzs
                ]))
        # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 1) so we should concatenate on the last axis.
        return np.concatenate(pairwise_features, axis=-1)
Пример #9
0
    def _featurize(self, datapoint, **kwargs):
        """Voxelize SPLIF fingerprints for a molecular complex.

        Parameters
        ----------
        datapoint: Tuple[str, str]
            Filenames for molecule and protein.
        **kwargs
            Only the deprecated `complex` keyword is inspected. When present
            it replaces `datapoint` and a `DeprecationWarning` is issued.

        Returns
        -------
        The SPLIF voxel grids of every fragment pair concatenated along the
        last axis, or None when the complex cannot be loaded.
        """
        if 'complex' in kwargs:
            # Function-scope import so the block does not depend on a
            # file-level `import warnings`.
            import warnings

            datapoint = kwargs.get("complex")
            # Warn rather than raise: raising made the assignment above dead
            # code and turned a "being phased out" notice into a hard error.
            warnings.warn(
                'Complex is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        try:
            fragments = load_complex(datapoint, add_hydrogens=False)

        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None
        pairwise_features = []
        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        for (frag1, frag2) in itertools.combinations(fragments, 2):
            # Distances use the original coordinates; voxelization uses the
            # centroid-shifted ones.
            distances = compute_pairwise_distances(frag1[0], frag2[0])
            frag1_xyz = subtract_centroid(frag1[0], centroid)
            frag2_xyz = subtract_centroid(frag2[0], centroid)
            xyzs = [frag1_xyz, frag2_xyz]
            # One grid per SPLIF dict; both coordinate sets are handed to the
            # atom-pair voxelizer together.
            pairwise_features.append(
                np.concatenate([
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=hash_ecfp_pair,
                             coordinates=xyzs,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             feature_dict=splif_dict,
                             nb_channel=self.size) for splif_dict in
                    featurize_splif(frag1, frag2, self.contact_bins, distances,
                                    self.radius)
                ],
                               axis=-1))
        # Per-pair grids are joined along the last axis.
        return np.concatenate(pairwise_features, axis=-1)
Пример #10
0
def get_contact_atom_indices(fragments: List[Tuple[np.ndarray, RDKitMol]],
                             cutoff: float = 4.5) -> List[List[int]]:
    """Compute the atoms that lie close to a contact region.

    Molecular complexes can get very large, which makes them unwieldy to
    compute on. To reduce memory usage it helps to trim atoms that are far
    from any contact region. This function computes pairwise distances
    between every pair of molecules in the complex; an atom within `cutoff`
    of any atom of another molecule is a contact atom and is kept, all other
    atoms are trimmed.

    Parameters
    ----------
    fragments: List[Tuple[np.ndarray, RDKit Mol]]
        As returned by `rdkit_utils.load_complex`, a list of tuples of
        `(coords, mol)` where `coords` is a `(N_atoms, 3)` array and `mol`
        is the rdkit molecule object.
    cutoff: float, optional (default 4.5)
        The cutoff distance in angstroms.

    Returns
    -------
    List[List[int]]
        A list of length `len(fragments)`. Each entry is the sorted list of
        atom indices of that molecule which should be kept.
    """
    kept: List[Set[int]] = [set() for _ in fragments]
    for i, j in itertools.combinations(range(len(fragments)), 2):
        distances = compute_pairwise_distances(fragments[i][0],
                                               fragments[j][0])
        # np.nonzero yields (row_indices, col_indices) of the below-cutoff
        # entries: rows index atoms of fragment i, columns atoms of fragment j.
        hits = np.nonzero(distances < cutoff)
        atoms_i = {int(row) for row in hits[0].tolist()}
        atoms_j = {int(col) for col in hits[1].tolist()}
        kept[i] |= atoms_i
        kept[j] |= atoms_j
    return [sorted(atoms) for atoms in kept]
Пример #11
0
def featurize_contacts_ecfp(
        frag1: Tuple,
        frag2: Tuple,
        pairwise_distances: np.ndarray = None,
        cutoff: float = 4.5,
        ecfp_degree: int = 2) -> Tuple[Dict[int, str], Dict[int, str]]:
    """Compute ECFP dicts for the contacting atoms of two molecular fragments.

    Parameters
    ----------
    frag1: Tuple
        A tuple of (coords, mol) returned by `load_molecule`.
    frag2: Tuple
        A tuple of (coords, mol) returned by `load_molecule`.
    pairwise_distances: np.ndarray
        Array of pairwise fragment-fragment distances (Angstroms). Computed
        from the fragment coordinates when None.
    cutoff: float
        Cutoff distance for contact consideration
    ecfp_degree: int
        ECFP radius

    Returns
    -------
    Tuple of dictionaries of ECFP contact fragments, one for `frag1` and one
    for `frag2`, each restricted to that fragment's contact atoms.
    """
    distances = pairwise_distances
    if distances is None:
        distances = compute_pairwise_distances(frag1[0], frag2[0])

    # np.nonzero yields (row_indices, col_indices) of the below-cutoff
    # entries: rows index frag1 atoms, columns index frag2 atoms.
    hits = np.nonzero(distances < cutoff)
    contact_atoms_1 = {int(row) for row in hits[0].tolist()}
    contact_atoms_2 = {int(col) for col in hits[1].tolist()}

    ecfp_1 = compute_all_ecfp(frag1[1],
                              indices=contact_atoms_1,
                              degree=ecfp_degree)
    ecfp_2 = compute_all_ecfp(frag2[1],
                              indices=contact_atoms_2,
                              degree=ecfp_degree)
    return (ecfp_1, ecfp_2)
    def _featurize(self, mol_pdb: str, complex_pdb: str):
        """Voxelize SPLIF fingerprints for a molecular complex.

        Parameters
        ----------
        mol_pdb: str
            Filename for ligand molecule
        complex_pdb: str
            Filename for protein molecule

        Returns
        -------
        The SPLIF voxel grids of every fragment pair concatenated along the
        last axis, or None when the complex cannot be loaded.
        """
        try:
            fragments = load_complex((mol_pdb, complex_pdb),
                                     add_hydrogens=False)
        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        pairwise_features = []
        for first, second in itertools.combinations(fragments, 2):
            # Distances use the original coordinates; voxelization uses the
            # centroid-shifted ones.
            pair_distances = compute_pairwise_distances(first[0], second[0])
            centered = [
                subtract_centroid(first[0], centroid),
                subtract_centroid(second[0], centroid),
            ]
            # One grid per SPLIF dict; both coordinate sets are handed to the
            # atom-pair voxelizer together.
            grids = []
            for splif_dict in featurize_splif(first, second,
                                              self.contact_bins,
                                              pair_distances, self.radius):
                grids.append(
                    voxelize(convert_atom_pair_to_voxel,
                             hash_function=hash_ecfp_pair,
                             coordinates=centered,
                             box_width=self.box_width,
                             voxel_width=self.voxel_width,
                             feature_dict=splif_dict,
                             nb_channel=self.size))
            pairwise_features.append(np.concatenate(grids, axis=-1))

        # Per-pair grids are joined along the last axis.
        return np.concatenate(pairwise_features, axis=-1)
Пример #13
0
    def _featurize(self, complex: Tuple[str, str]) -> Optional[np.ndarray]:
        """Count hydrogen bonds for a single mol/protein complex.

        Parameters
        ----------
        complex: Tuple[str, str]
            Filenames for molecule and protein.

        Returns
        -------
        A 1-D array holding, for every fragment pair, the length of each
        list returned by `compute_hydrogen_bonds`; None when the complex
        cannot be loaded.
        """
        try:
            fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)
        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        if self.reduce_to_contacts:
            fragments = reduce_molecular_complex_to_contacts(
                fragments, self.cutoff)

        pairwise_features = []
        for i, j in itertools.combinations(range(len(fragments)), 2):
            first = fragments[i]
            second = fragments[j]
            pair_distances = compute_pairwise_distances(first[0], second[0])
            # One single-element array per hydrogen-bond list, holding the
            # number of bonds found in it.
            counts = []
            for hbond_list in compute_hydrogen_bonds(first, second,
                                                     pair_distances,
                                                     self.distance_bins,
                                                     self.angle_cutoffs):
                counts.append(np.array([len(hbond_list)]))
            pairwise_features.append(np.concatenate(counts, axis=-1))

        # Each per-pair entry is a 1-D vector of counts; join them end to end.
        return np.concatenate(pairwise_features, axis=-1)
Пример #14
0
    def _featurize(self, datapoint, **kwargs):
        """Compute hashed ECFP contact fingerprints for a molecular complex.

        Parameters
        ----------
        datapoint: Tuple[str, str]
            Filenames for molecule and protein.
        **kwargs
            Only the deprecated `complex` keyword is inspected. When present
            it replaces `datapoint` and a `DeprecationWarning` is issued.

        Returns
        -------
        The concatenation of the fingerprint vectors produced for every
        pair of fragments, or None when the complex cannot be loaded.
        """
        if 'complex' in kwargs:
            # Function-scope import so the block does not depend on a
            # file-level `import warnings`.
            import warnings

            datapoint = kwargs.get("complex")
            # Warn rather than raise: raising made the assignment above dead
            # code and turned a "being phased out" notice into a hard error.
            warnings.warn(
                'Complex is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        try:
            fragments = load_complex(datapoint, add_hydrogens=False)

        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None
        pairwise_features = []
        # We compute pairwise contact fingerprints
        for (frag1, frag2) in itertools.combinations(fragments, 2):
            # Element 0 of a fragment holds its coordinates.
            distances = compute_pairwise_distances(frag1[0], frag2[0])
            # One hashed vector per ECFP dict returned for the pair.
            pairwise_features += [
                vectorize(hash_ecfp, feature_dict=ecfp_dict, size=self.size)
                for ecfp_dict in featurize_contacts_ecfp(
                    frag1,
                    frag2,
                    distances,
                    cutoff=self.cutoff,
                    ecfp_degree=self.radius)
            ]

        return np.concatenate(pairwise_features)
Пример #15
0
    def _featurize(self, complex):
        """Computes grid featurization of protein/ligand complex.

        Loads the ligand and the protein from their files, subtracts the
        ligand centroid from the coordinates of both, builds
        `self.nb_rotations` additional systems with `rotate_molecules`, and
        computes every feature listed in `self.feature_types` for each
        system.

        Parameters
        ----------
        complex: Tuple[str, str]
            Filenames for molecule and protein.

        Returns
        -------
        np.ndarray or None
            Squeezed array holding the features of every transformed system,
            or None when either molecule cannot be loaded.
        """
        try:
            mol_pdb_file, protein_pdb_file = complex
            time1 = time.time()

            protein_xyz, protein_rdk = load_molecule(protein_pdb_file,
                                                     calc_charges=True,
                                                     sanitize=self.sanitize)
            time2 = time.time()
            # Use lazy %-args. The original passed `self.verbose` as an extra
            # positional argument, which logging tried to interpolate into an
            # already-formatted message and reported as a logging error.
            logger.info("TIMING: Loading protein coordinates took %0.3f s",
                        time2 - time1)
            time1 = time.time()
            ligand_xyz, ligand_rdk = load_molecule(mol_pdb_file,
                                                   calc_charges=True,
                                                   sanitize=self.sanitize)
            time2 = time.time()
            logger.info("TIMING: Loading ligand coordinates took %0.3f s",
                        time2 - time1)
        except MoleculeLoadException:
            logger.warning(
                "Some molecules cannot be loaded by Rdkit. Skipping")
            return None

        time1 = time.time()
        centroid = compute_centroid(ligand_xyz)
        ligand_xyz = subtract_centroid(ligand_xyz, centroid)
        protein_xyz = subtract_centroid(protein_xyz, centroid)
        time2 = time.time()
        logger.info("TIMING: Centroid processing took %0.3f s", time2 - time1)

        # Computed once and reused for every transformed system.
        # NOTE(review): presumably valid because rotate_molecules applies the
        # same rigid rotation to both molecules - confirm.
        pairwise_distances = compute_pairwise_distances(
            protein_xyz, ligand_xyz)

        # Key (0, 0) is the unrotated system; (i + 1, 0) the i-th rotation.
        transformed_systems = {}
        transformed_systems[(0, 0)] = [protein_xyz, ligand_xyz]

        for i in range(self.nb_rotations):
            rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
            transformed_systems[(i + 1, 0)] = rotated_system

        features_dict = {}
        for system_id, (system_protein_xyz,
                        system_ligand_xyz) in transformed_systems.items():
            feature_arrays = []
            saw_feature_type = False
            for _is_flat, function_name in self.feature_types:
                saw_feature_type = True
                feature_arrays += self._compute_feature(
                    function_name,
                    system_protein_xyz,
                    protein_rdk,
                    system_ligand_xyz,
                    ligand_rdk,
                    pairwise_distances,
                )

            # With no feature types configured nothing is stored for this
            # system, exactly as before.
            if not saw_feature_type:
                continue

            # Concatenate once per system rather than after every feature
            # type; only the final concatenation was ever kept.
            if self.flatten:
                features_dict[system_id] = np.concatenate([
                    feature_array.flatten()
                    for feature_array in feature_arrays
                ])
            else:
                features_dict[system_id] = np.concatenate(feature_arrays,
                                                          axis=-1)

        # TODO(rbharath): Is this squeeze OK?
        features = np.squeeze(np.array(list(features_dict.values())))
        return features
Пример #16
0
    def _featurize(self, complex) -> Optional[np.ndarray]:
        """Voxelize pi-stacking interactions for a single mol/protein complex.

        Parameters
        ----------
        complex: Tuple[str, str]
            Filenames for molecule and protein.

        Returns
        -------
        For every fragment pair a parallel-stacking grid and a T-stacking
        grid, all concatenated along the last axis; None when the complex
        cannot be loaded.
        """
        try:
            fragments = rdkit_utils.load_complex(complex, add_hydrogens=False)

        except MoleculeLoadException:
            logger.warning(
                "This molecule cannot be loaded by Rdkit. Returning None")
            return None

        def _voxelize_counts(xyz, feature_dict):
            # Voxelize one fragment's per-atom dict into a single channel.
            return voxelize(convert_atom_to_voxel,
                            hash_function=None,
                            box_width=self.box_width,
                            voxel_width=self.voxel_width,
                            coordinates=xyz,
                            feature_dict=feature_dict,
                            nb_channel=1)

        pairwise_features = []
        centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
        for (frag1_ind,
             frag2_ind) in itertools.combinations(range(len(fragments)), 2):
            frag1, frag2 = fragments[frag1_ind], fragments[frag2_ind]
            distances = compute_pairwise_distances(frag1[0], frag2[0])
            frag1_xyz = subtract_centroid(frag1[0], centroid)
            frag2_xyz = subtract_centroid(frag2[0], centroid)
            xyzs = [frag1_xyz, frag2_xyz]
            # NOTE(review): assumes compute_pi_stack returns the two dicts of
            # its first molecule argument before those of its second, so the
            # first pair belongs to frag1 and the second to frag2 - confirm.
            # The original named them protein_*/ligand_* and paired
            # frag1_xyz with the first dict in the T-stacking branch.
            frag1_pi_t, frag1_pi_parallel, frag2_pi_t, frag2_pi_parallel = (
                compute_pi_stack(frag1[1],
                                 frag2[1],
                                 distances,
                                 dist_cutoff=self.cutoff,
                                 angle_cutoff=self.angle_cutoff))

            # Each fragment's coordinates are voxelized with that same
            # fragment's dict. The original zipped frag1's coordinates with
            # the second fragment's dict, so atom indices and coordinates
            # came from different molecules.
            pi_parallel_tensor = sum([
                _voxelize_counts(xyz, feature_dict)
                for (xyz, feature_dict
                     ) in zip(xyzs, [frag1_pi_parallel, frag2_pi_parallel])
            ])

            # The original ignored the loop variables here and voxelized
            # frag1 twice, so the second fragment never contributed.
            pi_t_tensor = sum([
                _voxelize_counts(xyz, feature_dict)
                for (xyz, feature_dict) in zip(xyzs, [frag1_pi_t, frag2_pi_t])
            ])

            pairwise_features.append(
                np.concatenate([pi_parallel_tensor, pi_t_tensor], axis=-1))
        # Features are of shape (voxels_per_edge, voxels_per_edge, voxels_per_edge, 2) so we should concatenate on the last axis.
        return np.concatenate(pairwise_features, axis=-1)