예제 #1
0
  def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Node features come from `self._pagtn_atom_featurizer` (one row per atom)
    and the edge index / edge features from `self._pagtn_edge_featurizer`.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
      RDKit mol object.
    **kwargs
      Only the deprecated ``mol`` keyword is inspected. When it is given it
      replaces `datapoint` and a `DeprecationWarning` is emitted.

    Returns
    -------
    graph: GraphData
      A molecule graph with some features.
    """
    if 'mol' in kwargs:
      import warnings
      datapoint = kwargs.get("mol")
      # Warn rather than raise: raising here made the assignment above dead
      # code and turned a still-supported (deprecated) call into a failure.
      warnings.warn(
          'Mol is being phased out as a parameter, please pass "datapoint" instead.',
          DeprecationWarning,
          stacklevel=2)

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same
    # dtype (float64).
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in datapoint.GetAtoms()],
        dtype=float)
    edge_index, edge_features = self._pagtn_edge_featurizer(datapoint)
    return GraphData(node_features, edge_index, edge_features)
예제 #2
0
def convert_protein_to_pdbqt(mol: RDKitMol, outfile: str) -> None:
    """Convert a protein PDB file into a pdbqt file, rewriting it in place.

    The PDB text already stored at `outfile` is read, filtered and written
    back to the same path:

    * lines containing ``ROOT`` (this also matches ``ENDROOT``) or
      ``TORSDOF`` are kept, stripped of surrounding whitespace;
    * lines starting with ``ATOM`` are truncated to 66 characters and get a
      ``+0.000`` charge field plus the atom's element symbol appended;
    * every other line is dropped.

    Parameters
    ----------
    mol: RDKit Mol
        Protein molecule. The 1-based serial number read from characters
        7-11 of each ``ATOM`` line is used to look up the atom in
        ``mol.GetAtoms()``.
    outfile: str
        Filename which already has a valid pdb representation of mol.
    """
    # Read through a context manager so the handle is closed before the same
    # path is reopened for writing.
    with open(outfile) as fin:
        lines = [raw.strip() for raw in fin]

    atoms = mol.GetAtoms()  # loop-invariant, fetch once
    out_lines = []
    for line in lines:
        # "ROOT" is a substring of "ENDROOT", so one test covers both.
        if "ROOT" in line or "TORSDOF" in line:
            out_lines.append("%s\n" % line)
            continue
        if not line.startswith("ATOM"):
            continue
        line = line[:66]
        atom_index = int(line[6:11])
        # Serial numbers in the file are 1-based.
        atom = atoms[atom_index - 1]
        out_lines.append("%s    +0.000 %s\n" %
                         (line, atom.GetSymbol().ljust(2)))

    with open(outfile, 'w') as fout:
        fout.writelines(out_lines)
예제 #3
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Calculate symmetry function.

        Each row holds an atom's atomic number followed by the per-atom
        values returned by `self.coordfeat._featurize`; the result is padded
        with `pad_array` to `(self.max_atoms, 4)`.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit Mol object
        **kwargs
          Only the deprecated ``mol`` keyword is inspected. When it is given
          it replaces `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        np.ndarray
          A numpy array of symmetry function. The shape is `(max_atoms, 4)`.
        """
        if 'mol' in kwargs:
            import warnings
            datapoint = kwargs.get("mol")
            # Warn rather than raise: raising here made the assignment above
            # dead code and broke the deprecated-but-supported call path.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)
        coordinates = self.coordfeat._featurize(datapoint)
        atom_numbers = np.array(
            [atom.GetAtomicNum() for atom in datapoint.GetAtoms()])
        # Turn the flat vector into a column so it can sit next to the
        # coordinate columns.
        atom_numbers = np.expand_dims(atom_numbers, axis=1)
        assert atom_numbers.shape[0] == coordinates.shape[0]
        features = np.concatenate([atom_numbers, coordinates], axis=1)
        return pad_array(features, (self.max_atoms, 4))
예제 #4
0
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
  """Helper method which finds atom pairs within max_pair_distance graph distance.

  This helper method is used to find atoms which are within max_pair_distance
  graph_distance of one another. It relies on the fact that the powers of an
  adjacency matrix encode path connectivity information: if `adj` is the
  adjacency matrix, then `adj**k` has a nonzero value at `(i, j)` if and only
  if there is a walk of `k` edges between `i` and `j`. The pairs within
  `max_pair_distance` of each other are therefore the nonzero positions of
  `I + adj + adj**2 + ... + adj**max_pair_distance`.

  Only the zero/nonzero pattern matters, so the powers are tracked as boolean
  matrices. Walk counts grow exponentially with the power and would overflow
  a fixed-width integer matrix for larger molecules, after which wrapped
  values could cancel to zero and drop valid pairs.

  Parameters
  ----------
  mol: rdkit.Chem.rdchem.Mol
    RDKit molecules
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Returns
  -------
  np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of pairs
    within `max_pair_distance` of one another. Self-pairs `(i, i)` are
    included, and pairs are listed in row-major order.

  Raises
  ------
  ValueError
    If `max_pair_distance` is an integer that is zero or negative (and
    smaller than the number of atoms).
  """
  from rdkit.Chem import rdmolops
  N = len(mol.GetAtoms())
  if max_pair_distance is None or max_pair_distance >= N:
    # No shortest path can be longer than the number of atoms.
    max_distance = N
  elif max_pair_distance <= 0:
    raise ValueError(
        "max_pair_distance must either be a positive integer or None")
  else:
    max_distance = max_pair_distance

  # 0/1 step matrix; int64 products are bounded by N, so they cannot overflow.
  step = (rdmolops.GetAdjacencyMatrix(mol) != 0).astype(np.int64)
  # Handle edge case of self-pairs (i, i)
  within = np.eye(N, dtype=bool)
  walks = within
  for _ in range(max_distance):
    # walks[i, j] is True iff some walk with one more edge joins i and j.
    walks = (walks.astype(np.int64) @ step) > 0
    grown = within | walks
    if np.array_equal(grown, within):
      # No pair at this distance means no pair at any larger distance.
      break
    within = grown

  # Stack the row and column indices into a matrix of shape (2, num_pairs)
  pair_edges = np.array(np.nonzero(within))
  return pair_edges
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Every bond contributes two directed edges (begin -> end and
        end -> begin). Edge features are only built when `self.use_edges`
        is set, with one identical feature row per direction.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object.

        Returns
        -------
        graph: GraphData
          A molecule graph with some features. `edge_features` is None when
          `self.use_edges` is falsy.

        Raises
        ------
        ImportError
          If partial charges must be computed and RDKit cannot be imported.
        """
        if self.use_partial_charge:
            try:
                mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except Exception:
                # If partial charges were not computed. `Exception` rather
                # than a bare except, so KeyboardInterrupt and SystemExit
                # still propagate.
                try:
                    from rdkit.Chem import AllChem
                    AllChem.ComputeGasteigerCharges(mol)
                except ModuleNotFoundError as err:
                    raise ImportError(
                        "This class requires RDKit to be installed."
                    ) from err

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(mol)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index and, optionally, edge (bond) features
        # in a single pass over the bonds
        src, dest = [], []
        features = [] if self.use_edges else None
        for bond in mol.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            if features is not None:
                # one row per direction, matching the two entries added above
                features += 2 * [_construct_bond_feature(bond)]

        bond_features = None  # default None
        if features is not None:
            bond_features = np.asarray(features, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Gasteiger charges are computed on `mol` if atom 0 does not already
        carry a ``_GasteigerCharge`` property. Every bond contributes two
        directed edges. When `self.add_self_edges` is set, one `(i, i)` edge
        per atom is appended with an all-zero feature row.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object.

        Returns
        -------
        graph: GraphData
          A molecule graph with some features.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        # construct atom and bond features
        try:
            mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # If partial charges were not computed. `Exception` rather than a
            # bare except, so KeyboardInterrupt and SystemExit still propagate.
            AllChem.ComputeGasteigerCharges(mol)

        h_bond_infos = construct_hydrogen_bonding_info(mol)
        sssr = Chem.GetSymmSSSR(mol)

        # construct atom (node) feature
        # (`np.float` / `np.int` were removed in NumPy 1.24; the builtins are
        # the same dtypes.)
        atom_features = np.array(
            [
                _construct_atom_feature(atom, h_bond_infos, sssr)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) information
        src, dest, bond_features = [], [], []
        for bond in mol.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            bond_features += 2 * [_construct_bond_feature(bond)]

        if self.add_self_edges:
            num_atoms = mol.GetNumAtoms()
            self_loops = list(range(num_atoms))
            src += self_loops
            dest += self_loops
            # add dummy edge features
            # NOTE(review): the feature width is read from the first real
            # bond, so a molecule without bonds raises IndexError here.
            bond_fea_length = len(bond_features[0])
            bond_features += num_atoms * [[0] * bond_fea_length]

        return GraphData(node_features=atom_features,
                         edge_index=np.array([src, dest], dtype=int),
                         edge_features=np.array(bond_features, dtype=float))
예제 #7
0
    def _featurize(self, datapoint: RDKitMol,
                   **kwargs) -> Optional[GraphMatrix]:
        """Calculate adjacency matrix and nodes features for RDKitMol.

        Only bond types (via `self.bond_encoder`) and atomic numbers (via
        `self.atom_encoder`) are encoded, so chirality and charges are not
        represented. Both arrays are sized by `self.max_atom_count`; unused
        node slots are filled with 0.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit mol object. When `self.kekulize` is set, `Chem.Kekulize` is
          called on this object directly (no copy is made).
        **kwargs
          Only the deprecated ``mol`` keyword is inspected. When it is given
          it replaces `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        graph: GraphMatrix
          A molecule graph with some features, or None when at least one atom
          has an all-zero row in the adjacency matrix (e.g. an atom without
          bonds).

        Raises
        ------
        ImportError
          If RDKit is not installed.
        """

        try:
            from rdkit import Chem
        except ModuleNotFoundError as err:
            raise ImportError(
                "This method requires RDKit to be installed.") from err
        if 'mol' in kwargs:
            import warnings
            datapoint = kwargs.get("mol")
            # Warn rather than raise: raising here made the assignment above
            # dead code and broke the deprecated-but-supported call path.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        if self.kekulize:
            Chem.Kekulize(datapoint)

        A = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                     dtype=np.float32)
        bonds = datapoint.GetBonds()

        begin, end = [b.GetBeginAtomIdx()
                      for b in bonds], [b.GetEndAtomIdx() for b in bonds]
        bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds]

        # Write every bond in both directions so the matrix is symmetric.
        A[begin, end] = bond_type
        A[end, begin] = bond_type

        # Row sums over the real (non-padding) atoms only.
        num_atoms = datapoint.GetNumAtoms()
        degree = np.sum(A[:num_atoms, :num_atoms], axis=-1)
        X = np.array(
            [
                self.atom_encoder[atom.GetAtomicNum()]
                for atom in datapoint.GetAtoms()
            ] + [0] * (self.max_atom_count - num_atoms),
            dtype=np.int32,
        )
        graph = GraphMatrix(A, X)

        return graph if (degree > 0).all() else None
예제 #8
0
  def construct_node_features_matrix(self, mol: RDKitMol) -> np.ndarray:
    """
    Stack the output of `self.atom_features` for every atom of a molecule.

    Parameters
    ----------
    mol: RDKitMol
      RDKit Mol object.

    Returns
    -------
    Atom_features: ndarray
      Numpy array with one entry per atom, in the order given by
      `mol.GetAtoms()`.
    """
    per_atom_rows = []
    for atom in mol.GetAtoms():
      per_atom_rows.append(self.atom_features(atom))
    return np.array(per_atom_rows)
예제 #9
0
    def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
        """Build the padded adjacency matrix and node vector of a molecule.

        Bonds are encoded through `self.bond_encoder` and atoms through
        `self.atom_encoder` (keyed by atomic number), so chirality and
        charges are not represented.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object. When `self.kekulize` is set, `Chem.Kekulize` is
          called on this object directly.

        Returns
        -------
        graph: GraphMatrix
          A molecule graph with some features, or None when at least one atom
          has an all-zero row in the adjacency matrix.
        """

        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ImportError("This method requires RDKit to be installed.")

        if self.kekulize:
            Chem.Kekulize(mol)

        size = self.max_atom_count
        n_atoms = mol.GetNumAtoms()
        adjacency = np.zeros(shape=(size, size), dtype=np.float32)

        # Collect begin index, end index and encoded type of each bond.
        rows, cols, codes = [], [], []
        for bond in mol.GetBonds():
            rows.append(bond.GetBeginAtomIdx())
            cols.append(bond.GetEndAtomIdx())
            codes.append(self.bond_encoder[bond.GetBondType()])

        # Fill both triangles so the matrix is symmetric.
        adjacency[rows, cols] = codes
        adjacency[cols, rows] = codes

        # Row sums restricted to the real (non-padding) atoms.
        degree = adjacency[:n_atoms, :n_atoms].sum(axis=-1)

        node_codes = [
            self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
        ]
        padding = [0] * (size - n_atoms)
        nodes = np.array(node_codes + padding, dtype=np.int32)
        graph = GraphMatrix(adjacency, nodes)

        if (degree > 0).all():
            return graph
        return None
예제 #10
0
    def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
        """Generate Coulomb matrices for each conformer of the given molecule.

        Off-diagonal entries are `z_i * z_j / d_ij`, with `d` taken from
        `self.get_interatomic_distances`; diagonal entries are
        `0.5 * z_i ** 2.4`. Each matrix is padded with `pad_array` using
        `self.max_atoms`. With `self.randomize` set, every matrix yielded by
        `self.randomize_coulomb_matrix` is padded and stored instead.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit Mol object. If it has no conformer, hydrogens are added and
          one conformer is embedded with ETKDG first.

        Returns
        -------
        np.ndarray
          The coulomb matrices of the given molecule
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # A molecule without any conformer needs 3D coordinates first.
        if len(mol.GetConformers()) == 0:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol, AllChem.ETKDG())

        if self.remove_hydrogens:
            mol = Chem.RemoveHs(mol)

        atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        diagonal = range(mol.GetNumAtoms())

        matrices = []
        for conformer in mol.GetConformers():
            distances = self.get_interatomic_distances(conformer)
            coulomb = np.outer(atomic_nums, atomic_nums) / distances
            coulomb[diagonal, diagonal] = 0.5 * np.array(atomic_nums)**2.4
            if not self.randomize:
                matrices.append(pad_array(coulomb, self.max_atoms))
                continue
            for shuffled in self.randomize_coulomb_matrix(coulomb):
                matrices.append(pad_array(shuffled, self.max_atoms))
        return np.asarray(matrices)
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Node features come from `self._pagtn_atom_featurizer` (one row per
        atom) and the edge index / edge features from
        `self._pagtn_edge_featurizer`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object.

        Returns
        -------
        graph: GraphData
          A molecule graph with some features.
        """
        # `np.float` was removed in NumPy 1.24; the builtin `float` is the
        # same dtype (float64).
        node_features = np.asarray(
            [self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
            dtype=float)
        edge_index, edge_features = self._pagtn_edge_featurizer(mol)
        return GraphData(node_features, edge_index, edge_features)
예제 #12
0
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Calculate symmetry function.

    Each row holds an atom's atomic number followed by the per-atom values
    returned by `self.coordfeat._featurize`.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      A numpy array of symmetry function. The shape is `(max_atoms, 4)`.
    """
    coords = self.coordfeat._featurize(mol)
    # Atomic numbers as a single column, one row per atom.
    z_column = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()
                        ])[:, np.newaxis]
    assert z_column.shape[0] == coords.shape[0]
    stacked = np.concatenate((z_column, coords), axis=1)
    target_shape = (self.max_atoms, 4)
    return pad_array(stacked, target_shape)
    def _featurize(self, mol: RDKitMol) -> GraphMatrix:
        """Build the padded adjacency matrix and node vector of a molecule.

        Bonds are encoded through `self.bond_encoder` and atoms through
        `self.atom_encoder` (keyed by atomic number). Both arrays are sized
        by `self.max_atom_count`; unused node slots are filled with 0.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object. When `self.kekulize` is set, `Chem.Kekulize` is
          called on this object directly.
        Returns
        -------
        graph: GraphMatrix
          A molecule graph with some features. Note that None is returned
          instead when at least one atom has an all-zero row in the
          adjacency matrix.
        """
        if self.kekulize:
            Chem.Kekulize(mol)

        atom_count = mol.GetNumAtoms()
        capacity = self.max_atom_count
        A = np.zeros(shape=(capacity, capacity), dtype=np.float32)

        # One (begin, end, encoded type) triple per bond.
        triples = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(),
                    self.bond_encoder[bond.GetBondType()])
                   for bond in mol.GetBonds()]
        begin = [triple[0] for triple in triples]
        end = [triple[1] for triple in triples]
        bond_type = [triple[2] for triple in triples]

        # Mirror each bond so the matrix is symmetric.
        A[begin, end] = bond_type
        A[end, begin] = bond_type

        # Row sums over the real (non-padding) atoms only.
        degree = np.sum(A[:atom_count, :atom_count], axis=-1)

        encoded_atoms = []
        for atom in mol.GetAtoms():
            encoded_atoms.append(self.atom_encoder[atom.GetAtomicNum()])
        encoded_atoms.extend([0] * (capacity - atom_count))
        X = np.array(encoded_atoms, dtype=np.int32)

        graph = GraphMatrix(A, X)
        if not (degree > 0).all():
            return None
        return graph
예제 #14
0
def compute_all_ecfp(mol: RDKitMol,
                     indices: Optional[Set[int]] = None,
                     degree: int = 2) -> Dict[int, str]:
    """Obtain molecular fragment for all atoms emanating outward to given degree.

    For each selected atom, the environment of radius `degree` around it is
    extracted as a sub-molecule and converted to a SMILES string. The value
    stored for the atom is the string ``"<atomic number>,<fragment SMILES>"``;
    no hashing is applied.

    Parameters
    ----------
    mol: rdkit Molecule
        Molecule to compute ecfp fragments on
    indices: Optional[Set[int]]
        Atom indices to compute fragments for. Default (None) is all atoms
        of the molecule.
    degree: int
        Graph degree to use when computing ECFP fingerprints

    Returns
    -------
    dict
        Dictionary mapping atom index to its
        ``"<atomic number>,<fragment SMILES>"`` string.
    """
    from rdkit import Chem

    ecfp_dict = {}
    for i in range(mol.GetNumAtoms()):
        if indices is not None and i not in indices:
            continue
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True)
        submol = Chem.PathToSubmol(mol, env)
        smile = Chem.MolToSmiles(submol)
        # Direct lookup instead of rebuilding the atom sequence every pass.
        atomic_num = mol.GetAtomWithIdx(i).GetAtomicNum()
        ecfp_dict[i] = "%s,%s" % (atomic_num, smile)

    return ecfp_dict
예제 #15
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Every bond contributes two directed edges (begin -> end and
        end -> begin). Edge features are only built when `self.use_edges`
        is set, with one identical feature row per direction.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit mol object. Must contain more than one atom.
        **kwargs
          Only the deprecated ``mol`` keyword is inspected. When it is given
          it replaces `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        graph: GraphData
          A molecule graph with some features. `edge_features` is None when
          `self.use_edges` is falsy.

        Raises
        ------
        AssertionError
          If the molecule has fewer than two atoms.
        ImportError
          If partial charges must be computed and RDKit cannot be imported.
        """
        if 'mol' in kwargs:
            import warnings
            datapoint = kwargs.get("mol")
            # Warn rather than raise: raising here made the assignment above
            # dead code and broke the deprecated-but-supported call path.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        # Checked after the override above so the molecule that is actually
        # featurized is the one being validated.
        assert datapoint.GetNumAtoms(
        ) > 1, "More than one atom should be present in the molecule for this featurizer to work."

        if self.use_partial_charge:
            try:
                datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except Exception:
                # If partial charges were not computed. `Exception` rather
                # than a bare except, so KeyboardInterrupt and SystemExit
                # still propagate.
                try:
                    from rdkit.Chem import AllChem
                    AllChem.ComputeGasteigerCharges(datapoint)
                except ModuleNotFoundError as err:
                    raise ImportError(
                        "This class requires RDKit to be installed."
                    ) from err

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(datapoint)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in datapoint.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index and, optionally, edge (bond) features
        # in a single pass over the bonds
        src, dest = [], []
        features = [] if self.use_edges else None
        for bond in datapoint.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            if features is not None:
                # one row per direction, matching the two entries added above
                features += 2 * [_construct_bond_feature(bond)]

        bond_features = None  # default None
        if features is not None:
            bond_features = np.asarray(features, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)