Exemplo n.º 1
0
def smiles2adjoin(smiles, explicit_hydrogens=True, canonical_atom_order=False):
    """Turn a SMILES string into atom symbols plus an adjacency matrix.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.
    explicit_hydrogens : bool
        If True hydrogens are added with ``Chem.AddHs``; otherwise they are
        stripped with ``Chem.RemoveHs``.
    canonical_atom_order : bool
        If True the atoms are renumbered with the output of
        ``rdmolfiles.CanonicalRankAtoms`` before the matrix is built.

    Returns
    -------
    atoms_list : list of str
        Element symbol of every atom, in atom-index order.
    adjoin_matrix : numpy.ndarray
        ``(num_atoms, num_atoms)`` matrix holding 1.0 on the diagonal and at
        every bonded pair (filled symmetrically), 0.0 elsewhere.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit rejected the string: report it and retry on the output of
        # ``obsmitosmile`` before giving up.
        print('error')
        mol = Chem.MolFromSmiles(obsmitosmile(smiles))
        assert mol is not None, smiles + ' is not valid '

    mol = Chem.AddHs(mol) if explicit_hydrogens else Chem.RemoveHs(mol)

    if canonical_atom_order:
        # NOTE(review): CanonicalRankAtoms yields one rank per atom, while
        # RenumberAtoms is documented as taking an ordering -- confirm that
        # passing the ranks directly gives the intended atom order.
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    atom_count = mol.GetNumAtoms()
    atoms_list = [
        mol.GetAtomWithIdx(idx).GetSymbol() for idx in range(atom_count)
    ]

    # The identity start means every atom is adjacent to itself.
    adjoin_matrix = np.eye(atom_count)
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        adjoin_matrix[begin, end] = adjoin_matrix[end, begin] = 1.0
    return atoms_list, adjoin_matrix
Exemplo n.º 2
0
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize every SMILES string held in a numpy array.

    Each element of ``arr`` is parsed with RDKit, renumbered with the output
    of ``CanonicalRankAtoms`` when parsing succeeded, and handed to
    ``featurizer.featurize`` as a one-element list. Results whose ``size``
    is zero are dropped.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings (anything exposing ``tolist()``).
    featurizer : object
        Object with a ``featurize(list_of_mols)`` method returning an array.
    log_every_N : int
        A progress message is emitted through ``log`` every this many items.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    numpy.ndarray
        The kept features stacked, squeezed and flattened to one dimension.
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    per_sample = []
    for position, smiles in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        if position % log_every_N == 0:
            log("Featurizing sample %d" % position, verbose)
        per_sample.append(featurizer.featurize([mol]))

    # Samples the featurizer could not handle come back as empty arrays.
    kept = [feat for feat in per_sample if feat.size > 0]
    return np.squeeze(np.array(kept)).reshape(-1, )
Exemplo n.º 3
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES column of a dataframe, one compound at a time.

    Parameters
    ----------
    df : mapping of column name to column
        ``df[field]`` must expose ``tolist()`` and yield SMILES strings.
    featurizer : object
        Object with a ``featurize(list_of_mols)`` method returning an array.
    field : str
        Name of the column holding the SMILES strings.
    log_every_N : int
        A progress message is emitted through ``log`` every this many rows.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Features of the rows that produced a non-empty result, with axis 1
        squeezed away.
    valid_inds : numpy.ndarray of bool
        One flag per input row; True where the featurizer output was
        non-empty.
    """
    per_row = []
    for position, smiles in enumerate(df[field].tolist()):
        mol = Chem.MolFromSmiles(smiles)
        # TODO (ytz) this is a bandage solution to reorder the atoms so
        # that they're always in the same canonical order. Presumably this
        # should be correctly implemented in the future for graph mols.
        if mol:
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        if position % log_every_N == 0:
            log("Featurizing sample %d" % position, verbose)
        per_row.append(featurizer.featurize([mol]))

    valid_inds = np.array([feat.size > 0 for feat in per_row], dtype=bool)
    kept = [feat for feat, keep in zip(per_row, valid_inds) if keep]
    return np.squeeze(np.array(kept), axis=1), valid_inds
Exemplo n.º 4
0
def smile2graph(smile,
                add_self_loop=False,
                atom_featurizer=CanonicalAtomFeaturizer(),
                bond_featurizer=None):
    """Build a DGLGraph from a SMILES string.

    Node **i** of the returned graph is ``mol.GetAtomWithIdx(i)`` of the
    renumbered molecule.

    Bond **i**, i.e. ``mol.GetBondWithIdx(i)``, becomes edges **2i** and
    **2i+1**: the first runs from **u** to **v** and the second from **v**
    to **u**, where **u** is ``bond.GetBeginAtomIdx()`` and **v** is
    ``bond.GetEndAtomIdx()``.

    When self loops are requested they are appended last, one per atom
    ``0, 1, ..., n-1``.

    Parameters
    ----------
    smile : str
        String of SMILES
    add_self_loop : bool
        Whether to add self loops in DGLGraphs.
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to CanonicalAtomFeaturizer().
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for bonds in a molecule, which can be used to update
        edata for a DGLGraph.

    Returns
    -------
    DGLGraph
        Graph of the molecule, with any requested features attached.
    """
    mol = Chem.MolFromSmiles(smile)
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    graph = DGLGraph()
    graph.add_nodes(mol.GetNumAtoms())

    # Collect both directions of every bond, keeping bond-index order so the
    # edge numbering documented above holds.
    sources, targets = [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        sources += [begin, end]
        targets += [end, begin]
    graph.add_edges(sources, targets)

    if add_self_loop:
        every_node = graph.nodes()
        graph.add_edges(every_node, every_node)

    # Featurization
    if atom_featurizer is not None:
        graph.ndata.update(atom_featurizer(mol))
    if bond_featurizer is not None:
        graph.edata.update(bond_featurizer(mol))

    return graph
Exemplo n.º 5
0
def mol_to_graph(mol, graph_constructor, atom_featurizer, bond_featurizer):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    The molecule is always renumbered with the output of
    ``rdmolfiles.CanonicalRankAtoms`` before the graph is constructed.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Skipped when None.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for bonds in a molecule, which can be used to update
        edata for a DGLGraph. Skipped when None.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))
    graph = graph_constructor(mol)

    # Node features first, then edge features; each one is optional.
    for frame_name, featurizer in (("ndata", atom_featurizer),
                                   ("edata", bond_featurizer)):
        if featurizer is not None:
            getattr(graph, frame_name).update(featurizer(mol))

    return graph
Exemplo n.º 6
0
    def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
        """Calculate features for molecules.

        Parameters
        ----------
        molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
            RDKit Mol, or SMILES string or iterable sequence of RDKit
            mols/SMILES strings.
        log_every_n: int, default 1000
            Logging messages reported every `log_every_n` samples.

        Returns
        -------
        features: np.ndarray
            One entry per datapoint, in input order. A datapoint that could
            not be featurized contributes an empty array. When the entries
            do not share a shape the result is a 1-D object array holding
            the individual arrays.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import rdmolfiles
            from rdkit.Chem import rdmolops
            from rdkit.Chem.rdchem import Mol
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # Special case handling of single molecule
        if isinstance(molecules, (str, Mol)):
            molecules = [molecules]
        else:
            # Convert iterables to list
            molecules = list(molecules)

        features = []
        for i, datapoint in enumerate(molecules):
            if i % log_every_n == 0:
                logger.info("Featurizing datapoint %i" % i)

            mol = datapoint
            try:
                if isinstance(mol, str):
                    # mol must be a RDKit Mol object, so parse a SMILES
                    mol = Chem.MolFromSmiles(mol)
                    # SMILES is unique, so set a canonical order of atoms
                    new_order = rdmolfiles.CanonicalRankAtoms(mol)
                    mol = rdmolops.RenumberAtoms(mol, new_order)

                features.append(self._featurize(mol))
            except Exception as e:
                # Report the datapoint as the caller supplied it; after a
                # failed parse ``mol`` is None and says nothing useful.
                if isinstance(datapoint, str):
                    shown = datapoint
                elif isinstance(mol, Mol):
                    shown = Chem.MolToSmiles(mol)
                else:
                    shown = mol
                logger.warning(
                    "Failed to featurize datapoint %d, %s. Appending empty array",
                    i, shown)
                logger.warning("Exception message: {}".format(e))
                features.append(np.array([]))

        try:
            return np.asarray(features)
        except ValueError:
            # Recent NumPy releases refuse to build an array implicitly from
            # entries of different shapes (e.g. an empty failure placeholder
            # next to real features). Fall back to an object array so one
            # bad datapoint does not abort the whole batch.
            ragged = np.empty(len(features), dtype=object)
            for idx, feat in enumerate(features):
                ragged[idx] = feat
            return ragged
Exemplo n.º 7
0
def fingerprint_features(smile_string, radius=2, size=2048):
    """Return the Morgan fingerprint bit vector of a SMILES string.

    The parsed molecule is renumbered with the output of
    ``rdmolfiles.CanonicalRankAtoms`` before fingerprinting. ``radius`` is
    the Morgan radius and ``size`` the number of bits; chirality and bond
    types are taken into account, feature invariants are not.
    """
    parsed = MolFromSmiles(smile_string)
    reordered = rdmolops.RenumberAtoms(
        parsed, rdmolfiles.CanonicalRankAtoms(parsed))
    fingerprint_options = dict(
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        reordered, radius, **fingerprint_options)
Exemplo n.º 8
0
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Any kind of ``DGLGraph`` can be produced; the topology is entirely
    decided by ``graph_constructor``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Skipped when None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph.
        True calls ``Chem.AddHs``; False calls ``Chem.RemoveHs``.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    # Hydrogens are either made explicit or stripped -- never left as given.
    hydrogen_transform = Chem.AddHs if explicit_hydrogens else Chem.RemoveHs
    mol = hydrogen_transform(mol)

    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    graph = graph_constructor(mol)

    for frame_name, featurizer in (("ndata", node_featurizer),
                                   ("edata", edge_featurizer)):
        if featurizer is not None:
            getattr(graph, frame_name).update(featurizer(mol))

    return graph
Exemplo n.º 9
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute & add features for that compound to the
    features dataframe.

    Two paths exist. When the featurizer's class qualname contains
    ``'Comet'`` the SMILES are preprocessed in bulk by ``preprocess_df`` and
    passed to ``featurizer._featurize`` chunk by chunk. Otherwise every
    SMILES is parsed with RDKit and passed to ``featurizer.featurize`` on
    its own.

    Parameters
    ----------
    df : dataframe-like
        ``df[field]`` must expose ``tolist()``.
    featurizer : object
        Featurizer; which method is called depends on the path taken.
    field : str
        Column of ``df`` holding the SMILES strings.
    log_every_N : int
        Generic path only: a message is logged every this many samples.
    verbose : bool
        Generic path only: forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Features whose ``size`` is non-zero. On the generic path axis 1 is
        squeezed away; on the Comet path the array is returned as built.
    valid_inds : numpy.ndarray of bool
        One flag per computed feature; True where its ``size`` is non-zero.
    """
    sample_elems = df[field].tolist()

    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    # The featurizer is recognised by name rather than by type.
    if 'Comet' in str(featurizer.__class__.__qualname__):
        mols = preprocess_df(sample_elems, NUM_WORKERS)
        # len(mols) // BATCH_SIZE + 1 chunks, so there is always at least one.
        mols_chunks = np.array_split(mols, len(mols) // BATCH_SIZE + 1)
        for chunk in mols_chunks:
            # Each chunk entry is unpacked into three parallel sequences.
            # NOTE(review): presumably X holds atom features, A adjacency
            # matrices and L molecule lengths -- confirm in preprocess_df.
            X, A, L = list(zip(*chunk))
            X = np.array(X, dtype=np.uint8)
            A = np.array(A, dtype=np.float32)
            L = np.array(L, dtype=np.uint8)
            # The last entry of L is used as the crop size for the chunk.
            # NOTE(review): this is only the maximum if preprocess_df returns
            # entries sorted by ascending length -- TODO confirm.
            max_len = L[-1]
            X = X[:, :max_len, :]
            A = A[:, :max_len, :max_len]
            temp = featurizer._featurize((X, A))
            features += list(temp)

        # Entries with size 0 are treated as failures and filtered out.
        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.array(features), valid_inds

    else:
        for ind, elem in enumerate(sample_elems):
            mol = Chem.MolFromSmiles(elem)
            # TODO (ytz) this is a bandage solution to reorder the atoms so
            # that they're always in the same canonical order. Presumably this
            # should be correctly implemented in the future for graph mols.
            if mol:
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)
            if ind % log_every_N == 0:
                log("Featurizing sample %d" % ind, verbose)
            # mol may still be a falsy parse result here; it is passed on
            # unchanged and the featurizer decides what to return for it.
            features.append(featurizer.featurize([mol]))
        # Entries with size 0 are treated as failures and filtered out.
        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.squeeze(np.array(features), axis=1), valid_inds
Exemplo n.º 10
0
def fingerprint_features(smile_string, radius=2, size=256):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    The parsed molecule is renumbered with the output of
    ``rdmolfiles.CanonicalRankAtoms``, fingerprinted with the given Morgan
    ``radius`` into ``size`` bits (chirality and bond types on, feature
    invariants off), and the bit vector is written into an ``int8`` array
    through ``DataStructs.ConvertToNumpyArray``.
    """
    parsed = MolFromSmiles(smile_string)
    reordered = rdmolops.RenumberAtoms(
        parsed, rdmolfiles.CanonicalRankAtoms(parsed))

    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        reordered,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )

    # Created empty; the conversion call fills it in place.
    as_array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vector, as_array)
    return as_array
Exemplo n.º 11
0
def build_graph_from_molecule(mol, use_master_atom=False):
    """Encode a molecule as a node-feature matrix plus adjacency lists.

    Param:
        mol - rdkit.Chem.rdchem.Mol
        use_master_atom - bool. When True an extra "master" node is appended
                          whose features are the mean of all atom features,
                          and its index is added to every real atom's
                          adjacency list.
    Output:
        nodes - np.ndarray with one row per node (atoms first, then the
                master node if requested), rows produced by ``encode_atom``
        canon_adj_list - list. index corresponds to the index of node
                         and canon_adj_list[index] corresponds to indices
                         of the nodes that node i is connected to.
    Raises:
        TypeError - if ``mol`` is not an rdkit.Chem.rdchem.Mol
    """
    if not isinstance(mol, Chem.rdchem.Mol):
        raise TypeError("'mol' must be rdkit.Chem.rdchem.Mol obj")

    # Reorder the atoms so featurization does not depend on the input order
    # (same bandage as deepchem.data.data_loader featurize_smiles_df).
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    # Stack the per-atom encodings in ascending atom-index order.
    encoding_by_idx = {
        atom.GetIdx(): encode_atom(atom) for atom in mol.GetAtoms()
    }
    nodes = np.vstack([encoding_by_idx[idx] for idx in sorted(encoding_by_idx)])

    # Master atom is the "average" of all atoms that is connected to all atom
    # Introduced in https://arxiv.org/pdf/1704.01212.pdf
    if use_master_atom:
        master_row = np.mean(nodes, axis=0, keepdims=True)
        nodes = np.concatenate([nodes, master_row], axis=0)

    canon_adj_list = [[] for _ in range(len(nodes))]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if use_master_atom:
        # Only the real atoms point at the master node; the master node's
        # own list is left empty.
        master_index = len(nodes) - 1
        for neighbours in canon_adj_list[:master_index]:
            neighbours.append(master_index)

    return (nodes, canon_adj_list)
Exemplo n.º 12
0
    def featurize(self, molecules, log_every_n=1000):
        """Calculate features for molecules.

        Parameters
        ----------
        molecules: RDKit Mol / SMILES string /iterable
            RDKit Mol, or SMILES string or iterable sequence of RDKit
            mols/SMILES strings.
        log_every_n: int, default 1000
            Logging messages reported every `log_every_n` samples.

        Returns
        -------
        A numpy array containing a featurized representation of
        `datapoints`, one entry per datapoint in input order. A datapoint
        that could not be featurized contributes an empty array. When the
        entries do not share a shape the result is a 1-D object array
        holding the individual arrays.

        Raises
        ------
        ValueError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import rdmolfiles
            from rdkit.Chem import rdmolops
            from rdkit.Chem.rdchem import Mol
        except ModuleNotFoundError:
            raise ValueError("This class requires RDKit to be installed.")
        # Special case handling of single molecule
        if isinstance(molecules, (str, Mol)):
            molecules = [molecules]
        else:
            # Convert iterables to list
            molecules = list(molecules)
        features = []
        for i, mol in enumerate(molecules):
            if i % log_every_n == 0:
                logger.info("Featurizing datapoint %i" % i)
            try:
                # Process only case of SMILES strings.
                if isinstance(mol, str):
                    # mol must be a SMILES string so parse
                    mol = Chem.MolFromSmiles(mol)
                    # TODO (ytz) this is a bandage solution to reorder the atoms
                    # so that they're always in the same canonical order.
                    # Presumably this should be correctly implemented in the
                    # future for graph mols.
                    if mol:
                        new_order = rdmolfiles.CanonicalRankAtoms(mol)
                        mol = rdmolops.RenumberAtoms(mol, new_order)
                features.append(self._featurize(mol))
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C and
                # SystemExit still stop a long featurization run.
                logger.warning(
                    "Failed to featurize datapoint %d. Appending empty array",
                    i)
                features.append(np.array([]))

        try:
            return np.asarray(features)
        except ValueError:
            # Recent NumPy releases refuse to build an array implicitly from
            # entries of different shapes (e.g. an empty failure placeholder
            # next to real features). Fall back to an object array so one
            # bad datapoint does not abort the whole batch.
            ragged = np.empty(len(features), dtype=object)
            for idx, feat in enumerate(features):
                ragged[idx] = feat
            return ragged
Exemplo n.º 13
0
def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    """Featurize the SMILES column of a dataframe, one compound at a time.

    Private helper: every SMILES string is parsed with RDKit, renumbered
    with the output of ``CanonicalRankAtoms`` when parsing succeeded, and
    handed to ``featurizer.featurize`` as a one-element list.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame that holds SMILES strings
    featurizer: Featurizer
        A featurizer object
    field: str
        The name of a column in `df` that holds SMILES strings
    log_every_n: int, optional (default 1000)
        Emit a logging statement every `log_every_n` rows.

    Returns
    -------
    features: np.ndarray
        Features of the rows that produced a non-empty result, with axis 1
        squeezed away.
    valid_inds: np.ndarray of bool
        One flag per input row; True where the featurizer output was
        non-empty.

    Note
    ----
    This function requires RDKit to be installed
    """
    smiles_column = df[field].tolist()

    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    per_row = []
    for position, smiles in enumerate(smiles_column):
        mol = Chem.MolFromSmiles(smiles)
        # TODO (ytz) this is a bandage solution to reorder the atoms
        # so that they're always in the same canonical order.
        # Presumably this should be correctly implemented in the
        # future for graph mols.
        if mol:
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        if position % log_every_n == 0:
            logger.info("Featurizing sample %d" % position)
        per_row.append(featurizer.featurize([mol]))

    valid_inds = np.array([feat.size > 0 for feat in per_row], dtype=bool)
    kept = [feat for feat, keep in zip(per_row, valid_inds) if keep]
    return np.squeeze(np.array(kept), axis=1), valid_inds
Exemplo n.º 14
0
def featurize_smiles(arr):
    """Featurize an array of SMILES strings with ``dc.feat.ConvMolFeaturizer``.

    Each string is parsed with RDKit, renumbered with the output of
    ``CanonicalRankAtoms`` when parsing succeeded, and passed to the
    featurizer as a one-element list.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings (anything exposing ``tolist()``).

    Returns
    -------
    features : numpy.ndarray
        The non-empty results stacked, squeezed and flattened to 1-D.
    valid_inds : numpy.ndarray of bool
        One flag per input; True where the featurizer output was non-empty.
    """
    featurizer = dc.feat.ConvMolFeaturizer()

    per_sample = []
    for smiles in arr.tolist():
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        per_sample.append(featurizer([mol]))

    valid_inds = np.array([feat.size > 0 for feat in per_sample], dtype=bool)
    kept = [feat for feat, keep in zip(per_sample, valid_inds) if keep]
    return np.squeeze(np.array(kept)).reshape(-1, ), valid_inds
Exemplo n.º 15
0
def get_statistics(molset_fname):
    """Summarise a pickled set of RDKit molecules.

    The file is unpickled and its first element is taken as the iterable of
    molecules.

    Parameters
    ----------
    molset_fname : str
        Path of the pickle file.

    Returns
    -------
    list
        ``[num_distinct_atoms_dataset, num_distinct_atoms, num_bonds,
        num_rotatable_bonds, molecular_mass, contains_symmetric_pair]``
        where the first entry is the number of distinct element symbols in
        the whole set, the next four are ``(mean, std)`` tuples over the
        molecules, and the last is the fraction of molecules in which at
        least two atoms share a canonical rank when ties are not broken.
    """
    # NOTE: unpickling can execute arbitrary code -- only point this at
    # files from a trusted source.
    with open(molset_fname, 'rb') as handle:
        mols = pickle.load(handle)[0]

    def _mean_std(samples):
        # (mean, std) of a plain list of numbers.
        samples = np.array(samples)
        return (samples.mean(), samples.std())

    symbols_in_dataset = set()
    distinct_per_mol = []
    bonds_per_mol = []
    rotatable_per_mol = []
    mass_per_mol = []
    symmetry_flags = []

    for mol in mols:
        symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
        symbols_in_dataset.update(symbols)
        distinct_per_mol.append(len(set(symbols)))
        bonds_per_mol.append(len(mol.GetBonds()))
        rotatable_per_mol.append(Descriptors.NumRotatableBonds(mol))
        mass_per_mol.append(Descriptors.HeavyAtomMolWt(mol))
        # Without tie breaking, equal ranks are what flags a symmetric pair.
        ranks = rdmolfiles.CanonicalRankAtoms(mol, breakTies=False)
        symmetry_flags.append(int(len(set(ranks)) != len(ranks)))

    return [
        len(symbols_in_dataset),
        _mean_std(distinct_per_mol),
        _mean_std(bonds_per_mol),
        _mean_std(rotatable_per_mol),
        _mean_std(mass_per_mol),
        np.array(symmetry_flags).mean(),
    ]
Exemplo n.º 16
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe, returning torch tensors.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute features for every SMILES string found in
    ``df[field]``.

    Parameters
    ----------
    df : dataframe-like
        ``df[field]`` must expose ``tolist()`` and yield SMILES strings.
    featurizer : object
        Object with a ``featurize(list_of_mols)`` method returning an array.
    field : str
        Column of ``df`` holding the SMILES strings.
    log_every_N : int
        A progress message is emitted through ``log`` every this many rows.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : torch.Tensor
        float32 tensor of the non-empty features with dimension 1 squeezed;
        an empty tensor when no row produced features.
    valid_inds : torch.Tensor
        Boolean tensor with one flag per input row; True where the
        featurizer output was non-empty.
    """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    # torch.Tensor(...) is the legacy constructor and takes no dtype
    # argument; torch.tensor with torch.bool is the supported spelling.
    valid_flags = [elt.size > 0 for elt in features]
    valid_inds = torch.tensor(valid_flags, dtype=torch.bool)
    features = [elt for (is_valid, elt) in zip(valid_flags, features) if is_valid]

    if not features:
        # Nothing to stack, and squeezing dim 1 of a 1-D tensor would fail.
        return torch.empty(0), valid_inds

    stacked = torch.stack(
        [torch.as_tensor(elt, dtype=torch.float32) for elt in features])
    return torch.squeeze(stacked, dim=1), valid_inds
Exemplo n.º 17
0
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order):
    """Build a DGLGraph from an RDKit molecule and attach features to it.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Skipped when None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    graph = graph_constructor(mol)

    # Node features first, then edge features; each one is optional.
    for frame_name, featurizer in (("ndata", node_featurizer),
                                   ("edata", edge_featurizer)):
        if featurizer is not None:
            getattr(graph, frame_name).update(featurizer(mol))

    return graph
Exemplo n.º 18
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates. Row ``i`` must belong to
        atom ``i`` of ``mol`` as passed in (after hydrogens are added when
        ``explicit_hydrogens=True``); when ``canonical_atom_order=True`` the
        rows are permuted together with the atoms.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Raises
    ------
    AssertionError
        If the number of atoms differs from the number of coordinate rows, or
        if ``keep_dists=True`` and ``dist_field`` is already an edge feature.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
        # RenumberAtoms puts the original atom new_order[i] at index i.
        # Apply the same permutation to the coordinate rows; otherwise the
        # edges (computed from coordinates) and the node/edge features
        # (computed from the renumbered mol) would refer to different atoms.
        coordinates = coordinates[list(new_order)]

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        num_real_nodes = g.num_nodes()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology: connect every virtual node to every real node in
        # both directions.
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        # Append an indicator column: 1 for real nodes, 0 for the virtual
        # nodes that were added last.
        for nk, nv in g.ndata.items():
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[:-num_virtual_nodes, -1] = 1
            g.ndata[nk] = nv

        # Same for edges: the trailing 2 * num_real_nodes * num_virtual_nodes
        # edges are the virtual ones.
        for ek, ev in g.edata.items():
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[:-num_virtual_nodes * num_real_nodes * 2, -1] = 1
            g.edata[ek] = ev

    return g
Exemplo n.º 19
0
def mol_to_graph(mol,
                 graph_constructor,
                 node_featurizer,
                 edge_featurizer,
                 canonical_atom_order,
                 explicit_hydrogens,
                 num_virtual_nodes=0):
    """Build a featurized ``DGLGraph`` from an RDKit molecule.

    The topology comes from :attr:`graph_constructor`; this function takes
    care of the optional preprocessing of the molecule, of attaching the
    node/edge features and of appending virtual nodes.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder. ``None`` is treated as an invalid molecule.
    graph_constructor : callable
        Maps the (preprocessed) RDKit molecule to a DGLGraph.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Computes node features; the returned dict is merged into ``ndata``.
        Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Computes edge features; the returned dict is merged into ``edata``.
        Skipped when None.
    canonical_atom_order : bool
        If True, the atoms are renumbered with the ranking returned by
        ``rdmolfiles.CanonicalRankAtoms`` before the graph is constructed, so
        the node order may differ from the atom order of the input.
    explicit_hydrogens : bool
        If True, ``Chem.AddHs`` is applied to the molecule first.
    num_virtual_nodes : int
        Number of virtual nodes to append. Every virtual node is linked to
        every real node in both directions. Each existing node/edge feature
        gets one extra trailing column which is 1 for virtual nodes/edges and
        0 otherwise; the remaining entries of virtual nodes/edges are left as
        filled in by the graph when the nodes/edges are added. Default to 0.

    Returns
    -------
    DGLGraph or None
        The constructed graph, or None when :attr:`mol` is None.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Turn hydrogens into real atoms so that they show up as graph nodes.
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    # Nothing more to do unless virtual nodes were requested.
    if not num_virtual_nodes > 0:
        return g

    atom_count = g.num_nodes()
    atom_ids = list(range(atom_count))
    g.add_nodes(num_virtual_nodes)

    # For each virtual node, first the real -> virtual edges and then the
    # virtual -> real edges; the virtual edges end up last in the graph.
    edge_src, edge_dst = [], []
    for virtual_id in range(atom_count, atom_count + num_virtual_nodes):
        virtual_ids = [virtual_id] * atom_count
        edge_src += atom_ids + virtual_ids
        edge_dst += virtual_ids + atom_ids
    g.add_edges(edge_src, edge_dst)

    # Virtual nodes occupy the last rows of every node feature.
    for name, feats in g.ndata.items():
        is_virtual = torch.zeros(g.num_nodes(), 1)
        is_virtual[-num_virtual_nodes:] = 1
        g.ndata[name] = torch.cat([feats, is_virtual], dim=1)

    # Virtual edges occupy the last rows of every edge feature.
    virtual_edge_count = num_virtual_nodes * atom_count * 2
    for name, feats in g.edata.items():
        is_virtual = torch.zeros(g.num_edges(), 1)
        is_virtual[-virtual_edge_count:] = 1
        g.edata[name] = torch.cat([feats, is_virtual], dim=1)

    return g
Exemplo n.º 20
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist'):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.

    Returns
    -------
    g : DGLGraph
        Graph with one node per atom of the molecule and the edges returned
        by ``k_nearest_neighbors`` for :attr:`coordinates`.

    Raises
    ------
    AssertionError
        If ``keep_dists=True`` and :attr:`dist_field` is already a key of
        ``g.edata``, e.g. because the edge featurizer produced it.
    """
    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is
        # handed to ``k_nearest_neighbors`` exactly as given. If the caller's
        # coordinates follow the original atom order, node features and
        # neighbor edges may refer to different atoms -- confirm with callers.

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    num_atoms = mol.GetNumAtoms()
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The placeholder must be filled in, otherwise the message never
        # tells the user which field name is in conflict.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring ' \
            'atoms.'.format(dist_field)
        # One distance per edge, stored as a float column vector.
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
Exemplo n.º 21
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        ``Chem.AddHs`` is applied to the molecule, otherwise ``Chem.RemoveHs``.
        Default to False.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Raises
    ------
    AssertionError
        If the number of atoms (after adding/removing hydrogens) differs from
        ``coordinates.shape[0]``, or if ``keep_dists=True`` and
        :attr:`dist_field` is already a key of ``g.edata``.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    # The atom count is checked after hydrogens were added/removed, so the
    # coordinates must describe the molecule in that form.
    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is
        # handed to ``k_nearest_neighbors`` exactly as given. If the caller's
        # coordinates follow the original atom order, node features and
        # neighbor edges may refer to different atoms -- confirm with callers.

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The placeholder must be filled in, otherwise the message never
        # tells the user which field name is in conflict.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring ' \
            'atoms.'.format(dist_field)
        # One distance per edge, stored as a float column vector.
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
Exemplo n.º 22
0
def is_symmetric_chem(mol):
    """Report whether any two atoms of a molecule share a canonical rank.

    Parameters
    ----------
    mol : str or rdkit.Chem.rdchem.Mol
        Either a SMILES string, which is parsed with ``Chem.MolFromSmiles``,
        or an RDKit molecule, which is used as is.

    Returns
    -------
    bool
        True if ``rdmolfiles.CanonicalRankAtoms`` called with
        ``breakTies=False`` assigns the same rank to at least two atoms,
        and False if all ranks are distinct.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise reach RDKit unparsed.
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    ranks = list(rdmolfiles.CanonicalRankAtoms(mol, breakTies=False))
    # A duplicated rank makes the set smaller than the list.
    return len(ranks) != len(set(ranks))