Exemplo n.º 1
0
def graph_from_smiles(smiles):
    """Build a `MolGraph` describing the molecule encoded by a SMILES string.

    The graph receives one 'atom' node per atom (carrying `atom_features`
    and the RDKit atom index), one 'bond' node per bond (carrying
    `bond_features`), and a single 'molecule' node that is given every atom
    node as a neighbor.

    Args:
        smiles: SMILES text passed straight to `MolFromSmiles`.

    Returns:
        The populated `MolGraph`.

    Raises:
        ValueError: if `MolFromSmiles` returns a falsy value. The exception
            carries the message and the offending input as two separate args.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # NOTE(review): an alternative that parsed with sanitize=False and then
    # ran a selective Chem.SanitizeMol pass was tried and disabled; the
    # default (fully sanitizing) parse is what is in effect.
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # Map RDKit atom index -> graph node so bonds can find their endpoints.
    node_for_rdkit_idx = {}
    for rd_atom in mol.GetAtoms():
        rd_idx = rd_atom.GetIdx()
        node_for_rdkit_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(rd_atom), rdkit_ix=rd_idx)

    for rd_bond in mol.GetBonds():
        begin_node = node_for_rdkit_idx[rd_bond.GetBeginAtom().GetIdx()]
        end_node = node_for_rdkit_idx[rd_bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(rd_bond))
        bond_node.add_neighbors((begin_node, end_node))
        # Only the begin atom is explicitly handed the end atom here; whether
        # the reverse link is also recorded depends on add_neighbors, which
        # is defined outside this function.
        begin_node.add_neighbors((end_node, ))

    graph.new_node('molecule').add_neighbors(graph.nodes['atom'])
    return graph
Exemplo n.º 2
0
def graph_from_smiles(smiles):
    """Build a `MolGraph` from a SMILES string wrapped in a container.

    The SMILES text is unwrapped first: when `smiles` is exactly a NumPy
    ndarray its first element is used, otherwise the value is read from
    `smiles._data[0][0]`. The resulting string is parsed and turned into
    'atom', 'bond' and 'molecule' nodes.

    Args:
        smiles: either a NumPy array whose first element is the SMILES
            string, or an object exposing `_data[0][0]`.
            NOTE(review): which non-ndarray container is expected cannot be
            told from this function - confirm against the caller.

    Returns:
        The populated `MolGraph`.

    Raises:
        ValueError: if `MolFromSmiles` returns a falsy value. The exception
            carries the message and the unwrapped string as two separate args.
    """
    graph = MolGraph()

    # Exact type comparison (not isinstance): ndarray subclasses take the
    # `_data` branch.
    if type(smiles) is type(np.array(1)):
        str_smiles = smiles[0]
    else:
        str_smiles = smiles._data[0][0]

    mol = MolFromSmiles(str_smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", str_smiles)

    # RDKit atom index -> graph node, used to resolve bond endpoints below.
    atom_nodes = {
        atom.GetIdx(): graph.new_node('atom',
                                      features=atom_features(atom),
                                      rdkit_ix=atom.GetIdx())
        for atom in mol.GetAtoms()
    }

    for bond in mol.GetBonds():
        source = atom_nodes[bond.GetBeginAtom().GetIdx()]
        target = atom_nodes[bond.GetEndAtom().GetIdx()]
        link = graph.new_node('bond', features=bond_features(bond))
        link.add_neighbors((source, target))
        source.add_neighbors((target, ))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
Exemplo n.º 3
0
def graph_from_smiles(smiles):
    """Translate a SMILES string into a `MolGraph`.

    Every atom becomes an "atom" node (with `atom_features` and its RDKit
    index), every bond becomes a "bond" node (with `bond_features`) attached
    to its two atom nodes, and one "molecule" node is attached to all atoms.

    Args:
        smiles: SMILES text passed to `MolFromSmiles`.

    Returns:
        The populated `MolGraph`.

    Raises:
        ValueError: if `MolFromSmiles` returns a falsy value. The exception
            carries the message and the input as two separate args.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    atom_nodes = {}

    def node_at(rd_atom):
        # Resolve an RDKit atom to the graph node created for it.
        return atom_nodes[rd_atom.GetIdx()]

    for atom in mol.GetAtoms():
        rdkit_ix = atom.GetIdx()
        atom_nodes[rdkit_ix] = graph.new_node("atom", features=atom_features(atom), rdkit_ix=rdkit_ix)

    for bond in mol.GetBonds():
        head = node_at(bond.GetBeginAtom())
        tail = node_at(bond.GetEndAtom())
        bond_node = graph.new_node("bond", features=bond_features(bond))
        bond_node.add_neighbors((head, tail))
        head.add_neighbors((tail,))

    whole_molecule = graph.new_node("molecule")
    whole_molecule.add_neighbors(graph.nodes["atom"])
    return graph
Exemplo n.º 4
0
def graph_from_amino_acids(sequence):
    """Build a `MolGraph` from a sequence string parsed by `MolFromSequence`.

    The parsed molecule is expanded into 'atom' nodes (with `atom_features`
    and the RDKit atom index), 'bond' nodes (with `bond_features`) linked to
    both of their atoms, and one 'molecule' node linked to all atom nodes.

    Args:
        sequence: text handed to `MolFromSequence`.

    Returns:
        The populated `MolGraph`.

    Raises:
        ValueError: if `MolFromSequence` returns a falsy value. The exception
            carries the message and the input as two separate args.
    """
    graph = MolGraph()
    mol = MolFromSequence(sequence)
    if not mol:
        raise ValueError("Could not parse input string:", sequence)

    # RDKit atom index -> graph node.
    nodes_by_index = dict(
        (atom.GetIdx(),
         graph.new_node('atom', features=atom_features(atom), rdkit_ix=atom.GetIdx()))
        for atom in mol.GetAtoms()
    )

    for bond in mol.GetBonds():
        first, second = (
            nodes_by_index[bond.GetBeginAtom().GetIdx()],
            nodes_by_index[bond.GetEndAtom().GetIdx()],
        )
        connector = graph.new_node('bond', features=bond_features(bond))
        connector.add_neighbors((first, second))
        first.add_neighbors((second,))

    root = graph.new_node('molecule')
    root.add_neighbors(graph.nodes['atom'])
    return graph
Exemplo n.º 5
0
    def convertDataToGraph(self, data):
        """Turn rows of molecules into [target, graph-record] pairs.

        For each row, every atom of `row['mol']` gets a feature vector made
        of `atom_features(atom)` followed by the element-wise sum of
        `bond_features` over the atom's bonds, plus a list of the indices of
        its bonded neighbors.

        Args:
            data: iterable of mappings with a 'mol' entry (an object offering
                `GetAtoms()`, whose atoms offer `GetBonds()`/`GetIdx()`) and
                a 'pce' entry convertible with `float`.

        Returns:
            `np.array` built from a list of
            `[float(pce), {'graphList': [...], 'atomsFeatures': ndarray}]`
            entries, one per row.

        NOTE(review): the bond accumulator has a fixed length of 6, and each
        row of the feature matrix has `self.featureSize` columns - this
        presumably requires `len(atom_features) + 6 == self.featureSize`;
        confirm against the feature definitions.
        """
        records = []
        for row in data:
            molecule = row['mol']
            atoms = molecule.GetAtoms()
            adjacency = []
            feature_matrix = np.zeros((len(atoms), self.featureSize))

            for atom in atoms:
                atom_vec = atom_features(atom)
                bond_sum = np.zeros(6)
                neighbor_ids = []

                for bond in atom.GetBonds():
                    # The neighbor is whichever end of the bond is not `atom`.
                    begin = bond.GetBeginAtom()
                    other = bond.GetEndAtom() if begin.GetIdx() == atom.GetIdx() else begin
                    neighbor_ids.append(other.GetIdx())
                    # In-place accumulation keeps the float accumulator dtype.
                    bond_sum += bond_features(bond)

                feature_matrix[atom.GetIdx()] = np.concatenate((atom_vec, bond_sum), axis=0)
                adjacency.append({'idx': atom.GetIdx(), 'neighbor': np.array(neighbor_ids)})

            records.append([float(row['pce']), {'graphList': adjacency, 'atomsFeatures': feature_matrix}])

        return np.array(records)
def tensorize_smiles_job(smiles, max_degree=5, max_atoms=None):
    '''Takes a list of smiles and turns the graphs in tensor representation.

    # Arguments:
        smiles: a list (or other sized iterable) of smiles representations
        max_degree: the maximum number of neighbours per atom that each
            molecule can have (to which all molecules will be padded), use
            `None` for auto
        max_atoms: the maximum number of atoms per molecule (to which all
            molecules will be padded), use `None` for auto

        **NOTE**: It is not recommended to set max_degree to `None`/auto when
            using `NeuralGraph` layers. Max_degree determines the number of
            trainable parameters and is essentially a hyperparameter.
            While models can be rebuilt using different `max_atoms`, they cannot
            be rebuilt for different values of `max_degree`, as the architecture
            will be different.

            For organic molecules `max_degree=5` is a good value (Duvenaud et. al, 2015)

    # Returns:
        atoms: np.array (float32), atom features of size
            `(molecules, max_atoms, atom_features)`
        bonds: np.array (float32), bond features of size
            `(molecules, max_atoms, max_degree, bond_features)`
        edges: np.array (int8), neighbour atom indices of size
            `(molecules, max_atoms, max_degree)`, padded with -1

    # Raises:
        AssertionError: if a smiles string cannot be parsed, or if a molecule
            exceeds an explicitly given `max_atoms` / `max_degree`.

    NOTE(review): edges are stored as int8, so neighbour indices above 127
        cannot be represented - confirm inputs never have that many atoms.

    TODO:
        * Arguments for sparse vector encoding

    '''

    # import sizes
    n = len(smiles)
    n_atom_features = features.num_atom_features()
    n_bond_features = features.num_bond_features()

    # preallocate atom tensor with 0's and bond tensor with -1 (because of 0 index)
    # If max_degree or max_atoms is set to None (auto), initialise dim as small
    #   as possible (1)
    atom_tensor = np.zeros((n, max_atoms or 1, n_atom_features),
                           dtype=np.float32)
    bond_tensor = np.zeros(
        (n, max_atoms or 1, max_degree or 1, n_bond_features),
        dtype=np.float32)
    edge_tensor = -np.ones((n, max_atoms or 1, max_degree or 1), dtype=np.int8)

    for mol_ix, s in enumerate(smiles):

        # load mol, atoms and bonds; stderr is captured only while parsing so
        # the message can be reported, and is always restored afterwards
        # (previously sys.stderr stayed redirected after this function ran)
        saved_stderr = sys.stderr
        sio = sys.stderr = StringIO()
        try:
            mol = Chem.MolFromSmiles(s)
        finally:
            sys.stderr = saved_stderr

        # Explicit raise (same exception type as the former assert) so the
        # check still runs when Python is started with -O.
        if mol is None:
            raise AssertionError(
                'Could not parse smiles {}, error: {}'.format(
                    s, sio.getvalue()))
        atoms = mol.GetAtoms()
        bonds = mol.GetBonds()

        # If max_atoms is exceeded, resize if max_atoms=None (auto), else raise
        if len(atoms) > atom_tensor.shape[1]:
            if max_atoms is not None:
                raise AssertionError(
                    'too many atoms ({0}) in molecule: {1}'.format(
                        len(atoms), s))
            atom_tensor = padaxis(atom_tensor, len(atoms), axis=1)
            bond_tensor = padaxis(bond_tensor, len(atoms), axis=1)
            edge_tensor = padaxis(edge_tensor,
                                  len(atoms),
                                  axis=1,
                                  pad_value=-1)

        # maps rdkit atom index -> position of the atom in the tensors
        rdkit_ix_lookup = {}

        for atom_ix, atom in enumerate(atoms):
            # write atom features
            atom_tensor[mol_ix,
                        atom_ix, :n_atom_features] = features.atom_features(
                            atom)

            # store entry in idx
            rdkit_ix_lookup[atom.GetIdx()] = atom_ix

        # preallocate array with neighbour lists (indexed by atom)
        connectivity_mat = [[] for _ in atoms]

        for bond in bonds:
            # lookup atom ids
            a1_ix = rdkit_ix_lookup[bond.GetBeginAtom().GetIdx()]
            a2_ix = rdkit_ix_lookup[bond.GetEndAtom().GetIdx()]

            # lookup how many neighbours are encoded yet
            a1_neigh = len(connectivity_mat[a1_ix])
            a2_neigh = len(connectivity_mat[a2_ix])

            # If max_degree is exceeded, resize if max_degree=None (auto), else raise
            new_degree = max(a1_neigh, a2_neigh) + 1
            if new_degree > bond_tensor.shape[2]:
                if max_degree is not None:
                    raise AssertionError(
                        'too many neighours ({0}) in molecule: {1}'.format(
                            new_degree, s))
                bond_tensor = padaxis(bond_tensor, new_degree, axis=2)
                edge_tensor = padaxis(edge_tensor,
                                      new_degree,
                                      axis=2,
                                      pad_value=-1)

            # store bond features in the next free neighbour slot of both atoms
            bond_feats = np.array(features.bond_features(bond), dtype=int)
            bond_tensor[mol_ix, a1_ix, a1_neigh, :] = bond_feats
            bond_tensor[mol_ix, a2_ix, a2_neigh, :] = bond_feats

            # add to connectivity matrix
            connectivity_mat[a1_ix].append(a2_ix)
            connectivity_mat[a2_ix].append(a1_ix)

        # store connectivity matrix; unused slots keep the -1 padding
        for a1_ix, neighbours in enumerate(connectivity_mat):
            degree = len(neighbours)
            edge_tensor[mol_ix, a1_ix, :degree] = neighbours

    return atom_tensor, bond_tensor, edge_tensor