示例#1
0
def graph_from_smiles(smiles, fp_switch):  # ecfp = False, fcfp = True
    """Build a MolGraph whose atom nodes carry ECFP and FCFP features.

    Args:
        smiles: Container holding the SMILES string. When it is exactly a
            ``numpy.ndarray`` the string is read from ``smiles[0]``; any
            other type is read through ``smiles._data[0][0]``.
        fp_switch: Not read anywhere in this function.

    Returns:
        The populated MolGraph containing 'atom', 'bond' and 'molecule'
        nodes.

    Raises:
        ValueError: If RDKit cannot parse the SMILES string.
    """
    graph = MolGraph()

    # Exact type comparison (not isinstance) so that ndarray subclasses
    # still go through the ``_data`` branch.
    if type(smiles) is np.ndarray:
        str_smiles = smiles[0]
    else:
        str_smiles = smiles._data[0][0]

    mol = MolFromSmiles(str_smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", str_smiles)

    fcfp = atom_features_from_fcfp(mol)
    node_by_rd_idx = {}
    # fcfp is indexed by iteration position, not by the RDKit atom index.
    for position, atom in enumerate(mol.GetAtoms()):
        combined = np.r_[atom_features_from_ecfp(atom), fcfp[position]]
        rd_idx = atom.GetIdx()
        node_by_rd_idx[rd_idx] = graph.new_node(
            'atom', features=combined, rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_by_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    # A single 'molecule' node is linked to every atom node.
    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
示例#2
0
def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph.

    One 'atom' node is created per atom and one 'bond' node per bond; a
    single 'molecule' node is linked to every atom node.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated MolGraph.

    Raises:
        ValueError: If ``MolFromSmiles`` raises or returns a falsy result.
    """
    graph = MolGraph()
    try:
        mol = MolFromSmiles(smiles)
    except Exception:
        # Previously a bare ``except`` printed the input and called quit(),
        # terminating the whole interpreter from inside a library function
        # (and swallowing KeyboardInterrupt). Report the failure through the
        # same ValueError used for an unparsable string instead.
        raise ValueError("Could not parse SMILES string:", smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        new_atom_node = graph.new_node('atom',
                                       features=atom_features(atom),
                                       rdkit_ix=atom.GetIdx())
        atoms_by_rd_idx[atom.GetIdx()] = new_atom_node

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        atom1_node.add_neighbors((atom2_node, ))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph
示例#3
0
def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph of atom, bond and molecule nodes.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated MolGraph.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    mol_graph = MolGraph()
    parsed = MolFromSmiles(smiles)
    if not parsed:
        raise ValueError("Could not parse SMILES string:", smiles)

    # One 'atom' node per atom, remembered under its RDKit index so the bond
    # pass below can look up both endpoints.
    node_for_atom = {}
    for atom in parsed.GetAtoms():
        rd_idx = atom.GetIdx()
        node_for_atom[rd_idx] = mol_graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in parsed.GetBonds():
        begin_node = node_for_atom[bond.GetBeginAtom().GetIdx()]
        end_node = node_for_atom[bond.GetEndAtom().GetIdx()]
        bond_node = mol_graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    # A single 'molecule' node is linked to every atom node.
    molecule_node = mol_graph.new_node('molecule')
    molecule_node.add_neighbors(mol_graph.nodes['atom'])
    return mol_graph
示例#4
0
def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph after assigning stereochemistry.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated MolGraph with 'atom', 'bond' and 'molecule' nodes.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # Validate BEFORE touching the molecule: the stereochemistry helpers
    # below used to run first, so an unparsable SMILES (mol is None) failed
    # inside RDKit instead of reaching this ValueError.
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    Chem.DetectBondStereochemistry(mol, -1)
    Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    Chem.AssignAtomChiralTagsFromStructure(mol, -1)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        new_atom_node = graph.new_node('atom',
                                       features=atom_features(atom),
                                       rdkit_ix=atom.GetIdx())
        atoms_by_rd_idx[atom.GetIdx()] = new_atom_node

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        atom1_node.add_neighbors((atom2_node, ))

    # A single 'molecule' node is linked to every atom node.
    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph
示例#5
0
def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph with Gasteiger-charge features.

    Each atom's ``_GasteigerCharge`` property is passed to ``atom_features``
    as a second argument; a NaN or infinite charge is replaced by 0.0.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated MolGraph with 'atom', 'bond' and 'molecule' nodes.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    node_by_rd_idx = {}
    rdPartialCharges.ComputeGasteigerCharges(mol)
    for atom in mol.GetAtoms():
        charge = float(atom.GetProp('_GasteigerCharge'))
        # Non-finite charges (NaN / +-inf) are zeroed out.
        if not np.isfinite(charge):
            charge = 0.0
        rd_idx = atom.GetIdx()
        node_by_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom, charge), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_by_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
示例#6
0
def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph of atom, bond and molecule nodes.

    The molecule is parsed with RDKit's default ``MolFromSmiles`` call.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated MolGraph.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # Atom nodes keyed by RDKit atom index, created in iteration order.
    atom_nodes = {
        atom.GetIdx(): graph.new_node('atom',
                                      features=atom_features(atom),
                                      rdkit_ix=atom.GetIdx())
        for atom in mol.GetAtoms()
    }

    for bond in mol.GetBonds():
        first = atom_nodes[bond.GetBeginAtom().GetIdx()]
        second = atom_nodes[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((first, second))
        first.add_neighbors((second, ))

    # A single 'molecule' node is linked to every atom node.
    whole_molecule = graph.new_node('molecule')
    whole_molecule.add_neighbors(graph.nodes['atom'])
    return graph
示例#7
0
def load_from_smiles(smiles):
    """Load a single molecule graph from its SMILES string.

    Atom, bond and molecule nodes are added to a new ``Molecule``; node ids
    come from ``node_id(smiles, index)`` and the molecule node uses the
    SMILES string itself as its id. Atom nodes are finally sorted with
    ``sort_by_degree('atom')``.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``Molecule`` graph.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    graph = Molecule()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    for atom in mol.GetAtoms():
        atom_id = node_id(smiles, atom.GetIdx())
        graph.add_node(Node('atom', atom_id, atom_features(atom)))

    for bond in mol.GetBonds():
        begin_id = node_id(smiles, bond.GetBeginAtom().GetIdx())
        src_node = graph.get_node('atom', begin_id)
        end_id = node_id(smiles, bond.GetEndAtom().GetIdx())
        tgt_node = graph.get_node('atom', end_id)

        bond_node = Node('bond', node_id(smiles, bond.GetIdx()),
                         bond_features(bond))
        graph.add_node(bond_node)

        # The bond knows both endpoints; each endpoint knows the bond and
        # the atom on the other side.
        bond_node.add_neighbors([src_node, tgt_node])
        src_node.add_neighbors([bond_node, tgt_node])
        tgt_node.add_neighbors([bond_node, src_node])

    mol_node = Node('molecule', smiles)
    graph.add_node(mol_node)
    mol_node.add_neighbors(graph.get_node_list('atom'))

    graph.sort_by_degree('atom')
    return graph
示例#8
0
 def Translate(self, smi, canonical=True):
     """
     Translate a SMILES string into an undirected graph G(V, E) with
     featureless vertices and unweighted edges, e.g. the graph equivalent
     of a saturated hydrocarbon.

     Input:
     smi        SMILES string parsed with MolFromSmiles
     canonical  forwarded unchanged to self.Write

     Returns whatever self.Write(degrees, edges, canonical=...) returns.
     """
     mol = MolFromSmiles(smi)
     # Per-atom degree and (begin, end) atom-index pair per bond are the
     # only information taken from the molecule.
     degrees = [a.GetDegree() for a in mol.GetAtoms()]
     edges = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()]
     return self.Write(degrees, edges, canonical=canonical)
示例#9
0
def get_max_atom_bond_size(smiles_iterator, explicit_hs=True):
    """Convenience function to get max_atoms, max_bonds for a set of input
    SMILES.

    Args:
        smiles_iterator: Iterable of SMILES strings (wrapped in ``tqdm``).
        explicit_hs: When true, ``AddHs`` is applied before counting.

    Returns:
        dict with 'max_atoms' (largest atom count seen) and 'max_bonds'
        (twice the largest bond count seen).
    """
    largest_atoms = 0
    largest_bonds = 0
    for smiles in tqdm(smiles_iterator):
        mol = MolFromSmiles(smiles)
        if explicit_hs:
            mol = AddHs(mol)

        n_atoms = len(mol.GetAtoms())
        if n_atoms > largest_atoms:
            largest_atoms = n_atoms

        n_bonds = len(mol.GetBonds())
        if n_bonds > largest_bonds:
            largest_bonds = n_bonds

    # The bond maximum is doubled on return.
    return {'max_atoms': largest_atoms, 'max_bonds': 2 * largest_bonds}
示例#10
0
 def process(self, smiles):  # build the graph
     """Build a dense-adjacency graph point for one SMILES string.

     The graph has one node per atom plus an extra node 0; atom ``i`` is
     stored at node ``i + 1``. Returns ``GCNPoint(n, adj, vec)`` where
     ``n`` is the node count, ``adj`` the dense adjacency matrix and
     ``vec`` the feature rows (a zero row for node 0 followed by one row
     per atom) moved to ``self.device``.
     """
     mol = MolFromSmiles(smiles)
     n = mol.GetNumAtoms() + 1

     graph = DGLGraph()
     graph.add_nodes(n)
     # Pair every node with itself, then link nodes 1..n-1 to node 0.
     graph.add_edges(graph.nodes(), graph.nodes())
     graph.add_edges(range(1, n), 0)

     # Each bond contributes an edge in both directions (indices shifted
     # by one because node 0 is the extra node).
     for bond in mol.GetBonds():
         begin = bond.GetBeginAtomIdx() + 1
         end = bond.GetEndAtomIdx() + 1
         graph.add_edge(begin, end)
         graph.add_edge(end, begin)

     adj = graph.adjacency_matrix(transpose=False).to_dense()

     atom_rows = [atom_feature(atom)[0][None, :] for atom in mol.GetAtoms()]
     atom_matrix = torch.cat(atom_rows)
     extra_row = torch.zeros((1, FEATURE_DIM))
     vec = torch.cat([extra_row, atom_matrix]).to(self.device)
     return GCNPoint(n, adj, vec)
    def gen_graph(self, smi):
        """Return fixed-size atom features and adjacency for one SMILES.

        Both outputs are cut or zero-padded to ``self.max_atom`` atoms.

        Returns:
            (feat, adj_pad): ``feat`` is a list of atomic numbers of length
            ``self.max_atom`` (0 for padding); ``adj_pad`` is a
            ``(max_atom, max_atom)`` array holding the adjacency matrix
            plus the identity for the kept atoms and zeros elsewhere.
        """
        mol = MolFromSmiles(smi)
        limit = self.max_atom

        # Cut the adjacency matrix to the limit, add self-connections for
        # the kept atoms, and embed it in the zero-padded square matrix.
        trimmed = Chem.rdmolops.GetAdjacencyMatrix(mol)[:limit, :limit]
        kept = len(trimmed)
        adj_pad = np.zeros((limit, limit))
        adj_pad[:kept, :kept] = trimmed + np.eye(kept)

        # Atomic numbers, cut to the limit and right-padded with zeros.
        feat = [atom.GetAtomicNum() for atom in mol.GetAtoms()][:limit]
        feat.extend([0] * (limit - len(feat)))

        return feat, adj_pad
示例#12
0
    def process(self, smiles):  # build the graph
        """Build a DGL graph point for one SMILES string.

        Nine per-atom integer fields are stored in ``graph.ndata``; their
        element-wise sum is passed through ``self.embed`` to obtain the
        node vectors. Returns ``GNNPoint(n, graph, vec)`` with ``n`` the
        atom count.
        """
        mol = MolFromSmiles(smiles)
        n = mol.GetNumAtoms()

        graph = DGLGraph()
        graph.add_nodes(n)
        graph.add_edges(graph.nodes(), graph.nodes())
        # NOTE(review): nodes 1..n-1 are linked to node 0, which here is the
        # first atom (no extra node is allocated) - confirm this is intended.
        graph.add_edges(range(1, n), 0)

        # (ndata key, per-atom extractor), in the order they are summed below.
        per_atom_fields = (
            ("element", lambda atom: ATOM[atom.GetAtomicNum()]),
            ("explicit", lambda atom: atom.GetExplicitValence()),
            ("implicit", lambda atom: atom.GetImplicitValence()),
            ("hybrid", lambda atom: HYBRID[atom.GetHybridization()]),
            ("hcount", lambda atom: atom.GetTotalNumHs()),
            ("degree", lambda atom: atom.GetDegree()),
            # Formal charge is shifted by +2 before being stored.
            ("charge", lambda atom: atom.GetFormalCharge() + 2),
            ("ring", lambda atom: int(atom.IsInRing())),
            ("aromatic", lambda atom: int(atom.GetIsAromatic())),
        )
        for field, extract in per_atom_fields:
            graph.ndata[field] = torch.tensor(
                [extract(atom) for atom in mol.GetAtoms()])

        # Each bond contributes an edge in both directions.
        for bond in mol.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            graph.add_edge(begin, end)
            graph.add_edge(end, begin)

        summed = graph.ndata[per_atom_fields[0][0]]
        for field, _ in per_atom_fields[1:]:
            summed = summed + graph.ndata[field]
        vec = self.embed(summed)
        return GNNPoint(n, graph, vec)
def load_ptc(smile_file, result_file):
    """Load the PTC data set as a list of labelled molecule graphs.

    Args:
        smile_file: Path to a text file whose second whitespace-separated
            token on each line is a SMILES string.
        result_file: Path to a text file with one line per molecule; the
            label is the second whitespace-separated token of the part
            before the first comma.

    Returns:
        The result of ``convert_graph_1`` applied to the
        ``(id, atom_symbols, adjacency, label)`` tuples of every molecule
        that has a recognised label and a parsable SMILES.

    Raises:
        IndexError: If a line lacks the expected token, or if there are
            fewer labels than SMILES strings.
    """
    valid_list = ['MR=P', 'MR=CE', 'MR=SE', 'MR=NE', 'MR=N']
    positive_labels = ('MR=P', 'MR=CE', 'MR=SE')

    # Context managers guarantee both files are closed (they used to be
    # opened and never closed).
    with open(smile_file, 'r') as f_smile:
        smiles = [line.split()[1] for line in f_smile]
    with open(result_file, 'r') as f_result:
        labels = [line.split(',')[0].split()[1] for line in f_result]

    # Keep molecules with a recognised label and binarise it:
    # MR=P / MR=CE / MR=SE -> 1, MR=NE / MR=N -> 0.
    filtered = []
    for i, smile in enumerate(smiles):
        label = labels[i]
        if label not in valid_list:
            continue
        filtered.append((smile, 1 if label in positive_labels else 0))

    graphs = []
    for smile, label in filtered:
        mol = MolFromSmiles(str(smile))
        if mol is None:
            # Unparsable SMILES are skipped and do not consume a graph id.
            continue
        adj = construct_edge_matrix_from(mol)
        atom_list = [a.GetSymbol() for a in mol.GetAtoms()]
        # Ids are consecutive over the graphs actually kept.
        graphs.append((len(graphs), atom_list, adj, label))

    atom2id = getAtom2id(graphs)
    return convert_graph_1(graphs, atom2id)
示例#14
0
def construct_RGCN_bigraph_from_smiles(smiles):
    """Build a bidirected DGL graph with atom and edge-type data from SMILES.

    Node data ``"atom"`` holds ``atom_features(atom).tolist()`` per atom.
    Every bond yields two edges (u->v and v->u) sharing one
    ``etype_features(bond)`` value, stored in edge data ``"etype"``. Edge
    data ``"normal"`` is, per edge, the fraction of edges with an equal
    etype value, rounded to one decimal.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``DGLGraph``.
    """
    g = DGLGraph()

    # Nodes
    mol = MolFromSmiles(smiles)
    g.add_nodes(mol.GetNumAtoms())
    g.ndata["atom"] = torch.tensor(
        [atom_features(atom).tolist() for atom in mol.GetAtoms()])

    # Edges: both directions per bond, kept in bond-index order.
    src_list, dst_list, etype_feature_all = [], [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        etype = etype_features(bond)
        u, v = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src_list += [u, v]
        dst_list += [v, u]
        etype_feature_all += [etype, etype]
    g.add_edges(src_list, dst_list)

    total = len(etype_feature_all)
    normal_all = [
        round(etype_feature_all.count(etype) / total, 1)
        for etype in etype_feature_all
    ]

    g.edata["etype"] = torch.tensor(etype_feature_all)
    g.edata["normal"] = torch.tensor(normal_all)
    return g
示例#15
0
 def impsmiles(self, m, keepiso=False):
     """Make a SMILES string with per-atom hydrogen counts written out.

     The molecule is written non-canonically, re-parsed, and the string is
     split into atom tokens and everything else. Atom tokens are rewritten
     as bracket atoms carrying their hydrogen count; all other pieces
     (bonds, ring digits, parentheses) are copied through unchanged.

     m        RDKit molecule
     keepiso  passed to MolToSmiles as isomericSmiles; when false, the
              "([H])" and "[H]" substrings are also removed from the
              written SMILES before it is re-parsed

     Returns the rewritten SMILES string.
     """
     import re
     smi = MolToSmiles(m, canonical=False, isomericSmiles=keepiso)
     if not keepiso:
         smi = smi.replace('([H])', '')
         smi = smi.replace('[H]', '')
     mol = MolFromSmiles(smi)
     atoms = mol.GetAtoms()
     # Raw string: "\[" and "\]" are regex escapes, not Python escapes.
     # The capture group makes re.split keep the atom tokens in the output.
     parts = re.split(r"(\[.*?\]|Cl|Br|F|I|B|C|c|N|n|O|o|S|s|P|p)", smi)

     pieces = []
     iatom = 0  # index of the atom the next atom token corresponds to
     for p in parts:
         if not p:
             continue
         if p.isalpha():
             # Bare (organic-subset) atom token.
             hcount = atoms[iatom].GetImplicitValence()
             if hcount == 0:
                 pieces.append(p)
             elif hcount == 1:
                 pieces.append("[%sH]" % p)
             else:
                 pieces.append("[%sH%d]" % (p, hcount))
             iatom += 1
         elif p.startswith("["):
             # Bracket atom: kept verbatim unless it has implicit Hs, in
             # which case it is rebuilt from symbol, H count and charge.
             atom = atoms[iatom]
             hcount = atom.GetNumImplicitHs()
             if hcount == 0:
                 pieces.append(p)
             elif hcount == 1:
                 # Fixed: the formal charge used to be formatted with
                 # "H%d", i.e. printed as if it were the hydrogen count.
                 # It is now written as a signed charge, matching the
                 # multi-hydrogen branch below.
                 pieces.append("[%sH%+d]" % (atom.GetSymbol(),
                                             atom.GetFormalCharge()))
             else:
                 pieces.append("[%sH%d%+d]" % (atom.GetSymbol(), hcount,
                                               atom.GetFormalCharge()))
             iatom += 1
         else:
             pieces.append(p)
     return "".join(pieces)
示例#16
0
def extract_graph(data_path, out_file_path, max_atom_num, label_name=None):
    """Extract padded graph matrices for every molecule in a CSV file.

    For each SMILES in the 'SMILES' column that parses and has at most
    ``max_atom_num`` atoms, three matrices are built: a symmetric 0/1
    adjacency matrix, a symmetric pairwise distance matrix computed from
    RDKit 2D coordinates, and a node attribute matrix filled by
    ``extract_atom_features``. All are zero-padded to ``max_atom_num``.
    The stacked matrices are written with ``np.savez_compressed``.

    Args:
        data_path: CSV file read with pandas; must have a 'SMILES' column.
        out_file_path: Destination handed to ``np.savez_compressed``.
        max_atom_num: Padding size; larger molecules are skipped.
        label_name: Optional column whose values (restricted to the kept
            rows) are stored too. They are saved under the literal key
            'label_name', which is the existing on-disk format.

    Returns:
        None. Summary statistics are printed.
    """
    import os
    from rdkit import RDConfig
    from rdkit.Chem import ChemicalFeatures
    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    data_pd = pd.read_csv(data_path)
    smiles_list = data_pd['SMILES'].tolist()

    symbol_candidates = set()
    atom_attribute_dim = num_atom_features()

    node_attribute_matrix_list = []
    # Never populated in this function; still saved (empty) so the archive
    # keeps the same set of keys.
    bond_attribute_matrix_list = []
    adjacent_matrix_list = []
    distance_matrix_list = []
    valid_index = []

    # Value ranges observed across the data set, printed for inspection.
    degree_set = set()
    h_num_set = set()
    implicit_valence_set = set()
    charge_set = set()

    for line_idx, smiles in enumerate(smiles_list):
        smiles = smiles.strip()
        mol = MolFromSmiles(smiles)
        if mol is None:
            # An unparsable SMILES used to crash in Compute2DCoords; skip
            # it like an outlier so valid_index stays aligned with the
            # saved matrices.
            print('Could not parse SMILES at row {}: {}'.format(line_idx,
                                                                smiles))
            continue

        n_atoms = mol.GetNumAtoms()
        # Skip oversize molecules before doing any expensive work.
        if n_atoms > max_atom_num:
            print('Outlier {} has {} atoms'.format(line_idx, n_atoms))
            continue
        valid_index.append(line_idx)

        AllChem.Compute2DCoords(mol)
        conformer = mol.GetConformers()[0]
        feats = factory.GetFeaturesForMol(mol)
        # Sets, not map() iterators: on Python 3 a map object is consumed
        # by the first ``in`` test, so later atoms were never matched.
        acceptor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Acceptor'
        }
        donor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Donor'
        }

        adjacent_matrix = np.zeros((max_atom_num, max_atom_num), dtype=int)
        distance_matrix = np.zeros((max_atom_num, max_atom_num))
        node_attribute_matrix = np.zeros((max_atom_num, atom_attribute_dim),
                                         dtype=int)

        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            symbol_candidates.add(atom.GetSymbol())
            degree_set.add(atom.GetDegree())
            h_num_set.add(atom.GetTotalNumHs())
            implicit_valence_set.add(atom.GetImplicitValence())
            charge_set.add(atom.GetFormalCharge())
            node_attribute_matrix[atom_idx] = extract_atom_features(
                atom,
                is_acceptor=atom_idx in acceptor_atom_ids,
                is_donor=atom_idx in donor_atom_ids)
        node_attribute_matrix_list.append(node_attribute_matrix)

        # Symmetric pairwise distances; positions are fetched once per atom.
        positions = [conformer.GetAtomPosition(i) for i in range(n_atoms)]
        for idx_i in range(n_atoms):
            for idx_j in range(idx_i + 1, n_atoms):
                distance = get_atom_distance(positions[idx_i],
                                             positions[idx_j])
                distance_matrix[idx_i, idx_j] = distance
                distance_matrix[idx_j, idx_i] = distance
        distance_matrix_list.append(distance_matrix)

        for bond in mol.GetBonds():
            begin_index = bond.GetBeginAtom().GetIdx()
            end_index = bond.GetEndAtom().GetIdx()
            adjacent_matrix[begin_index, end_index] = 1
            adjacent_matrix[end_index, begin_index] = 1
        adjacent_matrix_list.append(adjacent_matrix)

    adjacent_matrix_list = np.asarray(adjacent_matrix_list)
    distance_matrix_list = np.asarray(distance_matrix_list)
    node_attribute_matrix_list = np.asarray(node_attribute_matrix_list)
    bond_attribute_matrix_list = np.asarray(bond_attribute_matrix_list)
    print('adjacent matrix shape\t', adjacent_matrix_list.shape)
    print('distance matrix shape\t', distance_matrix_list.shape)
    print('node attr matrix shape\t', node_attribute_matrix_list.shape)
    print('bond attr matrix shape\t', bond_attribute_matrix_list.shape)
    print(symbol_candidates)
    print('{} valid out of {}'.format(len(valid_index), len(smiles_list)))

    print('degree set:\t', degree_set)
    print('h num set: \t', h_num_set)
    print('implicit valence set: \t', implicit_valence_set)
    print('charge set:\t', charge_set)

    arrays = dict(
        adjacent_matrix_list=adjacent_matrix_list,
        distance_matrix_list=distance_matrix_list,
        node_attribute_matrix_list=node_attribute_matrix_list,
        bond_attribute_matrix_list=bond_attribute_matrix_list)
    if label_name is not None:
        true_labels = np.array(data_pd[label_name].tolist())
        # Explicit integer dtype so an empty selection is still a valid
        # index array.
        arrays['label_name'] = true_labels[np.array(valid_index, dtype=int)]
    np.savez_compressed(out_file_path, **arrays)
    print()
示例#17
0
    def construct_feature_matrices(self, smiles, train=True):
        """ Construct a molecule from the given SMILES string and return its
        atom and bond classes.

        ``train`` is copied onto both tokenizers before tokenizing; when it
        is true, ``self.max_atoms`` / ``self.max_bonds`` are also raised to
        this molecule's atom / bond counts if those are larger.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of bonds in the molecule (``mol.GetNumBonds()``)
        'bond_indices' : RDKit bond index behind each directed edge
        'atom' : (n_atom,) array of atom classes
        'bond' : array of bond classes, one per directed edge
        'connectivity' : array of (source atom, target atom) pairs, one row
            per directed edge

        The per-edge arrays have ``2 * mol.GetNumBonds()`` rows, or a single
        zero-filled row when the molecule has no bonds.
        """

        self.atom_tokenizer.train = train
        self.bond_tokenizer.train = train

        logger = logging.getLogger(__name__)
        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = mol.GetNumAtoms()
        n_bond = 2 * mol.GetNumBonds()

        # An isolated atom still gets one (zero-filled) edge slot.
        if n_bond == 0:
            n_bond = 1
            logger.warning(f'Found molecule {smiles} with zero bonds')

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        bond_indices = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        edge = 0
        for row, atom in enumerate(mol.GetAtoms()):
            atom_feature_matrix[row] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()
            for bond in atom.GetBonds():
                begin = bond.GetBeginAtomIdx()
                # True when the bond is stored pointing *at* this atom.
                rev = begin != origin

                bond_feature_matrix[edge] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))

                # Remember which original bond this directed edge came from.
                bond_indices[edge] = bond.GetIdx()

                # Write the endpoints in the stored direction, or swapped
                # when the bond is reversed relative to this atom.
                if rev:
                    connectivity[edge, 0] = bond.GetEndAtomIdx()
                    connectivity[edge, 1] = bond.GetBeginAtomIdx()
                else:
                    connectivity[edge, 0] = bond.GetBeginAtomIdx()
                    connectivity[edge, 1] = bond.GetEndAtomIdx()

                edge += 1

        # Track the largest atom and bond counts seen during training.
        if train:
            if n_atom > self.max_atoms:
                self.max_atoms = n_atom
            if mol.GetNumBonds() > self.max_bonds:
                self.max_bonds = mol.GetNumBonds()

        result = {}
        result['n_atom'] = n_atom
        result['n_bond'] = mol.GetNumBonds()  # the real number of bonds
        result['bond_indices'] = bond_indices
        result['atom'] = atom_feature_matrix
        result['bond'] = bond_feature_matrix
        result['connectivity'] = connectivity
        return result
示例#18
0
from rdkit.Chem import MolFromSmiles as MFS
from tqdm import tqdm

# Collect the set of element symbols that occur in the SMILES file and write
# them, space-separated, to atom_types.txt.
atoms = set()
with open("data/ZINC/ghose_filtered/raw/smiles.txt") as f:
  for smiles in tqdm(f.readlines()):
    mol = MFS(smiles.strip())
    if mol is None:
      # Unparsable lines are skipped.
      continue
    mol_atoms = mol.GetAtoms()
    for atom in mol_atoms:
      atoms.add(atom.GetSymbol())
print(atoms)
print(len(atoms))
with open("data/ZINC/ghose_filtered/raw/atom_types.txt", "w") as f:
  # Sorted so the file is identical from run to run; iterating the set
  # directly yields an order that can differ between interpreter runs.
  f.write(" ".join(sorted(atoms)))
示例#19
0
    R_cnn = R_cnn_1 + R_cnn_2 + R_cnn_3 + R_cnn_4 + R_cnn_5 + R_cnn_6 + \
            R_cnn_7 + R_cnn_8 + R_cnn_9 + R_cnn_10 + R_cnn_15 + R_cnn_20

    LRPCheck("Deconvolution:", R_cnn, l_out, verbose)

    scores = np.sum(R_cnn, axis=1)

    return y_real[0], scores, np.sum(l_out) - np.sum(R_cnn)


# Main Code
# Canonicalize the input SMILES (useChiral=0), parse it and compute its
# exact molecular weight.
# NOTE(review): `smiles`, `info` and `calcQSAR` are not defined in the visible
# part of this script - presumably set up earlier; verify.
smiles = CanonSmiles(smiles, useChiral=0)
mol = MolFromSmiles(smiles)
mw = Descriptors.ExactMolWt(mol)
# Atom index -> SMARTS string; only the indices are used by the loop below.
atoms = {a.GetIdx(): a.GetSmarts() for a in mol.GetAtoms()}
impacts = np.zeros(len(atoms), dtype='float')

print("Predicting %i atoms..." % (len(atoms)))
vals = []
# One calcQSAR call per atom index: the returned value is collected and the
# first returned score is stored as that atom's impact.
for idx, a in tqdm(atoms.items()):
    val, scores, _ = calcQSAR(mol, idx, mw, verbose=False)
    vals.append(val)
    impacts[idx] = scores[0]

# Mean and standard deviation over the per-atom values.
res = np.mean(vals)
std = np.std(vals)

# The reported interval is 1.96 * std / sqrt(number of values).
# NOTE(review): the second placeholder is `{:7f}` (field width 7) while the
# first is `{:.7f}` (7 decimals) - possibly a typo; confirm before changing.
print("\n{} Prediction = {:.7f} +/- {:7f} {}".format(info[0], res, 1.96 * std / math.sqrt(len(vals)), info[3]))

# plot the results
示例#20
0
文件: preprocessor.py 项目: akey7/nfp
    def construct_feature_matrices(self, smiles):
        """ Construct a molecule from the given SMILES string and return its
        atom and bond classes.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of directed edges, i.e. twice the number of bonds
            (or 1 when the molecule has no bonds)
        'atom' : (n_atom,) array of atom classes
        'bond' : (n_bond,) array of bond classes
        'connectivity' : (n_bond, 2) array of source atom, target atom pairs.

        """

        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = len(mol.GetAtoms())
        # Two directed edges per bond; an isolated atom still gets one
        # (zero-filled) edge slot.
        n_bond = max(2 * len(mol.GetBonds()), 1)

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        atom_seq = mol.GetAtoms()
        atoms = [atom_seq[i] for i in range(n_atom)]

        edge = 0
        for row, atom in enumerate(atoms):
            atom_feature_matrix[row] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()
            for bond in atom.GetBonds():
                # True when the bond is stored pointing *at* this atom.
                rev = bond.GetBeginAtomIdx() != origin

                bond_feature_matrix[edge] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))

                # Write the endpoints in the stored direction, or swapped
                # when the bond is reversed relative to this atom.
                if rev:
                    connectivity[edge, 0] = bond.GetEndAtomIdx()
                    connectivity[edge, 1] = bond.GetBeginAtomIdx()
                else:
                    connectivity[edge, 0] = bond.GetBeginAtomIdx()
                    connectivity[edge, 1] = bond.GetEndAtomIdx()

                edge += 1

        result = {}
        result['n_atom'] = n_atom
        result['n_bond'] = n_bond
        result['atom'] = atom_feature_matrix
        result['bond'] = bond_feature_matrix
        result['connectivity'] = connectivity
        return result
    def parse_smiles_str(self, smiles_str, id, target=None):
        """Parse a SMILES string into a dictionary describing its graph.

        Args:
            smiles_str: SMILES string parsed with RDKit.
            id: Identifier of the molecule; converted to an ``np.int64``
                array unless it already is a NumPy array.
            target: Optional target value(s). Scalars and sequences are
                converted to a float32 array and flattened to one row.

        Returns:
            ``None`` if the SMILES string cannot be parsed, otherwise a dict
            with keys 'node_features' (n_nodes x num_node_features),
            'edge_features' (n_edges x num_edge_features), 'adj_mat' and
            'inc_mat' (each 2*n_edges x 2 index pairs, sorted
            lexicographically), 'target' (1 x n_targets float32), 'id' and
            'shape' (n_nodes, n_edges, num_node_features,
            num_edge_features, n_targets, 1).
        """
        # Use RDKit to parse SMILES string
        mol = MolFromSmiles(smiles_str)
        if not mol:
            return None

        # Represent hydrogen atoms explicitly (if necessary)
        if self.config['explicit_Hs']:
            mol = Chem.AddHs(mol)

        # Compute number of nodes (atoms) and edges (bonds)
        n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

        # Allocate space for NumPy arrays representing the molecular graph
        node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
        edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
        adj_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
        inc_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

        # Retrieve node (atom) features, if needed
        if self.num_node_features > 0:
            for i, atom in enumerate(mol.GetAtoms()):
                node_features[i] = self.get_node_features(atom)

        # Retrieve edges (bonds)
        for i, bond in enumerate(mol.GetBonds()):
            begin_idx = bond.GetBeginAtom().GetIdx()
            end_idx = bond.GetEndAtom().GetIdx()
            # Each bond contributes both directions to the adjacency matrix
            adj_mat[2*i] = [begin_idx, end_idx]
            adj_mat[2*i+1] = [end_idx, begin_idx]
            # ... and one (atom, bond) pair per endpoint to the incidence matrix
            inc_mat[2*i] = [begin_idx, i]
            inc_mat[2*i+1] = [end_idx, i]

            # Retrieve edge (bond) features, if needed
            if self.num_edge_features > 0:
                edge_features[i] = self.get_edge_features(bond)

        # Sort the adjacency and incidence matrices lexicographically
        # (first column is the primary key, second column breaks ties)
        adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
        inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

        # Represent molecular graph as a dictionary
        g = {'node_features': node_features, 'edge_features': edge_features, 'adj_mat': adj_mat, 'inc_mat': inc_mat}

        # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
        if target is not None:
            # Convert scalars to NumPy array
            if not isinstance(target, np.ndarray):
                target = np.array(target, np.float32)

            # Ensure target is of type np.float32
            target = target.astype(np.float32)

            # Flatten targets of rank >= 2
            if target.ndim > 1:
                target = target.flatten()

            # Store target as a (row) 2D NumPy array (for compatibility)
            g['target'] = np.reshape(target, (1, -1))
            n_targets = g['target'].shape[1]
        # If there are no targets, add an empty NumPy array (for compatibility)
        else:
            g['target'] = np.zeros((1, 0), dtype=np.float32)
            n_targets = 0

        # Add ID, making sure it is a NumPy array object with method tobytes().
        # Fixed: this used to test ``target`` instead of ``id``, so the ID was
        # left unconverted whenever a target had been supplied.
        if not isinstance(id, np.ndarray):
            id = np.array(id, np.int64)
        g['id'] = id

        # Finally, add shape information. The last element refers to the number of graphs, and is included for
        # compatibility with batched graphs
        g['shape'] = np.array((n_nodes, n_edges, self.num_node_features, self.num_edge_features, n_targets, 1),
                              np.int64)

        return g
示例#22
0
    def process(self):
        """Build the multi-label drug-pair graph dataset and save it to disk.

        Returns immediately when ``Decagon-<datatype>-multi.pt`` already exists
        in ``self.processed_dir``.  Otherwise every key of
        ``self.json_load[self.datatype]`` is parsed with ``literal_eval`` into a
        ``(smiles1, smiles2)`` pair and its value is used as the label vector.

        Each pair becomes one ``DATA.Data`` object that stores both molecules
        in a single graph: the atoms of the second molecule are indexed after
        those of the first (offset by ``c1_size``), and ``c1_size`` /
        ``c2_size`` record how many atoms belong to each molecule.

        A pair is skipped when either SMILES cannot be parsed, when either
        molecule has no atoms, or when the pair has no bonds at all.

        The collected samples go through ``self.pre_filter`` and
        ``self.pre_transform`` (when set), are collated, and are written to
        ``self.processed_paths[0]``.
        """
        # NOTE(review): the guard uses a hard-coded file name while the result
        # is saved to self.processed_paths[0]; presumably both refer to the
        # same file -- confirm.
        processed_file = os.path.join(
            self.processed_dir, 'Decagon-{}-multi.pt'.format(self.datatype))
        if osp.exists(processed_file):
            return

        data_list = []

        # JSON keys are stringified tuples; turn them back into tuples.
        json_dict = {
            literal_eval(k): v
            for k, v in self.json_load[self.datatype].items()
        }
        total = len(json_dict)

        for idx, ((smiles1, smiles2), raw_label) in enumerate(
                json_dict.items()):
            printProgress(idx + 1, total,
                          '{} dataset preparation: '.format(self.datatype),
                          ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)
            label = np.array(raw_label)

            if mol1 is None or mol2 is None:
                # Report the SMILES strings: the parsed objects are None (or an
                # opaque Mol repr) and would not identify the offending pair.
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()

            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            features, edges = [], []

            # Atom features of molecule 1 first, then molecule 2, each vector
            # divided by the sum of its entries.
            for mol in (mol1, mol2):
                for atom in mol.GetAtoms():
                    feature = atom_features(atom)
                    features.append(feature / sum(feature))  # normalize

            # Bonds of molecule 2 are shifted so that they address the atoms
            # appended after molecule 1's atoms.
            for mol, offset in ((mol1, 0), (mol2, c1_size)):
                for bond in mol.GetBonds():
                    edges.append([
                        bond.GetBeginAtomIdx() + offset,
                        bond.GetEndAtomIdx() + offset
                    ])

            if not edges:
                continue

            # Going through an undirected graph and back yields every bond in
            # both directions.
            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            pair_data = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(1, -1))
            pair_data['c1_size'] = torch.LongTensor([c1_size])
            pair_data['c2_size'] = torch.LongTensor([c2_size])
            data_list.append(pair_data)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
示例#23
0
    def process(self):
        """Build the binary drug-pair graph dataset and save it to disk.

        Returns immediately when ``Decagon-<datatype>.pt`` already exists in
        ``self.processed_dir``.

        The last column of the CSV at ``self.total_data_dir`` is read as the
        list of side-effect names and encoded to integer class indices with a
        ``LabelEncoder``.  Then ``Decagon-negative-<datatype>.csv`` and
        ``Decagon-positive-<datatype>.csv`` (in ``self.dataset_dir``) are read
        in that order; in each row the last column is the side effect and the
        preceding columns are the two SMILES strings.  Samples from the
        negative file get label 0, samples from the positive file label 1.

        Each pair becomes one ``DATA.Data`` object that stores both molecules
        in a single graph (atoms of the second molecule are indexed after
        those of the first), together with ``c1_size``, ``c2_size`` and a
        one-hot ``side_effect`` row vector.

        A pair is skipped when either SMILES cannot be parsed, when either
        molecule has no atoms, or when the pair has no bonds at all.

        The collected samples go through ``self.pre_filter`` and
        ``self.pre_transform`` (when set), are collated, and are written to
        ``self.processed_paths[0]``.
        """
        # NOTE(review): the guard uses a hard-coded file name while the result
        # is saved to self.processed_paths[0]; presumably both refer to the
        # same file -- confirm.
        processed_file = os.path.join(
            self.processed_dir, 'Decagon-{}.pt'.format(self.datatype))
        if osp.exists(processed_file):
            return

        data_list = []

        # >>> Obtain One-Hot Encoding for Side-Effects
        with open(self.total_data_dir, 'r', encoding='utf-8') as f:
            target_list = [line[-1] for line in csv.reader(f)]

        label_encoder = LabelEncoder()
        label_encoder.fit(target_list)
        label_list = label_encoder.transform(target_list)
        num_classes = len(label_encoder.classes_)

        # Side-effect name -> encoded class index.
        target_dict = dict(zip(target_list, label_list))

        # negative will be 0, positive will be 1
        for label_idx, mode in enumerate(['negative', 'positive']):
            pair_list, se_list = [], []
            csv_path = osp.join(
                self.dataset_dir,
                'Decagon-{}-{}.csv'.format(mode, self.datatype))
            with open(csv_path, 'r', encoding='utf-8') as f:
                for line in csv.reader(f):
                    se_list.append(line[-1])
                    pair_list.append(line[:-1])
            total = len(pair_list)

            for idx, (smiles_pair, se) in enumerate(zip(pair_list, se_list)):
                smiles1, smiles2 = smiles_pair
                side_effect = [0] * num_classes
                side_effect[target_dict[se]] = 1

                printProgress(idx + 1, total,
                              '{} dataset preparation: '.format(self.datatype),
                              ' ', 2, 50)
                mol1 = MolFromSmiles(smiles1)
                mol2 = MolFromSmiles(smiles2)
                label = [int(label_idx)]

                if mol1 is None or mol2 is None:
                    # Report the SMILES strings: the parsed objects are None
                    # (or an opaque Mol repr) and would not identify the pair.
                    print("There is a missing drug from the pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                # >>> Get pairwise graph G1, G2
                c1_size = mol1.GetNumAtoms()
                c2_size = mol2.GetNumAtoms()

                if c1_size == 0 or c2_size == 0:
                    print("There is a size error from pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                features, edges = [], []

                # Atom features of molecule 1 first, then molecule 2, each
                # vector divided by the sum of its entries.
                for mol in (mol1, mol2):
                    for atom in mol.GetAtoms():
                        feature = atom_features(atom)
                        features.append(feature / sum(feature))  # normalize

                # Bonds of molecule 2 are shifted so that they address the
                # atoms appended after molecule 1's atoms.
                for mol, offset in ((mol1, 0), (mol2, c1_size)):
                    for bond in mol.GetBonds():
                        edges.append([
                            bond.GetBeginAtomIdx() + offset,
                            bond.GetEndAtomIdx() + offset
                        ])

                if not edges:
                    continue

                # Going through an undirected graph and back yields every bond
                # in both directions.
                G = nx.Graph(edges).to_directed()
                edge_index = [[e1, e2] for e1, e2 in G.edges]

                pair_data = DATA.Data(
                    x=torch.Tensor(features),
                    edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                    y=torch.Tensor(label).view(-1, 1))
                pair_data['c1_size'] = torch.LongTensor([c1_size])
                pair_data['c2_size'] = torch.LongTensor([c2_size])
                pair_data['side_effect'] = torch.Tensor(side_effect).view(
                    1, -1)
                data_list.append(pair_data)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])