示例#1
0
def structure_standardization(smi):
    """
    Clean up a SMILES string with RDKit and the MolVS standardizer.

    On every call the tautomer transform rules are refreshed, the MolVS standardizer module is reloaded and the
    ``standardization`` section of the configuration is read. The SMILES is then parsed into a mol object.
    Unreadable SMILES and molecules whose atom count (``GetNumAtoms()``) is above the configured ``max_num_atoms``
    are written to the log and rejected. Accepted molecules are reduced to their charge and isotope parent,
    optionally to their stereo parent (only when ``include_stereoinfo`` is ``False``) and to their tautomer parent,
    standardized with MolVS (https://molvs.readthedocs.io/en/latest/index.html) and converted back to SMILES.
    Molecules failing during these steps are written to the log as well.
    :param smi: Input SMILES from the given structure data file T4
    :return: smi_clean: Cleaned and standardized SMILES of the given input SMILES, or ``np.nan`` when the input
        could not be read, was too large or failed the standardization.
    """
    tautomer.TAUTOMER_TRANSFORMS = update_tautomer_rules()
    importlib.reload(MolVS_standardizer)
    settings = ReadConfig().get_conf_dict(parameters.get_parameters())['standardization']

    atom_limit = settings['max_num_atoms']
    tautomer_limit = settings['max_num_tautomers']
    keep_stereo = settings['include_stereoinfo']
    standardizer = MolVS_standardizer.Standardizer(max_tautomers=tautomer_limit)

    mol = MolFromSmiles(smi)  # Read SMILES and convert it to RDKit mol object.
    if mol is None:  # The input SMILES could not be converted into a mol object.
        logging.error('Reading Error, ' + smi)
        return np.nan

    fits_limit = mol.GetNumAtoms() <= atom_limit  # size check based on the atom count of the mol object
    if not fits_limit:
        logging.error('Molecule too large, ' + smi)
        return np.nan

    try:
        mol = standardizer.charge_parent(mol)  # standardize molecules using MolVS and RDKit
        mol = standardizer.isotope_parent(mol)
        if keep_stereo is False:
            # Stereo information is only dropped on an explicit False setting.
            mol = standardizer.stereo_parent(mol)
        mol = standardizer.tautomer_parent(mol)
        return MolToSmiles(standardizer.standardize(mol))  # convert mol object back to SMILES
    except (ValueError, AttributeError) as err:
        # write failed molecules during standardization to log file
        logging.error('Standardization error, ' + smi + ', Error Type: ' + str(err))
        return np.nan
示例#2
0
def test_custom_kekulize():
    """Display a molecule and print its bond types before and after custom kekulization."""

    def dump_bond_types(molecule):
        # Bonds are visited atom by atom, so each bond is printed once per atom it is attached to.
        for atom_idx in range(0, molecule.GetNumAtoms()):
            for bond in molecule.GetAtomWithIdx(atom_idx).GetBonds():
                print(bond.GetBondType())

    # Alternative input that was tried earlier: 'CC=C1c2ccccc2C(=CC)c3ccccc13'
    smiles = 'N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2'
    mol = MolFromSmiles(smiles)

    # State before the transformation.
    display(mol)
    dump_bond_types(mol)

    non_aromatic_atoms = find_custom_Kekulize_set(mol, 60, 5)
    mol = custom_kekulize(mol, non_aromatic_atoms)

    # State after the transformation.
    display(mol)
    dump_bond_types(mol)
示例#3
0
 def _one_random(smls, n, iso, q):
     res = list()
     for s in smls:
         r = list()
         m = MolFromSmiles(s)
         if m:
             start = time()
             while len(set(r)) < n and (time() - start) < 10:
                 ans = list(range(m.GetNumAtoms()))
                 np.random.shuffle(ans)
                 nm = RenumberAtoms(m, ans)
                 r.append(
                     MolToSmiles(nm, canonical=False, isomericSmiles=iso))
             res.extend(r)
     q.put(res)
示例#4
0
 def process(self, smiles):  # build the graph
     """Turn a SMILES string into a ``GCNPoint`` (node count, dense adjacency, node features).

     The graph has one node more than the molecule has atoms: node 0 is an extra node and atom ``i``
     becomes node ``i + 1``. Every node gets a self-loop, every atom node gets an edge to node 0 and
     every bond is added in both directions.
     """
     mol = MolFromSmiles(smiles)
     n = mol.GetNumAtoms() + 1
     graph = DGLGraph()
     graph.add_nodes(n)
     graph.add_edges(graph.nodes(), graph.nodes())  # self-loops
     graph.add_edges(range(1, n), 0)  # atom nodes -> extra node 0
     for bond in mol.GetBonds():
         begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
         # Shift by one because node 0 is not an atom.
         graph.add_edge(begin + 1, end + 1)
         graph.add_edge(end + 1, begin + 1)
     adj = graph.adjacency_matrix(transpose=False).to_dense()

     atom_rows = torch.cat(
         [atom_feature(atom)[0][None, :] for atom in mol.GetAtoms()])
     # Node 0 gets an all-zero feature row of width FEATURE_DIM.
     extra_row = torch.zeros((1, FEATURE_DIM))
     vec = torch.cat([extra_row, atom_rows]).to(self.device)
     return GCNPoint(n, adj, vec)
示例#5
0
    def calculate_single(self, smiles) -> Tuple:
        """Standardize a single SMILES string.

        Args:
            smiles: input SMILES string; NaN marks a missing entry.

        Returns:
            Tuple: ``(standardized_smiles, True, None)`` on success, otherwise
            ``(None, False, error_message)``.
        """
        # Missing entries: the imported ``nan`` object itself or any other float NaN.
        if smiles is nan or (isinstance(smiles, float) and smiles != smiles):
            return None, False, "No smiles entry."
        try:
            mol = MolFromSmiles(
                smiles)  # Read SMILES and convert it to RDKit mol object.
        except (TypeError, ValueError, AttributeError) as e:
            return None, False, str(e)
        # Check, if the input SMILES has been converted into a mol object.
        if mol is None:
            return None, False, "failed to parse smiles {}".format(smiles)
        # Check the size of the molecule. Only molecules strictly above the limit are
        # rejected, matching the "exceeds limit" wording of the message.
        num_atoms = mol.GetNumAtoms()
        if num_atoms > self.max_num_atoms:
            return (
                None,
                False,
                "number of non-H atoms {0} exceeds limit of {1} for smiles {2}"
                .format(num_atoms, self.max_num_atoms, smiles),
            )
        try:
            mol = rdMolStandardize.ChargeParent(
                mol)  # standardize molecules using MolVS and RDKit
            mol = self.isotope_parent(mol)
            if self.include_stereoinfo is False:
                Chem.RemoveStereochemistry(mol)
            mol = self.tautomerizer.Canonicalize(mol)
            mol_clean_tmp = self.my_standardizer(mol)
            smi_clean_tmp = MolToSmiles(
                mol_clean_tmp)  # convert mol object back to SMILES
            # Double check that the standardized SMILES is a valid mol object.
            mol_clean = MolFromSmiles(smi_clean_tmp)
            if mol_clean is None:
                return (
                    None,
                    False,
                    "failed to re-parse standardized smiles {}".format(
                        smi_clean_tmp),
                )
            smi_clean = MolToSmiles(mol_clean)
        except (TypeError, ValueError, AttributeError) as e:
            return None, False, str(e)
        return smi_clean, True, None
示例#6
0
def randomize_smiles(smiles, num=10, isomeric=True, max_attempts=None):
    """ Generate different SMILES representations for the same molecule

    Atoms are renumbered in random order and the molecule is written out non-canonically until ``num``
    distinct strings were produced. Small molecules can have fewer than ``num`` distinct representations
    (a single atom has exactly one), so the number of attempts is bounded to guarantee termination.

    :param smiles: {str} SMILES string
    :param num: {int} number of different SMILES strings to generate
    :param isomeric: {bool} whether to consider stereo centers
    :param max_attempts: {int} upper bound on the number of random renumberings; defaults to
        ``max(1000, 100 * num)``
    :return: list of all generated SMILES (duplicates are not removed); it holds ``num`` distinct strings
        unless the attempt budget ran out first
    :raises AttributeError: if ``smiles`` cannot be parsed and at least one attempt is made
    """
    m = MolFromSmiles(smiles)
    if max_attempts is None:
        max_attempts = max(1000, 100 * num)

    res = list()
    seen = set()  # distinct strings so far; avoids rebuilding set(res) on every iteration
    attempts = 0
    while len(seen) < num and attempts < max_attempts:
        ans = list(range(m.GetNumAtoms()))
        np.random.shuffle(ans)
        nm = RenumberAtoms(m, ans)
        candidate = MolToSmiles(nm, canonical=False, isomericSmiles=isomeric)
        res.append(candidate)
        seen.add(candidate)
        attempts += 1
    return res
示例#7
0
    def test_12_convertToRdkit(self):
        """Check that ``SmallMol.toRdkitMol`` yields an RDKit Mol with the expected atom count.

        The reference atom count comes from parsing the same SMILES directly with RDKit.
        """
        smimol = SMILE_SMI
        sm = SmallMol(smimol, removeHs=True, fixHs=False)
        mrd = MolFromSmiles(smimol)
        mrd_natom = mrd.GetNumAtoms()

        sm_rd = sm.toRdkitMol(includeConformer=True)
        sm_rd_natoms = sm_rd.GetNumAtoms()

        self.assertIsInstance(
            sm_rd,
            rdkit.Chem.rdchem.Mol,
            msg="The conversion of the SmallMol object into an rdkit Mol "
            "went wrong")
        self.assertEqual(
            sm_rd_natoms,
            mrd_natom,
            msg="Number of atoms differs. The handling and conversion of the "
            "SmallMol object into an rdkit Mol probably went wrong")
示例#8
0
def construct_RGCN_bigraph_from_smiles(smiles):
    """Build a DGL graph with atom and edge-type features from a SMILES string.

    Nodes are the atoms of the molecule, with ``atom_features`` stored under ``ndata["atom"]``. Every
    bond is added as two directed edges carrying the same ``etype_features`` value under
    ``edata["etype"]``. ``edata["normal"]`` holds, per edge, the share of edges with an equal edge type,
    rounded to one decimal.
    """
    g = DGLGraph()

    # Nodes: one per atom, in RDKit atom order.
    mol = MolFromSmiles(smiles)
    g.add_nodes(mol.GetNumAtoms())
    g.ndata["atom"] = torch.tensor(
        [atom_features(atom).tolist() for atom in mol.GetAtoms()])

    # Edges: both directions of every bond, each with the bond's edge type.
    sources, targets, edge_types = [], [], []
    for bond in mol.GetBonds():
        bond_type = etype_features(bond)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        sources += [begin, end]
        targets += [end, begin]
        edge_types += [bond_type, bond_type]
    g.add_edges(sources, targets)

    # Relative frequency of each edge's type among all edges.
    n_edges = len(edge_types)
    frequencies = [
        round(edge_types.count(edge_type) / n_edges, 1)
        for edge_type in edge_types
    ]

    g.edata["etype"] = torch.tensor(edge_types)
    g.edata["normal"] = torch.tensor(frequencies)
    return g
示例#9
0
def processline(t, step, line):
    """Run one benchmark step on ``line`` and add its measured size to the global ``lensum``.

    :param t: object whose ``incr()`` result, when truthy, stops processing of this line
    :param step: code selecting the operation; step 0 only measures the line, every other step
        parses the line with ``MolFromSmiles`` first
    :param line: the input line (a SMILES)
    :return: 1 if ``t.incr()`` was truthy (nothing is processed), otherwise 0
    :raises ValueError: for a step code that is not implemented
    """
    global lensum
    if t.incr():
        return 1

    if step == 0:
        # Baseline: no parsing at all.
        lensum += len(line)
        return 0

    # All remaining steps pay for the parse, even those that do not use the molecule.
    m = MolFromSmiles(line)
    if step == 100:
        lensum += len(line)
    elif step == 105:
        lensum += len(sha256(line).hexdigest())
    elif step in (110, 120):
        # Write the line to the temp file; step 120 additionally forces it to disk.
        with open(tmpname, 'wb+') as out:
            print(line, file=out)
            if step == 120:
                os.fsync(out.fileno())
        lensum += os.stat(tmpname).st_size
    elif step == 210:
        lensum += m.GetNumAtoms()
    elif step == 220:
        lensum += m.GetNumBonds()
    elif step == 300:
        lensum += len(MolToSmiles(m))
    elif step == 400:
        lensum += len(MolToMolBlock(m))
    elif step == 420:
        # Embed coordinates on the hydrogen-added molecule, then strip the hydrogens again.
        embedded = AddHs(m)
        EmbedMolecule(embedded, randomSeed=2020)
        embedded = RemoveHs(embedded)
        embedded.SetProp("_Name", "test")
        lensum += len(MolToMolBlock(embedded))
    elif step == 600:
        lensum += mol2file(m, 'svg')
    elif step == 610:
        lensum += mol2file(m, 'png')
    else:
        raise ValueError("Not implemented step " + str(step))

    return 0
示例#10
0
    def process(self, smiles):  # build the graph
        """Turn a SMILES string into a ``GNNPoint`` (atom count, DGL graph, embedded node vector).

        One node is created per atom. Nine integer per-atom features are stored in ``graph.ndata``;
        their element-wise sum is passed to ``self.embed`` to obtain the node vectors.
        """
        mol = MolFromSmiles(smiles)
        n = mol.GetNumAtoms()
        graph = DGLGraph()
        graph.add_nodes(n)
        graph.add_edges(graph.nodes(), graph.nodes())  # self-loops
        # NOTE(review): this links atoms 1..n-1 to atom 0, although no extra node is added to this
        # graph -- confirm that this is intended.
        graph.add_edges(range(1, n), 0)

        atoms = list(mol.GetAtoms())
        # (ndata key, per-atom extractor), in the order the features are stored and summed.
        feature_table = (
            ("element", lambda a: ATOM[a.GetAtomicNum()]),
            ("explicit", lambda a: a.GetExplicitValence()),
            ("implicit", lambda a: a.GetImplicitValence()),
            ("hybrid", lambda a: HYBRID[a.GetHybridization()]),
            ("hcount", lambda a: a.GetTotalNumHs()),
            ("degree", lambda a: a.GetDegree()),
            ("charge", lambda a: a.GetFormalCharge() + 2),
            ("ring", lambda a: int(a.IsInRing())),
            ("aromatic", lambda a: int(a.GetIsAromatic())),
        )
        for key, extract in feature_table:
            graph.ndata[key] = torch.tensor([extract(a) for a in atoms])

        # Bonds become edges in both directions.
        for bond in mol.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            graph.add_edge(begin, end)
            graph.add_edge(end, begin)

        summed = graph.ndata[feature_table[0][0]]
        for key, _ in feature_table[1:]:
            summed = summed + graph.ndata[key]
        vec = self.embed(summed)
        return GNNPoint(n, graph, vec)
示例#11
0
    def construct_feature_matrices(self, smiles, train=True):
        """ Build atom and bond class arrays for the molecule described by
        the given smiles string.

        Every bond is listed once per atom it is attached to, i.e. as two
        directed edges. A molecule without bonds still gets arrays of
        length one (left at zero) and a warning is logged.

        Parameters
        smiles : the SMILES string to convert
        train : stored on both tokenizers as their ``train`` flag; when true,
            ``self.max_atoms`` and ``self.max_bonds`` are also updated

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of bonds in the molecule (undirected count)
        'bond_indices' : per directed edge, the index of its bond
        'atom' : (n_atom,) array of atom classes
        'bond' : per directed edge, the bond class
        'connectivity' : per directed edge, the (source atom, target atom)
            pair

        """

        self.atom_tokenizer.train = train
        self.bond_tokenizer.train = train

        logger = logging.getLogger(__name__)
        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = mol.GetNumAtoms()
        real_n_bond = mol.GetNumBonds()
        n_directed = 2 * real_n_bond

        # If its an isolated atom, add a self-link
        if n_directed == 0:
            n_directed = 1
            logger.warning(f'Found molecule {smiles} with zero bonds')

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_directed, dtype='int')
        bond_indices = np.zeros(n_directed, dtype='int')
        connectivity = np.zeros((n_directed, 2), dtype='int')

        slot = 0  # next free row in the per-edge arrays
        for atom_pos, atom in enumerate(mol.GetAtoms()):

            # Atom Classes
            atom_feature_matrix[atom_pos] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()

            for bond in atom.GetBonds():
                begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()

                # The bond is reversed when it does not start at this atom
                rev = begin != origin

                # Bond Classes
                bond_feature_matrix[slot] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))

                # Connect edges to original bonds
                bond_indices[slot] = bond.GetIdx()

                # Connectivity: swap the ends of a reversed bond
                connectivity[slot] = (end, begin) if rev else (begin, end)

                slot += 1

        # Track the largest atom and bonds seen
        if train:
            if n_atom > self.max_atoms:
                self.max_atoms = n_atom
            if real_n_bond > self.max_bonds:
                self.max_bonds = real_n_bond

        return {
            'n_atom': n_atom,
            'n_bond': real_n_bond,  # the real number of bonds
            'bond_indices': bond_indices,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }
    def parse_smiles_str(self, smiles_str, id, target=None):
        """Convert a SMILES string into a dictionary of NumPy arrays describing the molecular graph.

        Args:
            smiles_str: SMILES string to parse with RDKit.
            id: identifier of the molecule; converted to an ``np.int64`` array unless it already is
                a NumPy array.
            target: optional target value(s); stored as a ``(1, n_targets)`` float32 array.

        Returns:
            ``None`` if RDKit cannot parse the string, otherwise a dict with the keys
            ``node_features``, ``edge_features``, ``adj_mat``, ``inc_mat``, ``target``, ``id`` and
            ``shape``. ``adj_mat`` and ``inc_mat`` have two rows per bond and are sorted
            lexicographically.
        """
        # Use RDKit to parse SMILES string
        mol = MolFromSmiles(smiles_str)
        if not mol:
            return None

        # Represent Hydrogen atoms explicity (if necessary)
        if self.config['explicit_Hs']:
            mol = Chem.AddHs(mol)

        # Compute number of nodes (atoms) and edges (bonds)
        n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

        # Allocate space for Numpy arrays representing the molecular graph
        node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
        edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
        adj_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
        inc_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

        # Retrieve node (atom) features, if needed
        if self.num_node_features > 0:
            for i, atom in enumerate(mol.GetAtoms()):
                node_features[i] = self.get_node_features(atom)

        # Retrieve edges (bonds)
        for i, bond in enumerate(mol.GetBonds()):
            begin_idx = bond.GetBeginAtom().GetIdx()
            end_idx = bond.GetEndAtom().GetIdx()
            # Fill in the two pairs of indices this edge (bond) contributes to the adjacency matrix
            adj_mat[2*i] = [begin_idx, end_idx]
            adj_mat[2*i+1] = [end_idx, begin_idx]
            # Fill in the two pairs of indices this edge (bond) contributes to the incidence matrix
            inc_mat[2*i] = [begin_idx, i]
            inc_mat[2*i+1] = [end_idx, i]

            # Retrieve edge (bond) features, if needed
            if self.num_edge_features > 0:
                edge_features[i] = self.get_edge_features(bond)

        # Sort the adjacency and incidence matrices lexicographically
        adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
        inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

        # Represent molecular graph as a dictionary
        g = {'node_features': node_features, 'edge_features': edge_features, 'adj_mat': adj_mat, 'inc_mat': inc_mat}

        # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
        if target is not None:
            # Convert scalars to NumPy array
            if not isinstance(target, np.ndarray):
                target = np.array(target, np.float32)

            # Ensure target is of type np.float32
            target = target.astype(np.float32)

            # Flatten targets of rank >= 2
            if target.ndim > 1:
                target = target.flatten()

            # Store target as a (row) 2D NumPy array (for compatibility)
            g['target'] = np.reshape(target, (1, -1))
            n_targets = g['target'].shape[1]
        # If there are no targets, add an empty NumPy array (for compatibility)
        else:
            g['target'] = np.zeros((1, 0), dtype=np.float32)
            n_targets = 0

        # Add ID, making sure it is a NumPy array object with method tobytes().
        # The check must look at the ID itself: testing the target here left the ID unconverted
        # whenever a target was supplied.
        if not isinstance(id, np.ndarray):
            id = np.array(id, np.int64)
        g['id'] = id

        # Finally, add shape information. The last element refers to the number of graphs, and is included for
        # compatibility with batched graphs
        g['shape'] = np.array((n_nodes, n_edges, self.num_node_features, self.num_edge_features, n_targets, 1),
                              np.int64)

        return g
示例#13
0
    def process(self):
        """Build the paired-molecule graph dataset and save it to ``self.processed_paths[0]``.

        Nothing is done when ``Decagon-<datatype>-multi.pt`` already exists in
        ``self.processed_dir``. Otherwise every ``(smiles1, smiles2)`` key of the loaded JSON
        section becomes one graph holding the atoms and bonds of both molecules (atom indices of
        the second molecule are shifted by the size of the first), with the JSON value as label.
        Pairs with an unparsable or empty molecule, or without any bond, are skipped.
        """
        # NOTE(review): the existence check uses a hard-coded file name while saving uses
        # self.processed_paths[0] -- confirm that both refer to the same file.
        if osp.exists(
                os.path.join(self.processed_dir,
                             'Decagon-{}-multi.pt'.format(self.datatype))):
            return

        data_list = []

        # >>> Obtain One-Hot Encoding for Side-Effects
        # JSON keys are string-encoded tuples; turn them back into (smiles1, smiles2).
        json_dict = {
            literal_eval(k): v
            for k, v in self.json_load[self.datatype].items()
        }
        total = len(json_dict)

        for idx, ((smiles1, smiles2), raw_label) in enumerate(json_dict.items()):
            printProgress(idx + 1, total,
                          '{} dataset preparation: '.format(self.datatype),
                          ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)

            if mol1 is None or mol2 is None:
                # Report the SMILES, not the (possibly None) Mol objects, so the pair can be found.
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            ######################################################################
            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()

            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            label = np.array(raw_label)
            features, edges = [], []

            # Atom features of both molecules, first molecule first.
            for atom in mol1.GetAtoms():
                feature = atom_features(atom)
                features.append(feature / sum(feature))  # normalize
            for atom in mol2.GetAtoms():
                feature = atom_features(atom)
                features.append(feature / sum(feature))  # normalize

            # Bonds of the second molecule are shifted behind the atoms of the first one.
            for bond in mol1.GetBonds():
                edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
            for bond in mol2.GetBonds():
                edges.append([
                    bond.GetBeginAtomIdx() + c1_size,
                    bond.GetEndAtomIdx() + c1_size
                ])

            if not edges:
                continue

            # Going through an undirected graph yields every edge in both directions.
            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            GraphSiameseData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(1, -1))
            GraphSiameseData['c1_size'] = torch.LongTensor([c1_size])
            GraphSiameseData['c2_size'] = torch.LongTensor([c2_size])
            data_list.append(GraphSiameseData)
            ###########################################################################

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        # check this function
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
示例#14
0
def structure_standardization(smi: str) -> str:
    """
    Standardize a SMILES string with RDKit's ``rdMolStandardize`` tools.

    The SMILES is parsed into a mol object. Unreadable input and molecules whose atom count
    (``GetNumAtoms()``) is above the configured ``max_num_atoms`` are written to the log and rejected.
    Accepted molecules are reduced to their charge and isotope parent, stripped of stereochemistry
    (only when ``include_stereoinfo`` is ``False``), tautomer-canonicalized, cleaned up with a
    MolVS-style standardization (https://molvs.readthedocs.io/en/latest/index.html) and converted back
    to SMILES. Molecules failing during these steps are written to the log as well.

    Args:
        smi (str): Non-standardized smiles string

    Returns:
        str: standardized smiles string, or ``np.nan`` when the input could not be read, was too large
        or failed the standardization
    """

    settings = ConfigDict.get_parameters()["standardization"]

    atom_limit = settings["max_num_atoms"]
    tautomer_limit = settings["max_num_tautomers"]
    keep_stereo = settings["include_stereoinfo"]

    ## Load new tautomer enumerator/canonicalizer
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(tautomer_limit)
    # Keep stereo information of keto/enol tautomerization
    tautomerizer.SetRemoveSp3Stereo(False)

    def strip_isotopes(mol: Chem.Mol) -> Chem.Mol:
        """
        Isotope parent from MOLVS.
        Return a copy of the molecule in which the isotope label of every atom is reset to 0.

        Args:
            mol (Chem.Mol): input rdkit mol object

        Returns:
            Chem.Mol: isotope parent rdkit mol object
        """
        parent = copy.deepcopy(mol)
        for atom in parent.GetAtoms():
            atom.SetIsotope(0)
        return parent

    def molvs_cleanup(mol: Chem.Mol) -> Chem.Mol:
        """
        MolVS implementation of standardization, applied to a copy of the molecule:
        sanitize, remove hydrogens, disconnect metals, normalize, reionize, reassign stereochemistry.

        Args:
            mol (Chem.Mol): non-standardized rdkit mol object

        Returns:
            Chem.Mol: standardized rdkit mol object
        """
        work = copy.deepcopy(mol)
        Chem.SanitizeMol(work)
        work = Chem.RemoveHs(work)
        work = rdMolStandardize.MetalDisconnector().Disconnect(work)
        work = rdMolStandardize.Normalizer().normalize(work)
        work = rdMolStandardize.Reionizer().reionize(work)
        Chem.AssignStereochemistry(work, force=True, cleanIt=True)
        # TODO: Check this removes symmetric stereocenters
        return work

    mol = MolFromSmiles(smi)  # Read SMILES and convert it to RDKit mol object.
    if mol is None:  # The input SMILES could not be converted into a mol object.
        logging.error("Reading Error, " + smi)
        return np.nan

    fits_limit = mol.GetNumAtoms() <= atom_limit  # size check based on the atom count of the mol object
    if not fits_limit:
        logging.error("Molecule too large, " + smi)
        return np.nan

    try:
        mol = rdMolStandardize.ChargeParent(mol)  # standardize molecules using MolVS and RDKit
        mol = strip_isotopes(mol)
        if keep_stereo is False:
            # Stereo information is only dropped on an explicit False setting.
            Chem.RemoveStereochemistry(mol)
        mol = tautomerizer.Canonicalize(mol)
        return MolToSmiles(molvs_cleanup(mol))  # convert mol object back to SMILES
    except (ValueError, AttributeError) as err:
        # write failed molecules during standardization to log file
        logging.error("Standardization error, " + smi + ", Error Type: " + str(err))
        return np.nan
示例#15
0
def extract_graph(data_path, out_file_path, max_atom_num, label_name=None):
    """Convert a CSV of SMILES strings into padded graph matrices and save them.

    For every usable row, three fixed-size matrices are produced:

    * an integer adjacency matrix of shape ``(max_atom_num, max_atom_num)``
      with 1 for every bonded atom pair (symmetric),
    * a distance matrix of the same shape holding ``get_atom_distance`` between
      the 2D coordinates generated by ``AllChem.Compute2DCoords``,
    * an integer node-attribute matrix of shape
      ``(max_atom_num, num_atom_features())`` filled by ``extract_atom_features``.

    Rows whose SMILES cannot be parsed, or whose molecule has more than
    ``max_atom_num`` atoms, are reported on stdout and skipped.

    :param data_path: Path of the input CSV; it must have a ``SMILES`` column.
    :param out_file_path: Destination passed to ``np.savez_compressed``.
    :param max_atom_num: Padding size and upper bound on atoms per molecule.
    :param label_name: Optional CSV column with labels. When given, the labels
        of the retained rows are stored in the archive under the literal key
        ``label_name`` (not under the column's own name).
    :return: None. The result is written to ``out_file_path``.
    """
    import os
    from rdkit import RDConfig
    from rdkit.Chem import ChemicalFeatures
    fdef_name = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdef_name)

    data_pd = pd.read_csv(data_path)
    smiles_list = data_pd['SMILES'].tolist()

    symbol_candidates = set()
    atom_attribute_dim = num_atom_features()

    node_attribute_matrix_list = []
    # No bond attributes are populated in this function; the (empty) entry is
    # still written so the archive keeps the same set of keys.
    bond_attribute_matrix_list = []
    adjacent_matrix_list = []
    distance_matrix_list = []
    valid_index = []

    # Value ranges observed over the dataset, printed at the end for inspection.
    degree_set = set()
    h_num_set = set()
    implicit_valence_set = set()
    charge_set = set()

    for line_idx, smiles in enumerate(smiles_list):
        smiles = smiles.strip()
        mol = MolFromSmiles(smiles)
        if mol is None:
            # An unparsable SMILES would otherwise crash the whole run below.
            print('Invalid SMILES at row {}: {}'.format(line_idx, smiles))
            continue

        num_atoms = mol.GetNumAtoms()
        # Filter by size before doing any of the expensive per-molecule work.
        if num_atoms > max_atom_num:
            print('Outlier {} has {} atoms'.format(line_idx, num_atoms))
            continue
        valid_index.append(line_idx)

        AllChem.Compute2DCoords(mol)
        conformer = mol.GetConformers()[0]
        feats = factory.GetFeaturesForMol(mol)
        # Sets, not map() iterators: the ids are tested once per atom, and a
        # Python 3 iterator would be exhausted by the first membership test.
        acceptor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Acceptor'
        }
        donor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Donor'
        }

        adjacent_matrix = np.zeros((max_atom_num, max_atom_num), dtype=int)
        distance_matrix = np.zeros((max_atom_num, max_atom_num))
        node_attribute_matrix = np.zeros((max_atom_num, atom_attribute_dim),
                                         dtype=int)

        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            symbol_candidates.add(atom.GetSymbol())
            degree_set.add(atom.GetDegree())
            h_num_set.add(atom.GetTotalNumHs())
            implicit_valence_set.add(atom.GetImplicitValence())
            charge_set.add(atom.GetFormalCharge())
            node_attribute_matrix[atom_idx] = extract_atom_features(
                atom,
                is_acceptor=atom_idx in acceptor_atom_ids,
                is_donor=atom_idx in donor_atom_ids)
        node_attribute_matrix_list.append(node_attribute_matrix)

        # Fetch each position once instead of once per atom pair.
        positions = [conformer.GetAtomPosition(i) for i in range(num_atoms)]
        for idx_i in range(num_atoms):
            for idx_j in range(idx_i + 1, num_atoms):
                distance = get_atom_distance(positions[idx_i],
                                             positions[idx_j])
                distance_matrix[idx_i, idx_j] = distance
                distance_matrix[idx_j, idx_i] = distance
        distance_matrix_list.append(distance_matrix)

        for bond in mol.GetBonds():
            begin_index = bond.GetBeginAtom().GetIdx()
            end_index = bond.GetEndAtom().GetIdx()
            adjacent_matrix[begin_index, end_index] = 1
            adjacent_matrix[end_index, begin_index] = 1
        adjacent_matrix_list.append(adjacent_matrix)

    adjacent_matrix_list = np.asarray(adjacent_matrix_list)
    distance_matrix_list = np.asarray(distance_matrix_list)
    node_attribute_matrix_list = np.asarray(node_attribute_matrix_list)
    bond_attribute_matrix_list = np.asarray(bond_attribute_matrix_list)
    print('adjacent matrix shape\t', adjacent_matrix_list.shape)
    print('distance matrix shape\t', distance_matrix_list.shape)
    print('node attr matrix shape\t', node_attribute_matrix_list.shape)
    print('bond attr matrix shape\t', bond_attribute_matrix_list.shape)
    print(symbol_candidates)
    print('{} valid out of {}'.format(len(valid_index), len(smiles_list)))

    print('degree set:\t', degree_set)
    print('h num set: \t', h_num_set)
    print('implicit valence set: \t', implicit_valence_set)
    print('charge set:\t', charge_set)

    arrays = dict(
        adjacent_matrix_list=adjacent_matrix_list,
        distance_matrix_list=distance_matrix_list,
        node_attribute_matrix_list=node_attribute_matrix_list,
        bond_attribute_matrix_list=bond_attribute_matrix_list)
    if label_name is not None:
        true_labels = np.array(data_pd[label_name].tolist())
        # Explicit integer dtype so an empty selection is still a valid index
        # array (np.array([]) alone would be float and fail to index).
        kept_rows = np.array(valid_index, dtype=int)
        # NOTE: the archive key is the literal string 'label_name'; it is kept
        # as-is so existing readers of these files continue to work.
        arrays['label_name'] = true_labels[kept_rows]
    np.savez_compressed(out_file_path, **arrays)
    print()
示例#16
0
    def process(self):
        """Build the drug-pair graph dataset and save it to ``self.processed_paths[0]``.

        Does nothing when ``Decagon-<datatype>.pt`` already exists in
        ``self.processed_dir``. Otherwise:

        1. The last column of every row in ``self.total_data_dir`` is read as a
           side-effect name and mapped to an integer class index with
           ``LabelEncoder``.
        2. ``Decagon-negative-<datatype>.csv`` and
           ``Decagon-positive-<datatype>.csv`` in ``self.dataset_dir`` are read;
           each row is ``smiles1, smiles2, side_effect``. Negative rows get
           label 0, positive rows label 1.
        3. For each row both molecules are merged into a single graph: node
           features of molecule 1 followed by those of molecule 2 (each feature
           vector divided by its sum), with the bond indices of molecule 2
           shifted by the atom count of molecule 1. ``c1_size``, ``c2_size``
           and a one-hot ``side_effect`` vector are attached to the data
           object.

        Rows are skipped when either SMILES cannot be parsed, either molecule
        has no atoms, or the pair contains no bonds at all.
        """
        processed_file = osp.join(self.processed_dir,
                                  'Decagon-{}.pt'.format(self.datatype))
        if osp.exists(processed_file):
            return

        data_list = []

        # >>> Map every side-effect name to an integer class index.
        with open(self.total_data_dir, 'r', encoding='utf-8') as f:
            target_list = [line[-1] for line in csv.reader(f)]

        label_encoder = LabelEncoder()
        label_encoder.fit(target_list)
        label_list = label_encoder.transform(target_list)
        num_classes = len(label_encoder.classes_)

        # side-effect name -> class index (used below as the one-hot position)
        target_dict = dict(zip(target_list, label_list))

        for label_idx, mode in enumerate(['negative', 'positive']):
            # negative will be 0, positive will be 1
            pair_file = osp.join(
                self.dataset_dir,
                'Decagon-{}-{}.csv'.format(mode, self.datatype))
            pair_list, se_list = [], []
            with open(pair_file, 'r', encoding='utf-8') as f:
                for line in csv.reader(f):
                    se_list.append(line[-1])
                    pair_list.append(line[:-1])
            total = len(pair_list)

            for idx, (smiles_pair, se) in enumerate(zip(pair_list, se_list)):
                smiles1, smiles2 = smiles_pair
                side_effect = [0] * num_classes
                side_effect[target_dict[se]] = 1

                printProgress(idx + 1, total,
                              '{} dataset preparation: '.format(self.datatype),
                              ' ', 2, 50)
                mol1 = MolFromSmiles(smiles1)
                mol2 = MolFromSmiles(smiles2)
                label = [int(label_idx)]

                if mol1 is None or mol2 is None:
                    # Report the SMILES: the Mol objects would only print as
                    # None / an opaque object repr.
                    print("There is a missing drug from the pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                ######################################################################
                # >>> Get pairwise graph G1, G2
                c1_size = mol1.GetNumAtoms()
                c2_size = mol2.GetNumAtoms()

                if c1_size == 0 or c2_size == 0:
                    print("There is a size error from pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                # Node features: atoms of molecule 1 first, then molecule 2.
                features = []
                for atoms in (mol1.GetAtoms(), mol2.GetAtoms()):
                    for atom in atoms:
                        feature = atom_features(atom)
                        features.append(feature / sum(feature))  # normalize

                edges = [[bond.GetBeginAtomIdx(),
                          bond.GetEndAtomIdx()] for bond in mol1.GetBonds()]
                # Offset molecule 2 so its atoms index into the combined
                # feature list after molecule 1's atoms.
                edges.extend([
                    bond.GetBeginAtomIdx() + c1_size,
                    bond.GetEndAtomIdx() + c1_size
                ] for bond in mol2.GetBonds())

                if len(edges) == 0:
                    continue

                # to_directed() yields both (u, v) and (v, u) for every bond.
                G = nx.Graph(edges).to_directed()
                edge_index = [[e1, e2] for e1, e2 in G.edges]

                GraphSiameseData = DATA.Data(
                    x=torch.Tensor(features),
                    edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                    y=torch.Tensor(label).view(-1, 1))
                GraphSiameseData['c1_size'] = torch.LongTensor([c1_size])
                GraphSiameseData['c2_size'] = torch.LongTensor([c2_size])
                GraphSiameseData['side_effect'] = torch.Tensor(
                    side_effect).view(1, -1)
                data_list.append(GraphSiameseData)
                ###########################################################################

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        # check this function
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])