示例#1
0
 def _similarity( self, id0, id1, **kwarg ) :
     """
     Scores a pair of molecules by comparing their total charges.

     Returns 1.0 when both molecules report the same total charge and 0.0 otherwise. Extra keyword arguments are
     accepted but not used.
     """
     # Both molecules are looked up before any charge is queried.
     first, second = KBASE.ask( id0 ), KBASE.ask( id1 )
     charges_differ = first.total_charge() != second.total_charge()
     return 0.0 if charges_differ else 1.0
示例#2
0
def create(basic_graph, mcs_ids, rule, add_attr=True):
    """
    Builds a similarity graph on top of a deep copy of C{basic_graph} and returns it.

    For every common substructure, the similarity score of its two parent molecules is obtained from C{rule}. An edge
    between the two parents is added only when that score is positive.

    @type  basic_graph: graph object
    @param basic_graph: The graph to start from; it is deep-copied before any edge is added
    @type  mcs_ids    : C{list} of C{str}
    @param mcs_ids    : A list of common substructures' IDs
    @type  rule       : C{Rule}
    @param rule       : The rule to determine the similarity score between two structures
    @type  add_attr   : C{bool}
    @param add_attr   : If true, each edge also gets the "slack_similarity", "partial_ring", and "mcs_id" attributes
    """
    graph = copy.deepcopy(basic_graph)
    for mcs_id in mcs_ids:
        parent0, parent1 = mcs.get_parent_ids(mcs_id)
        score = rule.similarity(parent0, parent1, mcs_id=mcs_id)
        if not (score > 0):
            continue

        if not add_attr:
            graph.add_edge(parent0, parent1, similarity=score)
            continue

        # Items that were never deposited fall back to neutral defaults.
        try:
            ring_flag = int(KBASE.ask(mcs_id, "partial_ring"))
        except LookupError:
            ring_flag = 0
        try:
            slack_score = KBASE.ask(mcs_id, "slack_similarity")
        except LookupError:
            slack_score = 0.0

        edge_attrs = {
            "similarity": score,
            "slack_similarity": slack_score,
            "partial_ring": ring_flag,
            "mcs_id": mcs_id,
        }
        graph.add_edge(parent0, parent1, **edge_attrs)
    return graph
示例#3
0
    def _similarity( self, id0, id1, **kwarg ) :
        """
        Returns 1.0 when the pair passes the minimum-size test and 0.0 otherwise.

        The pair passes when the "num_heavy_atoms" item deposited for C{kwarg["mcs_id"]} is at least the threshold,
        or when either molecule has fewer heavy atoms than the threshold plus 3.
        """
        mcs_size  = KBASE.ask( kwarg["mcs_id"], "num_heavy_atoms" )
        mol0_size = len( KBASE.ask( id0 ).heavy_atoms() )
        mol1_size = len( KBASE.ask( id1 ).heavy_atoms() )

        if (mcs_size >= self._threshold) :
            return 1.0
        if (mol0_size < self._threshold + 3) :
            return 1.0
        return float( mol1_size < self._threshold + 3 )
示例#4
0
def annotate_nodes_with_smiles(g):
    """
    Stores a "SMILES" attribute on every node of C{g}.

    The string is taken from the C{KBASE} item "SMILES" of the node's ID. If that lookup fails, it is generated from
    the molecule itself and deposited into the C{KBASE} for later use.
    """
    for node_id in g.nodes():
        try:
            smiles_text = KBASE.ask(node_id, "SMILES")
        except LookupError:
            # Not deposited yet: generate it once and remember it.
            molecule = KBASE.ask(node_id)
            smiles_text = molecule.smiles()
            KBASE.deposit_extra(node_id, "SMILES", smiles_text)
        g.node[node_id]["SMILES"] = smiles_text
示例#5
0
 def read_n_files( filenames ) :
     """
     Reads every file named in `filenames' and deposits the structures found into the `KBASE'.

     Each structure is deposited under its own ID, the file it came from is recorded as the extra item "filename",
     and the structure's ID is then set to the key returned by the `KBASE'. Returns the list of those keys.
     """
     keys = []
     for fname in filenames :
         for struc_obj in read_file( fname ) :
             key = KBASE.deposit( struc_obj.id(), struc_obj )
             KBASE.deposit_extra( key, "filename", fname )
             struc_obj.set_id( key )
             keys.append( key )
     return keys
示例#6
0
 def read_n_files( filenames ) :
     """
     Loads the structures of all files listed in `filenames' into the `KBASE' and returns their keys.

     The format of each file will be determined from the file's extension name. Every structure read is deposited,
     tagged with the extra item "filename" (the name of its source file), and given the deposited key as its ID.
     """
     deposited_keys = []
     for source_name in filenames :
         structures = read_file( source_name )
         for structure in structures :
             deposited_key = KBASE.deposit( structure.id(), structure )
             KBASE.deposit_extra( deposited_key, "filename", source_name )
             structure.set_id( deposited_key )
             deposited_keys.append( deposited_key )
     return deposited_keys
示例#7
0
def annotate_edges_with_smiles(g):
    """
    Stores a "SMILES" attribute on every edge of C{g} that carries an "mcs_id" attribute.

    The string is taken from the C{KBASE} item "SMILES" of the MCS. If that lookup fails, it is generated from the
    MCS structure. Either way the string is deposited into the C{KBASE} under "SMILES". An edge is skipped when a
    C{KeyError} is raised while processing it (for example when it has no "mcs_id" attribute).
    """
    for node0, node1, attrs in g.edges(data=True):
        try:
            mcs_id = attrs["mcs_id"]
            try:
                smiles_text = KBASE.ask(mcs_id, "SMILES")
            except LookupError:
                smiles_text = mcs.get_struc(mcs_id).smiles()
            KBASE.deposit_extra(mcs_id, "SMILES", smiles_text)
            g[node0][node1]["SMILES"] = smiles_text
        except KeyError:
            continue
示例#8
0
def annotate_nodes_with_title(g):
    """
    Stores two attributes on every node of C{g}: "title", the title of the molecule deposited under the node's ID,
    and "label", the first 7 characters of that ID.
    """
    for node_id in g.nodes():
        molecule_title = KBASE.ask(node_id).title()
        short_label = node_id[:7]
        g.node[node_id]["title"] = molecule_title
        g.node[node_id]["label"] = short_label
示例#9
0
def add_mcs_id( mcs_id_list, graph ):
    """
    Annotates every edge of `graph' with the ID of the MCS of its two end-point molecules.

    The ID is the SHA-1 hex digest of the title "mcs@<title0>..<title1>", built from the titles of the two
    molecules. If that ID is not in `mcs_id_list', the ID built with the two titles swapped is tried instead.

    @type  mcs_id_list: C{list} of C{str}
    @param mcs_id_list: IDs of the known common substructures
    @param       graph: Graph whose edges get an "mcs_id" attribute; it is modified in place

    @raise SystemExit: If neither ordering yields a known ID. The exit now carries an error message (and therefore
                       a non-zero status) instead of terminating silently with status 0.
    """
    # Built once so that each membership test does not rescan the whole list.
    known_ids = set( mcs_id_list )
    for edge in graph.edges(data = True):
        mol0_id = edge[0]
        mol1_id = edge[1]
        name0 = KBASE.ask( mol0_id ).title()
        name1 = KBASE.ask( mol1_id ).title()
        mcs_id = hashlib.sha1( "mcs@%s..%s" % (name0, name1,) ).hexdigest()
        if mcs_id not in known_ids:
            # The pair may have been deposited with the molecules in the opposite order.
            mcs_id = hashlib.sha1( "mcs@%s..%s" % (name1, name0,) ).hexdigest()
            if mcs_id not in known_ids:
                sys.exit( "ERROR: No MCS was found for the pair '%s' and '%s'." % (name0, name1,) )
        graph.add_edge(mol0_id, mol1_id, mcs_id = mcs_id)
示例#10
0
文件: mcs.py 项目: biocad/leadoptmap
def get_parent_ids(mcs_id):
    """
    Looks up the "mcs-parents" item of a common substructure in the C{KBASE}.

    @type  mcs_id: C{str}
    @param mcs_id: ID of the common substructure
    @return      : The pair of IDs of the common substructure's parents
    """
    parent_ids = KBASE.ask(mcs_id, "mcs-parents")
    return parent_ids
示例#11
0
 def similarity( self, id0, id1, **kwarg ) :
     """
     Returns the similarity score of the pair, with scores below the cutoff replaced by 0.0.

     The score is read from the "similarity" item deposited for C{kwarg["mcs_id"]}. If a C{KeyError} is raised
     while doing so, the score is computed by C{Rule.similarity} instead.
     """
     try :
         score = KBASE.ask( kwarg["mcs_id"], "similarity" )
     except KeyError :
         score = Rule.similarity( self, id0, id1, **kwarg )
     return 0.0 if (score < self._cutoff) else score
示例#12
0
def annotate_edges_with_matches(g):
    """
    Copies the MCS match information of each edge from the C{KBASE} onto the edge.

    For every edge carrying an "mcs_id" attribute, three attributes are set: "original-mcs" (a dict that maps each
    end-point ID to the result of that molecule's C{smarts()} for its matched atoms), "trimmed-mcs", and
    "layout_mcs". An edge is skipped when a C{KeyError} is raised while processing it.
    """
    for node0, node1, attrs in g.edges(data=True):
        try:
            mcs_id = attrs["mcs_id"]
            molecule0 = KBASE.ask(node0)
            molecule1 = KBASE.ask(node1)
            matches = KBASE.ask(mcs_id, "mcs-matches")
            trimmed = KBASE.ask(mcs_id, "trimmed-mcs")
            layout = KBASE.ask(mcs_id, "layout_mcs")

            smarts0 = molecule0.smarts(matches[node0])
            smarts1 = molecule1.smarts(matches[node1])

            edge_attrs = g[node0][node1]
            edge_attrs["original-mcs"] = {node0: smarts0, node1: smarts1}
            edge_attrs["trimmed-mcs"] = trimmed
            edge_attrs["layout_mcs"] = layout
        except KeyError:
            continue
示例#13
0
    def _similarity( self, id0, id1, **kwarg ) :
        """
        Scores the pair with C{similarity.by_heavy_atom_count} applied to the two molecules and their MCS.

        As a side effect, the numbers of heavy and non-heavy atoms of the MCS are deposited into the C{KBASE} as
        the items "num_heavy_atoms" and "num_light_atoms" of C{kwarg["mcs_id"]}.
        """
        substructure_id = kwarg["mcs_id"]
        substructure    = mcs.get_struc( substructure_id )
        molecule0       = KBASE.ask( id0 )
        molecule1       = KBASE.ask( id1 )

        heavy_count = len( substructure.heavy_atoms() )
        light_count = len( substructure.atom ) - heavy_count

        for item_name, count in (("num_heavy_atoms", heavy_count), ("num_light_atoms", light_count)) :
            KBASE.deposit_extra( substructure_id, item_name, count )

        return similarity.by_heavy_atom_count( molecule0, molecule1, substructure )
示例#14
0
def matrix ( mols, mcs_ids, rule ):
    """
    Builds a symmetric matrix of similarity scores between the given molecules.

    @param    mols: Molecules to cover; each must provide C{title()} and C{id()}, and its ID must have a "filename"
                    item in the C{KBASE}
    @type  mcs_ids: C{list} of C{str}
    @param mcs_ids: IDs of the common substructures whose parents are scored
    @param    rule: The rule used to compute the similarity score of each pair

    @return: A tuple C{(title_list, id_list, filename_vs_title, scores)}: the distinct titles and IDs in input order,
             a dict mapping each file's base name to the molecule title, and a C{numpy} array indexed like
             C{title_list}. The diagonal is 1.0 and pairs without a score stay 0.0.
    """
    import numpy
    id_list = []
    title_list = []
    id_vs_title = {}
    filename_vs_title = {}
    title_vs_simi = {}
    for mol in mols:
        # Records the title of each molecule by ID and by the base name of its source file.
        title = mol.title()
        mol_id = mol.id()
        filename = os.path.basename( KBASE.ask( mol_id, "filename" ) )
        if mol_id not in id_list:
            id_list.append( mol_id )
        if title not in title_list:
            title_list.append( title )
        id_vs_title[mol_id] = title
        filename_vs_title[filename] = title
    for mcs_id in mcs_ids:
        # Maps each pair of parent titles to the pair's similarity score.
        id0, id1 = mcs.get_parent_ids( mcs_id )
        simi = rule.similarity( id0, id1, mcs_id = mcs_id )
        title_vs_simi[(id_vs_title[id0], id_vs_title[id1])] = simi
    # Fills the score matrix.
    # NOTE(review): only the (earlier title, later title) orientation is looked up; a pair stored in the opposite
    # orientation keeps a score of 0 -- confirm that the MCS pairs always follow the order of `mols'.
    size = len( title_list )
    scores = numpy.zeros( (size, size,) )
    for i in range( size ):
        scores[i, i] = 1.0
        for j in range( i + 1, size ):
            pair = (title_list[i], title_list[j])
            # `in' replaces dict.has_key, which exists only in Python 2.
            if pair in title_vs_simi:
                simi = title_vs_simi[pair]
                scores[i, j] = simi
                scores[j, i] = simi

    return (title_list, id_list, filename_vs_title, scores)
示例#15
0
文件: mcs.py 项目: biocad/leadoptmap
def get_struc(mcs_id):
    """
    Builds and returns the MCS structure for the given MCS ID.

    The structure is extracted from the first parent molecule using the atoms matched in that molecule. Each atom
    property record of the result, counted from 1, gets a "mapped_index" entry holding the corresponding matched
    atom index of the second parent. The result's title is the object deposited under C{mcs_id}, and its ID is
    C{mcs_id}.

    @type  mcs_id: C{str}
    @param mcs_id: ID of the common substructure in the C{KBASE}
    """
    title = KBASE.ask(mcs_id)
    id0, id1 = KBASE.ask(mcs_id, "mcs-parents")
    mcs_matches = KBASE.ask(mcs_id, "mcs-matches")
    atom_match0, atom_match1 = mcs_matches[id0], mcs_matches[id1]

    # Only the first parent is needed; the second one contributes its matched atom indices only.
    mcs_struc = KBASE.ask(id0).extract(atom_match0)

    for i, mapped_index in enumerate(atom_match1, start=1):
        mcs_struc.atom_prop[i]["mapped_index"] = mapped_index

    mcs_struc.set_title(title)
    mcs_struc.set_id(mcs_id)
    return mcs_struc
示例#16
0
文件: mcs.py 项目: biocad/leadoptmap
    def deposit_to_kbase(id0, id1, atom_match0, atom_match1):
        """
        Deposits the information of a MCS into the C{KBASE} and returns the ID under which it was deposited.

        The title "mcs@<title0>..<title1>" is deposited under its SHA-1 hex digest. The parents' IDs and the matched
        atom indices are attached as the extra items "mcs-parents" and "mcs-matches".

        @type          id0: C{str}
        @param         id0: ID of the first (reference) molecule in the C{KBASE}
        @type          id1: C{str}
        @param         id1: ID of the second molecule in the C{KBASE}
        @type  atom_match0: C{list} of C{int}
        @param atom_match0: Atom indices of the matched atoms in the first molecule
        @type  atom_match1: C{list} of C{int}
        @param atom_match1: Atom indices of the matched atoms in the second molecule, in the same order as
                            C{atom_match0}
        """
        name0 = KBASE.ask(id0).title()
        name1 = KBASE.ask(id1).title()
        mcs_title = "mcs@%s..%s" % (
            name0,
            name1,
        )
        mcs_id = hashlib.sha1(mcs_title).hexdigest()
        mcs_id = KBASE.deposit(mcs_id, mcs_title)

        # Sorts the two lists according to the ascending order of atom indices of the first list. A key function is
        # used instead of the Python-2-only `cmp' argument; the sort is stable, so the resulting order is the same.
        atom_match0, atom_match1 = zip(*sorted(zip(atom_match0, atom_match1),
                                               key=lambda pair: pair[0]))
        atom_match0, atom_match1 = list(atom_match0), list(atom_match1)

        KBASE.deposit_extra(mcs_id, "mcs-parents", (
            id0,
            id1,
        ))
        KBASE.deposit_extra(mcs_id, "mcs-matches", {
            id0: atom_match0,
            id1: atom_match1,
        })

        return mcs_id
示例#17
0
def main(molid_list, opt, args):
    """
    Obtains a similarity graph of the molecules and optionally writes simulation input files for its edges.

    The graph is either loaded from the pickle file C{opt.graph}, or built from an MCS search over the molecules
    C{molid_list[opt.receptor:]}. In the latter case the graph is annotated, pickled to C{opt.output + ".pkl"} and,
    if the C{graphviz} module can be imported, written to C{opt.output + ".dot"}.

    @type  molid_list: C{list} of C{str}'s
    @param molid_list: A list of molecule IDs in the C{KBASE}. The first C{opt.receptor} IDs are handled as
                       receptors, the remaining ones are the molecules to compare.
    @param        opt: Options object. The attributes read here are C{graph}, C{receptor}, C{build}, C{output},
                       C{siminp}, C{siminp_type}, and C{save}; the object is also passed to the MCS engine.
    @param       args: Positional arguments. Only C{args[0]} is used, as the directory of the optional
                       "knownCompounds" file (when C{opt.build} is set).
    """
    #load mols files
    # NOTE(review): the file object handed to pickle.load is never closed explicitly.
    if (opt.graph):
        g = pickle.load(open(opt.graph))
    else:
        mols = []
        for id in molid_list[opt.receptor:]:
            mols.append(KBASE.ask(id))
        #choose mcs search engine and rules
        # NOTE(review): if `struc.infrastructure' has neither of the two values below, `mcs_engine' and both rules
        # stay undefined and the search that follows fails with a NameError -- confirm this cannot happen.
        if (struc.infrastructure == "schrodinger"):
            mcs_engine = mcs.SchrodMcs(1)
            basic_rule = rule.Mcs(
                rule.EqualCharge(),
                rule.TrimMcs(True, rule.MinimumNumberOfAtom()))
            slack_rule = rule.Mcs(
                rule.EqualCharge(),
                rule.TrimMcs(False, rule.MinimumNumberOfAtom()))
        elif (struc.infrastructure == "oechem"):
            mcs_engine = mcs.OeMcs()
            basic_rule = rule.Mcs(
                rule.EqualCharge(),
                rule.TrimMcs_oe(True, rule.MinimumNumberOfAtom()))
            slack_rule = rule.Mcs(
                rule.EqualCharge(),
                rule.TrimMcs_oe(False, rule.MinimumNumberOfAtom()))

        logging.info("MCS searching...")
        mcs_ids = mcs_engine.search_all(mols, opt)
        logging.info("MCS searching... Done")

        #build score matrix from mcs search enable Jonathan's graph planning algorithm
        if (opt.build):
            import build
            # One score matrix per rule: the basic rule gives the strict scores, the slack rule the unstrict ones.
            (title_list, id_list, filename_vs_title,
             strict_score) = build.matrix(mols, mcs_ids, basic_rule)
            (title_list, id_list, filename_vs_title,
             unstrict_score) = build.matrix(mols, mcs_ids, slack_rule)
            import GraphGenerator4 as gg4
            knownCompoundsList = []
            #load the name list of compounds with known experimental value if there is any
            try:
                with open(args[0] + "/knownCompounds") as kcFile:
                    knownCompoundsList = kcFile.readlines()
                # Each line of the file is translated into a molecule title through `filename_vs_title'.
                knownCompoundsList = [
                    filename_vs_title[name.strip()]
                    for name in knownCompoundsList
                ]
            except IOError:
                print "No Known Compounds Listed"

            gg = gg4.GraphGenerator4(strict_score, unstrict_score, 0.05, 6,
                                     title_list, id_list, knownCompoundsList)
            g = gg.getGraphObject()
            build.add_mcs_id(mcs_ids, g)
            c = networkx.connected_component_subgraphs(g)

        # Gets graph (`g') and clusters (`c') using schrodinger's graph planning algorithm
        else:
            logging.info("Creating graph...")
            g, c = graph.gen_graph(mcs_ids,
                                   basic_rule,
                                   slack_rule,
                                   simi_cutoff=0.05,
                                   max_csize=100,
                                   num_c2c=1)
        graph.annotate_nodes_with_smiles(g)
        graph.annotate_nodes_with_title(g)
        graph.annotate_edges_with_smiles(g)
        graph.annotate_edges_with_hexcode(g)
        graph.annotate_edges_with_matches(g)
        logging.info("Creating graph... Done")

        logging.debug(
            "DEBUG: %d clusters (counted as the connected components in the graph):"
            % len(c))
        # Orders the clusters by ascending size (the positional argument is a Python 2 comparison function).
        c.sort(lambda x, y: len(x) - len(y))
        for i, e in enumerate(c):
            logging.debug("DEBUG: cluster #%d, %d structures:" % (
                i,
                len(e),
            ))
            titles = [KBASE.ask(id).title() for id in e]
            titles.sort()
            for t in titles:
                logging.debug("DEBUG:  %s" % t)
        # store graph for reusing and analysing
        # NOTE(review): the pickle file is opened in text mode ("w") -- confirm this is intended.
        pkl_fname = opt.output + ".pkl"
        pkl_fh = open(pkl_fname, "w")
        pickle.dump(g, pkl_fh)
        pkl_fh.close()

        try:
            #use pygraphviz for graph layout
            import graphviz

            ag = networkx.to_agraph(g)
            ag.node_attr["fixedsize"] = True
            ag.edge_attr["penwidth"] = 2.0

            # Similarities are rescaled so that the largest one maps to a saturation of 1.
            # NOTE(review): max() raises on a graph without edges, and the division fails if the largest
            # similarity is 0 -- confirm that neither case can occur here.
            simi = [float(e.attr["similarity"]) for e in ag.edges()]
            scale = 1.0 / max(simi)
            for e in ag.edges_iter():
                try:
                    partial_ring = int(e.attr["partial_ring"])
                except (ValueError, TypeError):
                    partial_ring = 0
                saturation = float(e.attr["similarity"]) * scale
                # Clamps the saturation to the range [0, 1].
                saturation = 0.0 if (saturation < 0) else (1.0 if (
                    saturation > 1) else saturation)
                e.attr["color"] = "0.8396,%f,0.8" % saturation
                e.attr["weight"] = saturation
                del e.attr["label"]
                # Edges with a very low relative similarity or a partial ring are drawn dashed.
                if (saturation < 0.01 or partial_ring):
                    e.attr["style"] = "dashed"
            ag.write(opt.output + ".dot")
        except ImportError:
            logging.warn(
                "WARNING: pygraphviz is not installed. Cannot write a .dot output file."
            )

    edges = g.edges(data=True)
    logging.info("%d edges in total" % len(edges))

    #generate schrodinger FEP input files
    if (opt.siminp):
        if (opt.siminp_type == "gro"):
            raise NotImplementedError(
                "Support for writing Gromacs input files is not yet implemented."
            )
        if (opt.siminp_type == "mae"):
            import schrodinger.application.desmond.fep_mapping as dfm

            tmp_mae_fname = mcs.tempfile_basename + "_siminp.mae"
            receptor_mol = []

            # The first `opt.receptor' molecules are tagged as receptors and collected.
            if (opt.receptor):
                for e in range(opt.receptor):
                    mol = KBASE.ask(molid_list[e])
                    mol._struc.property["s_leadoptmap_moltype"] = "receptor"
                    receptor_mol.append(mol)

            # One output file per edge, named after the first 7 characters of the two molecule IDs.
            for id0, id1, attr in edges:
                mol0 = KBASE.ask(id0)
                mol1 = KBASE.ask(id1)
                out_fname = "%s_%s_%s.mae" % (
                    opt.siminp,
                    id0[:7],
                    id1[:7],
                )
                mol0._struc.property["s_leadoptmap_moltype"] = "ligand"
                mol1._struc.property["s_leadoptmap_moltype"] = "%s:%s" % (
                    id0,
                    id1,
                )

                # Both molecules go into one temporary file: the first call uses mode "w", the second mode "a".
                mol0.write(tmp_mae_fname, mode="w")
                mol1.write(tmp_mae_fname, mode="a")

                try:
                    overwrite = True
                    data = dfm.get_atom_mapping_data(tmp_mae_fname, atomtype=3)
                    if (opt.receptor):
                        # The receptors are written to the output file first, so it must not be overwritten.
                        overwrite = False
                        receptor_mol[0].write(out_fname, mode="w")
                        for i in range(1, opt.receptor):
                            receptor_mol[i].write(out_fname, mode="a")
                    dfm.write_fepsubst_to_file(data,
                                               out_fname,
                                               overwrite=overwrite)
                except (
                        RuntimeError,
                        NameError,
                ):
                    logging.warn(
                        "WARNING: Failed to write the input files for '%s' and '%s'."
                        % (
                            mol0,
                            mol1,
                        ))
    # Unless `opt.save' is set, removes every file whose name starts with the temporary-file base name.
    if (not opt.save):
        tmp_fnames = glob.glob(mcs.tempfile_basename + "*")
        for fname in tmp_fnames:
            os.remove(fname)
示例#18
0
            for e in strucs:                                  
                id = KBASE.deposit( e.id(), e )               
                KBASE.deposit_extra(id, "filename", (fn))
                e.set_id( id )                                
                strucid.append( id )                          
        return strucid


    infrastructure = "oechem"

except ImportError, e :
    pass



# Stops the program with exit status 1 when `infrastructure' is still None at this point.
if (infrastructure is None) :
    print "ERROR: Need either Schrodinger's or OEChem's infrastructure to run, but none is found."
    # `sys' is imported right before its only use in this block.
    import sys
    sys.exit( 1 )

    


# Manual check, run only when this file is executed as a script: reads two .mol2 files (expected in the current
# directory) and prints the title and the number of heavy atoms of the first two structures deposited.
if ("__main__" == __name__) :
    filenames = ["xfer3.10.mol2", "xfer3.11.mol2",]
    id_list = read_n_files( filenames )
    mol0 = KBASE.ask( id_list[0] )
    print mol0.title(), len( mol0.heavy_atoms() )
    mol1 = KBASE.ask( id_list[1] )
    print mol1.title(), len( mol1.heavy_atoms() )
示例#19
0
文件: mcs.py 项目: biocad/leadoptmap
        def search_all(self, mols, opt):
            """
            Runs (or reuses) a pairwise MCS search and deposits every match into the C{KBASE}.

            If C{opt.mcs} is false, the molecules are written to a temporary .mae file, the external command
            C{self._cmd} is run on it, and the .csv file it produces is parsed. Otherwise C{opt.mcs} is taken as
            the name of a .csv file to parse directly.

            @param mols: Molecules to search; each must provide C{title()}, C{set_title()}, C{id()}, and C{write()}
            @param  opt: Options object; only its C{mcs} attribute is read here

            @return: A list with the ID returned by C{deposit_to_kbase} for each data row of the .csv file

            @raise RuntimeError: If the external command exits with a non-zero status
            """
            if (not opt.mcs):
                mae_fname = tempfile_basename + ".mae"
                out_fname = tempfile_basename + ".csv"
                log_fname = tempfile_basename + ".log"

                if (os.path.isfile(mae_fname)):
                    os.remove(mae_fname)

                for mol in mols:
                    # The molecule is written with its ID as title; the original title is restored even when
                    # writing fails.
                    title = mol.title()
                    mol.set_title(mol.id())
                    try:
                        mol.write(mae_fname)
                    finally:
                        mol.set_title(title)
                cmd = [
                    self._cmd,
                    "-imae",
                    mae_fname,
                    "-opw",
                    out_fname,
                    "-atomtype",
                    str(self._typing),
                    "-nobreakring",
                ]
                # The log file receives both stdout and stderr of the command and is closed as soon as the
                # command has finished (it used to be left open).
                with open(log_fname, "w") as log_fh:
                    mcs_proc = subprocess.Popen(cmd,
                                                stderr=subprocess.STDOUT,
                                                stdout=log_fh)
                    mcs_proc.communicate()
                val = mcs_proc.returncode

                if (val == 17):
                    raise RuntimeError(
                        "Used a MCS feature that requires Schrodinger's CANVAS_ELEMENTS license."
                    )
                if (val != 0):
                    msg = "CanvasMCS exited prematurely. This could be because the input molecules were too dissimilar" \
                          " or too numerous, or because the chosen atom-typing scheme was too general."
                    # The output file may not exist after a failure; its absence must not hide the real error.
                    if (os.path.isfile(out_fname)):
                        with open(out_fname) as fh:
                            msg += "\n\n"
                            msg += fh.read()
                    raise RuntimeError(msg)
            else:
                logging.debug(
                    "DEBUG: Reuse previous MCS searching results: '%s'." %
                    opt.mcs)
                out_fname = opt.mcs

            with open(out_fname, "r") as fh:
                import csv

                # The first line is the header and is skipped.
                lines = fh.readlines()[1:]
                mcs_match = []
                for tokens in csv.reader(lines):
                    mcs_match.append(
                        McsMatch(tokens[1], tokens[3], tokens[11], tokens[14],
                                 tokens[9], tokens[12]))

            ret = []
            for m in mcs_match:
                # The matched atoms are stored as comma-separated integers.
                atom_match0 = [int(i) for i in m.mcs_atom0.split(',')]
                atom_match1 = [int(i) for i in m.mcs_atom1.split(',')]

                ret.append(
                    self.deposit_to_kbase(m.mol0_id, m.mol1_id, atom_match0,
                                          atom_match1))

            return ret
示例#20
0
文件: mcs.py 项目: biocad/leadoptmap
    mol1 = KBASE.ask(id1)
    mcs = KBASE.ask(id0).extract(atom_match0)

    for i, e in enumerate(atom_match1, start=1):
        mcs.atom_prop[i]["mapped_index"] = e

    mcs.set_title(title)
    mcs.set_id(mcs_id)
    return mcs


# Manual check, run only when this file is executed as a script: searches the MCS of two molecules read from
# .mol2 files (expected in the current directory) and writes a parent molecule and the MCS to "out.mae".
if ("__main__" == __name__):
    filenames = [
        "xfer3.11.mol2",
        "xfer3.12.mol2",
    ]
    id_list = struc.read_n_files(filenames)
    mol0 = KBASE.ask(id_list[0])
    mol1 = KBASE.ask(id_list[1])
    mcs = SchrodMcs(3)
    mcs_id = mcs.search(mol0, mol1)[0]
    # The first of the two parents' IDs.
    mol_id = KBASE.ask(mcs_id, "mcs-parents")[0]
    # NOTE(review): presumably the object deposited under an MCS ID is its title string, in which case `[0]' yields
    # a single character that has no write() method -- confirm what KBASE.ask returns here.
    mcs_struc = KBASE.ask(mcs_id)[0]
    mol_struc = KBASE.ask(mol_id)

    # Removes the output file of a previous run before writing.
    out_fname = "out.mae"
    if (os.path.isfile(out_fname)):
        os.remove(out_fname)
    mol_struc.write(out_fname)
    mcs_struc.write(out_fname)
示例#21
0
        num_heavy_atoms = len( mcs0.heavy_atoms() )
        num_light_atoms = len( mcs0.atom ) - num_heavy_atoms

        KBASE.deposit_extra( mcs_id, "num_heavy_atoms", num_heavy_atoms )
        KBASE.deposit_extra( mcs_id, "num_light_atoms", num_light_atoms )

        return similarity.exp_delta( 2 * (orig_num_heavy_atoms - num_heavy_atoms), 0 )



# Example of a complex rule: A combination of a few simple rules (in this case, they are Mcs, MinimumNumberOfAtom, and Cutoff).
# cutoff_simi = Cutoff( 0.2, Mcs( MinimumNumberOfAtom( 4 ) ) )



if ("__main__" == __name__) :
    import struc
    from mcs   import SchrodMcs
    from kbase import KBASE
    
    filenames = ["xfer3.11.mol2", "xfer3.12.mol2",]
    id_list   = struc.read_n_files( filenames )
    mol0      = KBASE.ask( id_list[0] )
    mol1      = KBASE.ask( id_list[1] )
    mcs       = SchrodMcs( 3 )
    mcs_id    = mcs.search( mol0, mol1 )[0]
    mol_id    = KBASE.ask( mcs_id, "mcs_parents" )

    print MCS.similarity( mol_id[0], mol_id[1] )
    
示例#22
0
def gen_graph(mcs_ids, basic_rule, slack_rule, simi_cutoff, max_csize,
              num_c2c):
    """
    Generates and returns a graph according to the requirements.

    The similarity scores of all MCS pairs are computed with both rules and deposited into the C{KBASE} as the items
    "similarity" and "slack_similarity". A complete graph is built from the positive scores, edges below the cutoff
    are removed, the subgraph of each resulting cluster is optimized, and finally the clusters are connected to each
    other through their best boundary edges.

    @type      mcs_ids: C{list} of C{str}
    @param     mcs_ids: A list of ids of the maximum substructures in C{KBASE}
    @type   basic_rule: C{rule.Rule}
    @param  basic_rule: The rule to determine the similarity score between two structures
    @type   slack_rule: C{rule.Rule}
    @param  slack_rule: The rule to determine the slack similarity score between two structures
    @type  simi_cutoff: C{float}
    @param simi_cutoff: Cutoff of similarity scores. Values less than the cutoff are considered as 0.
    @type    max_csize: C{int}
    @param   max_csize: Maximum cluster size
    @type      num_c2c: C{int}
    @param     num_c2c: Number of cluster-to-cluster edges. If a cluster has fewer boundary edges than this number,
                        all of them are used.

    @return: A tuple C{(graph, clusters)}
    """
    basic_graph = networkx.Graph()
    all_ids = set()
    # At DEBUG level the basic similarity scores are also written to the file "simiscore", one per line.
    fh = open("simiscore", "w") if (logging.getLogger().getEffectiveLevel()
                                    == logging.DEBUG) else None
    logging.info("  Calculating similarity scores...")
    try:
        for id in mcs_ids:
            #get the id for mol0 and mol1
            id0, id1 = mcs.get_parent_ids(id)
            #calculate the similarity scores for molecule pair
            simi = basic_rule.similarity(id0, id1, mcs_id=id)
            slack_simi = slack_rule.similarity(id0, id1, mcs_id=id)
            KBASE.deposit_extra(id, "similarity", simi)
            KBASE.deposit_extra(id, "slack_similarity", slack_simi)
            all_ids.add(id0)
            all_ids.add(id1)
            if (fh):
                print >> fh, simi
    finally:
        # The score file used to be left open; it is now closed even when a rule raises.
        if (fh):
            fh.close()
    logging.info("  Calculating similarity scores... Done")
    basic_graph.add_nodes_from(all_ids)
    #create a complete graph
    complete = create(basic_graph, mcs_ids, rule.Cutoff(0))
    #delete connections with scores lower than simi_cutoff
    desired = cutoff_graph(complete, simi_cutoff)
    #get molecule clusters, biggest first
    clusters = sorted(networkx.connected_components(desired),
                      cmp=lambda x, y: len(y) - len(x))

    logging.info("  Original number of clusters: %d" % len(clusters))
    #break down big clusters
    num_big_clusters = 0
    for i, c in enumerate(clusters):
        logging.info("    size of cluster #%02d: %d" % (i, len(c)), )
        num_big_clusters += (len(c) > max_csize)

    # The `and False' keeps this reclustering branch disabled.
    if (num_big_clusters and False):
        logging.info(
            "  %d cluster(s) are too big. Break them into smaller ones. Reclustering..."
            % num_big_clusters)
        new_clusters = []
        for c in clusters:
            if (max_csize < len(c)):
                new_clusters += break_cluster(desired.subgraph(c), simi_cutoff,
                                              max_csize)
            else:
                new_clusters.append(c)
        clusters = new_clusters
        logging.info("  Reclustering... Done")
    clusters = sorted(clusters, cmp=lambda x, y: len(y) - len(x))

    n = len(clusters)
    logging.info("  %d clusters in total" % n)
    for i, c in enumerate(clusters):
        logging.info("    size of cluster #%02d: %d" % (
            i,
            len(c),
        ))

    # Optimizes the subgraphs.
    logging.info("  Optimizing the subgraph of each cluster...")
    new_desired = networkx.Graph()
    for e in clusters:
        sg = optimize_graph(complete.subgraph(e), desired.subgraph(e), "trim",
                            simi_cutoff)
        new_desired = networkx.union(new_desired, sg)
    desired = new_desired
    logging.info("  Optimizing the subgraph of each cluster... Done")

    # Connects the clusters.
    unconnected_clusters = set(range(n))
    while (unconnected_clusters and n > 1):
        c2c_edges = []
        cluster_index = unconnected_clusters.pop()
        this_cluster = clusters[cluster_index]
        other_clusters = copy.copy(clusters)
        other_clusters.remove(this_cluster)
        # Collects the edges of the complete graph that leave this cluster.
        for e in other_clusters:
            c2c_edges.extend(networkx.edge_boundary(complete, this_cluster, e))
        if (len(c2c_edges) == 0):
            logging.warn("WARNING: Cannot connect cluster #%d with others." %
                         (cluster_index, ))
            logging.warn(
                "         If there should be connections, consider to adjust the rules to"
            )
            logging.warn(
                "         reduce 0-similarity assignments or loosen the MCS conditions."
            )
            continue
        c2c_edges.sort(lambda x, y: cmp_edge(complete, x, y))
        connected_clusters = set()
        # Takes edges from the end of the sorted list. The count is limited to the number of available edges so
        # that a large `num_c2c' no longer raises an IndexError.
        for k in range(-1, -min(num_c2c, len(c2c_edges)) - 1, -1):
            edge = c2c_edges[k]
            node0 = edge[0]
            node1 = edge[1]
            simi = complete[node0][node1]["similarity"]
            mcs_id = complete[node0][node1]["mcs_id"]
            desired.add_edge(node0,
                             node1,
                             similarity=simi,
                             boundary=True,
                             mcs_id=mcs_id)
            logging.warn("  boundary similarity = %f between '%s' and '%s'" % (
                simi,
                KBASE.ask(node0),
                KBASE.ask(node1),
            ))
            # Every still-unconnected cluster touched by the new edge counts as connected from now on.
            for e in unconnected_clusters:
                if (node0 in clusters[e] or node1 in clusters[e]):
                    connected_clusters.add(e)
        unconnected_clusters -= connected_clusters

    return desired, clusters
示例#23
0
    def _similarity( self, id0, id1, **kwarg ) :
        """
        Trims a copy of the MCS and scores the pair by the number of heavy atoms lost in the trimming.

        The trimming deletes chiral atoms (or, for a chiral ring atom, one of its non-ring neighbours), applies
        C{self._delete_broken_ring}, and drops every fragment except the first one reported by C{molecules()}.
        The items "trimmed-mcs", "partial_ring", "layout_mcs", "num_heavy_atoms", and "num_light_atoms" of
        C{kwarg["mcs_id"]} are deposited into the C{KBASE}.

        Returns C{similarity.exp_delta} of twice the number of heavy atoms removed.
        """
        # Uses the first common substructure.
        mcs_id = kwarg["mcs_id"]
        mcs0   = mcs.get_struc( mcs_id ).copy()
        mol0   = KBASE.ask( id0 )
        mol1   = KBASE.ask( id1 )

        orig_num_heavy_atoms = len( mcs0.heavy_atoms() )
        # Deletes chiral atoms.
        chiral_atoms = mcs0.chiral_atoms()
        ring_atoms   = mcs0.  ring_atoms()
        # Highest index first -- presumably so that a deletion does not shift the indices still to be visited.
        chiral_atoms.sort( reverse = True )
        for atom_index in chiral_atoms :
            if (atom_index in ring_atoms) :
                # If the chiral atom is in a ring, delete atoms attached to it but not in ring.
                bonded_atoms = set( mcs0.bonded_atoms( atom_index ) ) - ring_atoms
                if (bonded_atoms) :
                    # Tries each non-ring neighbour on a copy and keeps the one whose deletion leaves the most atoms.
                    i = 0
                    n = 0
                    for atom in bonded_atoms :
                        cp = mcs0.copy()
                        cp.delete_atom( atom )
                        m = len( cp.atom )
                        if (m > n) :
                            i = atom
                            n = m
                    mcs0.delete_atom( i )
                    mcs0 = mcs0.copy()
                else :
                    logging.warn( "WARNING: Cannot delete chiral atom #%d in structure: %s" % (atom_index, mcs0.title(),) )
            else :
                # If the chiral atom is not a ring atom, we simply delete it.
                mcs0.delete_atom( atom_index )
                mcs0 = mcs0.copy()

        # If the deletion results in multiple unconnected fragments, we keep only the biggest one.
        # NOTE(review): this relies on molecules() listing the biggest fragment first -- confirm.
        mcs0 = mcs0.copy()
        atoms_to_delete = []
        for e in mcs0.molecules()[1:] :
            atoms_to_delete.extend( e )
        mcs0.delete_atom( atoms_to_delete )
        mcs0 = mcs0.copy()
        partial_ring         = self._delete_broken_ring( mol0, mol1, mcs0 )
        # The fragment clean-up is repeated after the broken rings have been handled.
        mcs0 = mcs0.copy()
        atoms_to_delete_2 = []
        for e in mcs0.molecules()[1:] :
            atoms_to_delete_2.extend( e )
        mcs0.delete_atom( atoms_to_delete_2 )
        # This differs from Schrodinger's method: since OpenEye cannot save MCS search results as SMARTS strings, a SMILES string is used instead for the later layout.
        smiles0 = mcs0.smiles()
        # Arbitrarily sets smiles1 = smiles0 (the difference between matching mol0 to mol1 and matching mol1 to mol0 is not considered).
        smiles1 = smiles0

        KBASE.deposit_extra( mcs_id, "trimmed-mcs",  {id0:smiles0, id1:smiles1,} )
        KBASE.deposit_extra( mcs_id, "partial_ring", len( partial_ring ) )
        KBASE.deposit_extra( mcs_id, "layout_mcs", smiles0 )

        num_heavy_atoms = len( mcs0.heavy_atoms() )
        num_light_atoms = len( mcs0.atom ) - num_heavy_atoms

        KBASE.deposit_extra( mcs_id, "num_heavy_atoms", num_heavy_atoms )
        KBASE.deposit_extra( mcs_id, "num_light_atoms", num_light_atoms )

        return similarity.exp_delta( 2 * (orig_num_heavy_atoms - num_heavy_atoms), 0 )
示例#24
0
    def _similarity( self, id0, id1, **kwarg ) :
        """
        Trims a copy of the MCS and scores the pair by the number of heavy atoms lost in the trimming.

        The trimming applies C{self._delete_broken_ring}, deletes chiral atoms (or, for a chiral ring atom, one of
        its non-ring neighbours), and drops every fragment except the first one reported by C{molecules()}. The
        items "trimmed-mcs", "partial_ring", "layout_mcs", "num_heavy_atoms", and "num_light_atoms" of
        C{kwarg["mcs_id"]} are deposited into the C{KBASE}.

        Returns C{similarity.exp_delta} of twice the number of heavy atoms removed.
        """
        # Uses the first common substructure.
        mcs_id = kwarg["mcs_id"]
        mcs0   = mcs.get_struc( mcs_id ).copy()
        mol0   = KBASE.ask( id0 )
        mol1   = KBASE.ask( id1 )

        orig_num_heavy_atoms = len( mcs0.heavy_atoms() )
        partial_ring         = self._delete_broken_ring( mol0, mol1, mcs0 )
            
        # Deletes chiral atoms.
        chiral_atoms = mcs0.chiral_atoms()
        ring_atoms   = mcs0.  ring_atoms()
        # Highest index first -- presumably so that a deletion does not shift the indices still to be visited.
        chiral_atoms.sort( reverse = True )
        for atom_index in chiral_atoms :
            if (atom_index in ring_atoms) :
                # For a chiral ring atom, one of the atoms bonded to it but not in a ring is deleted instead.
                bonded_atoms = set( mcs0.bonded_atoms( atom_index ) ) - ring_atoms
                if (bonded_atoms) :
                    # Tries each non-ring neighbour on a copy and keeps the one whose deletion leaves the most atoms.
                    i = 0
                    n = 0
                    for atom in bonded_atoms :
                        cp = mcs0.copy()
                        cp.delete_atom( atom )
                        m = len( cp.atom )
                        if (m > n) :
                            i = atom
                            n = m
                    mcs0.delete_atom( i )
            else :
                # If the chiral atom is not a ring atom, we simply delete it.
                mcs0.delete_atom( atom_index )

        # If the deletion results in multiple unconnected fragments, we keep only the biggest one.
        # NOTE(review): this relies on molecules() listing the biggest fragment first -- confirm.
        atoms_to_delete = []
        for e in mcs0.molecules()[1:] :
            atoms_to_delete.extend( e )
        mcs0.delete_atom( atoms_to_delete )

        # Gets the SMARTS for the trimmed structure.
        # Each remaining atom is translated back to its index in mol0 ("orig_index") and in mol1 ("mapped_index").
        atom_list0 = []
        atom_list1 = []
        for e in mcs0.atom_prop[1:] :
            atom_list0.append( e[  "orig_index"] )
            atom_list1.append( e["mapped_index"] )

        smarts0 = mol0.smarts( atom_list0 )
        # A ValueError for the second molecule is tolerated; its SMARTS is then left empty.
        try :
            smarts1 = mol1.smarts( atom_list1 )
        except ValueError :
            smarts1 = ""
        
        KBASE.deposit_extra( mcs_id, "trimmed-mcs",  {id0:smarts0, id1:smarts1,} )
        KBASE.deposit_extra( mcs_id, "partial_ring", len( partial_ring ) )
        KBASE.deposit_extra( mcs_id, "layout_mcs", mcs0.smiles() )

        num_heavy_atoms = len( mcs0.heavy_atoms() )
        num_light_atoms = len( mcs0.atom ) - num_heavy_atoms
        
        KBASE.deposit_extra( mcs_id, "num_heavy_atoms", num_heavy_atoms )
        KBASE.deposit_extra( mcs_id, "num_light_atoms", num_light_atoms )

        return similarity.exp_delta( 2 * (orig_num_heavy_atoms - num_heavy_atoms), 0 )