예제 #1
0
def detect_and_remove_synteny(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """ """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)

    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge( orgQ, orgS, wt = pacbporf.bitscore )
    
            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ,orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS,org,wt = pacbporf.bitscore)
                    else:
                        gtg.add_edge( orgS, org, wt = pacbporf.bitscore )
    
        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)
    
        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue
    
        # store to already tested organism subcombinations
        observed_organism_subcombis.append( gtg.get_ordered_nodes() )
    
        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={},
                    blastmatrix=PCG._blastmatrix)
        for (pacbpkey,nodeQ,nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
    
        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################
    
        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes()
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return syntenicPCG
    return syntenicPCG
예제 #2
0
def detect_and_remove_synteny(inwpcbgs,
                              PCG,
                              GENE_IDENTIFIER_SET,
                              verbose=True):
    """ """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)

    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge(orgQ, orgS, wt=pacbporf.bitscore)

            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ, orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS, org, wt=pacbporf.bitscore)
                    else:
                        gtg.add_edge(orgS, org, wt=pacbporf.bitscore)

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue

        # store to already tested organism subcombinations
        observed_organism_subcombis.append(gtg.get_ordered_nodes())

        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={},
                                      blastmatrix=PCG._blastmatrix)
        for (pacbpkey, nodeQ, nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ, orfQid), (orgS, orfSid) = nodeQ, nodeS
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes(
                )
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},
                                       blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return syntenicPCG
    return syntenicPCG
예제 #3
0
def create_genetree_from_crossdata(crossdata):
    """
    (Try to) create the GeneTreeGraph from the crossdata dictionary

    @type  crossdata: dict
    @param crossdata: crossdata <dict data structure>; keys are
                      (geneA,geneB) tuples, each value holds an
                      'accepted_pacbs' dict of PacbPs

    @rtype:  GeneTreeGraph
    @return: estimated! GeneTreeGraph with, for each gene pair, an edge
             weighted by the identityscore of the PacbP stored under the
             highest key of 'accepted_pacbs'; an EMPTY GeneTreeGraph as
             soon as a single gene pair has no accepted PacbPs

    @attention: NOTE(review): the 'accepted_pacbs' keys presumably order on
                bitscore first, which makes the highest key the highest
                scoring PacbP - confirm against the code that fills crossdata
    """
    # fill GeneTreeGraph with nodes
    GTG = GeneTreeGraph()
    for (geneA, geneB) in crossdata:
        if geneA not in GTG.get_nodes():
            GTG.add_node(geneA)
        if geneB not in GTG.get_nodes():
            GTG.add_node(geneB)

    # fill GeneTreeGraph with edges
    for (geneA, geneB) in crossdata:
        accepted_pacbs = crossdata[(geneA, geneB)]['accepted_pacbs']
        if not accepted_pacbs:
            # no PacbPs at all between these 2 genes, meaning the GTG can
            # not be created yet: set it back to an empty graph and stop
            GTG = GeneTreeGraph()
            break
        # max() selects the same key as sorting descending and taking the
        # first one, without the need for a sortable list of keys
        bestpacbp = accepted_pacbs[max(accepted_pacbs)]
        # store this edge to the GeneTreeGraph
        GTG.add_edge(geneA, geneB, bestpacbp.identityscore)

    # return the GeneTreeGraph
    return GTG