# Example #1
0
def find_intermediary_codingblockgraph_with_tinyexon(graphL,graphR,input=None,similaritymatrix=None,min_bitscore_ratio=0.3):
    """
    Search for a tinyexon graph that fits in between ``graphL`` and ``graphR``.

    Outline of what the code does:
      1. For every organism whose first Orf differs between both graphs,
         tinyexons bridging the left and right Orf are predicted. Tinyexons
         located on the left or right Orf itself are ignored.
      2. Every remaining tinyexon is translated and scanned against the
         Orfs of all other organisms (in both graphs); the best-ranked
         similarities are converted into pacbporfs and stored per
         organism pair.
      3. The pacbps of both input graphs are added, a collection graph is
         built and split into fully connected subgraphs; subgraphs that
         equal ``graphL`` or ``graphR`` are dropped.
      4. Surviving candidates are kept only when the consensus gene
         structure of ``[graphL, graphR, candidate]`` has exactly three
         blocks with the candidate in the middle.

    Arguments:
      graphL, graphR      -- the left and right graph
                             (NOTE(review): presumably CodingBlockGraphs;
                             confirm against callers)
      input               -- dict keyed by organism; ``input[org]['orfs'].orfs``
                             is read as the list of Orfs of that organism.
                             The name shadows the builtin but is part of the
                             public signature.
      similaritymatrix    -- object offering
                             ``scansbjct(query, sbjct, min_bitscore_ratio=...)``;
                             the ``None`` default fails with an
                             AttributeError as soon as a tinyexon is scanned.
      min_bitscore_ratio  -- passed through to ``scansbjct``.

    Returns an empty list when nothing is found, otherwise a list holding
    exactly one graph (the first one when several candidates qualify).
    An empty list is also returned immediately when an organism with
    differing Orfs has no donors in ``graphL`` or no acceptors in ``graphR``.
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    if input is None:
        input = {}

    def _tinyexon_query(tinyexon):
        # Protein query of a tinyexon: the DNA between its acceptor and
        # donor position, translated in the frame given by the acceptor phase.
        query_dna = tinyexon.orf.inputgenomicsequence[tinyexon.acceptor.pos:tinyexon.donor.pos]
        return dna2proteinbyframe(query_dna, (3 - tinyexon.acceptor.phase) % 3)

    tinyexon_crossdata = {}
    tinyexons_seen = 0

    for org in graphL.organism_set():
        theOrfL = graphL.get_orfs_of_graph(organism=org)[0]
        theOrfR = graphR.get_orfs_of_graph(organism=org)[0]
        # continue if identical orfs
        # TODO: maybe check as well for spanning ranges?
        # TODO: in theory, a tinyorf can exist on this orf as well...
        if theOrfL.id == theOrfR.id:
            continue
        # NOTE(review): both ranges are never used below; the calls are kept
        # because their side effects (if any) are not visible from here.
        msrL = graphL.minimal_spanning_range(organism=org)
        msrR = graphR.minimal_spanning_range(organism=org)

        # donors on orfL and acceptors on orfR are both required
        if (org not in graphL._splicedonorgraph.organism_set() or
                org not in graphR._spliceacceptorgraph.organism_set()):
            # not donors and acceptors on both orfs!
            return []

        eligable_donors    = graphL._splicedonorgraph.get_organism_objects(org)
        eligable_acceptors = graphR._spliceacceptorgraph.get_organism_objects(org)
        orflist            = input[org]['orfs'].orfs

        # search for tinyexons
        tinyexonlist = bridge_two_pacbporfs_by_tinyexon(theOrfL,theOrfR,
                preceding_donor_sites= eligable_donors,
                subsequent_acceptor_sites= eligable_acceptors,
                orflist=orflist
                )

        # NOTE(review): the double-tinyexon result is not used anywhere in
        # this function; the call is kept to leave runtime behaviour as is.
        doubletinyexons = bridge_two_pacbporfs_by_two_tinyexons(theOrfL,theOrfR,
                preceding_donor_sites= eligable_donors,
                subsequent_acceptor_sites= eligable_acceptors,
                orflist=orflist
                )

        # Group the tinyexons by the orf they are located on.
        # For now, IGNORE tinyexons on the left and right orf itself!
        orf2tinyexons = {}
        for tinyexon in tinyexonlist:
            if tinyexon.orf.id in ( theOrfL.id, theOrfR.id ):
                continue
            orf2tinyexons.setdefault(tinyexon.orf.id, []).append(tinyexon)

        # loop over the tinyexon lists of the unique orfs
        for telist in orf2tinyexons.values():
            # loop over all other organisms (except the organism itself)
            for otherorg in graphL.organism_set():
                if otherorg == org:
                    continue
                # crossdata is keyed on the sorted organism pair; remember
                # whether sorting swapped query (org) and sbjct (otherorg)
                orgkey = tuple(sorted([org,otherorg]))
                _orgkey_reversed = orgkey != (org,otherorg)
                # the entry is created even when no similarity is found
                accepted_pacbs = tinyexon_crossdata.setdefault(
                        orgkey, {'accepted_pacbs': {} })['accepted_pacbs']
                orfL = graphL.get_orfs_of_graph(organism=otherorg)[0]
                orfR = graphR.get_orfs_of_graph(organism=otherorg)[0]

                # main lists for all similarities on this orf
                similaritiesL = []
                similaritiesR = []
                for tinyexon in telist:
                    query = _tinyexon_query(tinyexon)
                    _similaritiesL = similaritymatrix.scansbjct(query,orfL.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    if orfL.id == orfR.id:
                        # same sbjct orf in both graphs: scan it only once
                        _similaritiesR = []
                    else:
                        _similaritiesR = similaritymatrix.scansbjct(query,orfR.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    # Store the tinyexon next to each similarity so it can be
                    # traced back; there can be >1 tinyexon on the same orf.
                    similaritiesL.extend([ (_data,tinyexon) for _data in _similaritiesL ])
                    similaritiesR.extend([ (_data,tinyexon) for _data in _similaritiesR ])

                # re-order the similarities because they can contain data
                # from 2 tinyexons (on the same orf)
                similaritiesL = _order_similarities(similaritiesL)
                similaritiesR = _order_similarities(similaritiesR)

                # Now make pacbporfs of only the BEST similarities, first for
                # the left sbjct orf and then for the right one.
                TAKE_BEST_SIMILARITIES = 2
                for similarities, sbjct_orf in ( (similaritiesL,orfL), (similaritiesR,orfR) ):
                    for ( ( ratio, sbjct_pos, q_seq, match, s_seq, bitscore), tinyexon ) in similarities[0:TAKE_BEST_SIMILARITIES]:
                        sbjct_aa_pos = sbjct_pos+sbjct_orf.protein_startPY
                        # acceptor.pos is used as a slice index above, so it
                        # is an integer; floor division states the intent.
                        query_aa_pos = tinyexon.acceptor.pos // 3
                        # Length of the query of THIS tinyexon. Reading the
                        # variable left over from the scanning loop would
                        # give the length of the last tinyexon in telist.
                        query_length = len(_tinyexon_query(tinyexon))
                        if _orgkey_reversed:
                            pacbpkey = (bitscore, query_length, sbjct_orf.id, tinyexon.orf.id )
                            pacbp    = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
                            pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,sbjct_orf,tinyexon.orf)
                        else:
                            pacbpkey = (bitscore, query_length, tinyexon.orf.id, sbjct_orf.id )
                            pacbp    = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
                            pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,sbjct_orf)

                        tinyexons_seen+=1
                        pacbporf.extend_pacbporf_after_stops()
                        accepted_pacbs[pacbpkey] = pacbporf

    if not tinyexons_seen:
        return []

    # add the nodes/edges from the input graphs as well (left first)
    for inputgraph in (graphL,graphR):
        for ( pacbpkey,n1,n2 ) in inputgraph.pacbps.keys():
            # first element of each node is used as the organism identifier
            orgkey   = ( n1[0], n2[0] )
            pacbpdna = inputgraph.pacbps[( pacbpkey,n1,n2 )]
            tinyexon_crossdata.setdefault(
                    orgkey, {'accepted_pacbs': {} })['accepted_pacbs'][pacbpkey] = pacbpdna

    # make graph, remove to low connected nodes and split in complete graphs
    tinyexonsg = create_pacbpcollectiongraph_from_crossdata(tinyexon_crossdata)
    tinyexonsg.remove_low_connectivity_nodes(min_connectivity=2)
    splitted_tinyexongraphs = tinyexonsg.find_fully_connected_subgraphs(
                edges=4,
                max_missing_edges=0 )

    # now remove the graphs that are graphL and graphR ;-)
    graphLnodes = graphL.get_nodes()
    graphLnodes.sort()
    graphRnodes = graphR.get_nodes()
    graphRnodes.sort()
    for inputnodes in (graphLnodes,graphRnodes):
        for pos, subgraph in enumerate(splitted_tinyexongraphs):
            tegNodes = subgraph.get_nodes()
            tegNodes.sort()
            if tegNodes == inputnodes:
                # at most one match is removed; stop right after the pop
                splitted_tinyexongraphs.pop(pos)
                break

    # make ListOfCodingBlockGraphs
    cbgList = ListOfCodingBlockGraphs(splitted_tinyexongraphs,
            input=input,
            crossdata=tinyexon_crossdata
            )

    # do all what is needed to create K(s) CBGs of these
    cbgList.harvest_pacbps_from_crossdata()
    cbgList.split_codingblock_on_alternatives_in_pacbps_dict(
            filter_for_msr=True,
            filter_for_omsr=True,
            )
    # remove non-compatible CBGs
    cbgList.remove_incompatible_cbgs(
            minimal_node_count=len(input),
            minimal_edge_count=len(tinyexon_crossdata),
            filter_for_msr=True,
            filter_for_omsr=True
            )

    # get list of accepted TinyExonCbgs
    accepted_tegs = cbgList.codingblockgraphs

    # and update weights by minimal spanning region
    for teg in accepted_tegs:
        teg.update_edge_weights_by_minimal_spanning_range()

    # and check if they can be placed IN BETWEEN graphL and graphR
    # TODO some prints
    final_graphs_with_tinyexons = []
    for teg in accepted_tegs:

        test_codingblock_order, rejected_graphs = make_consensus_genestructure_from_compatible_pacb_graphs(
                [graphL,graphR,teg],None)

        # single-argument print(): same output under the Python 2 print
        # statement and the Python 3 print function
        print(" ".join([ str(item) for item in (
                "checking hypo TEG:", teg.get_ordered_nodes(), "of",
                len(accepted_tegs), "len of join", len(test_codingblock_order) ) ]))

        # NOTE(review): the weight is not used; call kept as in the original.
        wt_after  = teg.total_weight()
        if len(test_codingblock_order) == 3:
            teg_nodes = teg.get_nodes()
            teg_nodes.sort()
            middle = test_codingblock_order[1]
            middle_nodes = middle.get_nodes()
            middle_nodes.sort()
            if middle_nodes == teg_nodes:
                # yahoo, this one is 100% okay!
                final_graphs_with_tinyexons.append( teg )

    if len(final_graphs_with_tinyexons)==1:
        print(final_graphs_with_tinyexons[0].get_nodes())
        return final_graphs_with_tinyexons
    elif len(final_graphs_with_tinyexons)>1:
        print("### WARNING!!!! more than 1 tinyexon graph is found.")
        print("### WARNING!!!! however, only single one is returned.")
        print("### WARNING!!!! returning >1 can cause errors...")
        return [ final_graphs_with_tinyexons[0] ]
    else:
        return []