def find_intermediary_codingblockgraph_with_tinyexon(graphL,graphR,input=None,similaritymatrix=None,min_bitscore_ratio=0.3):
    """
    Search for a tinyexon graph that can be placed in between graphL and graphR

    For every organism of graphL whose (first) Orf differs between graphL
    and graphR, tinyexons that bridge both Orfs are searched with
    bridge_two_pacbporfs_by_tinyexon(), using the donor objects of graphL
    and the acceptor objects of graphR. Tinyexons located on a third Orf are
    translated and scanned (similaritymatrix.scansbjct) against the Orfs of
    all other organisms in graphL and graphR. The best similarities are
    converted into PacbPORFs and, together with the pacbps of graphL and
    graphR, assembled into candidate graphs. Only candidates that end up as
    the middle element when ordered together with graphL and graphR are
    accepted.

    @type  graphL: graph object (presumably a CodingBlockGraph - TODO confirm)
    @param graphL: left/upstream graph; must have a _splicedonorgraph

    @type  graphR: graph object (presumably a CodingBlockGraph - TODO confirm)
    @param graphR: right/downstream graph; must have a _spliceacceptorgraph

    @type  input: dict
    @param input: per organism data; input[org]['orfs'].orfs is used as the
                  list of Orfs to search tinyexons on. None means empty dict.

    @type  similaritymatrix: object with a scansbjct() method
    @param similaritymatrix: used to align tinyexon protein sequences

    @type  min_bitscore_ratio: float
    @param min_bitscore_ratio: passed unchanged to similaritymatrix.scansbjct

    @rtype:  list
    @return: empty list or a list with exactly 1 accepted tinyexon graph

    @attention: returns [] as soon as a single organism with distinct Orfs
                lacks donors in graphL or acceptors in graphR
    @attention: prints progress and warning messages to STDOUT
    """
    # ``input`` used to default to a shared mutable dict; use a fresh one
    if input is None:
        input = {}

    # number of best similarities (per subject Orf) converted into PacbPORFs
    TAKE_BEST_SIMILARITIES = 2

    tinyexon_crossdata = {}
    tinyexons_seen = 0

    for org in graphL.organism_set():
        theOrfL = graphL.get_orfs_of_graph(organism=org)[0]
        theOrfR = graphR.get_orfs_of_graph(organism=org)[0]

        # continue if identical orfs
        # TODO: maybe check as well for spanning ranges?
        # TODO: in theory, a tinyorf can exist on this orf as well...
        if theOrfL.id == theOrfR.id:
            continue

        # donors on orfL and acceptors on orfR are both required;
        # if either is missing for this organism, give up completely
        if (org not in graphL._splicedonorgraph.organism_set() or
                org not in graphR._spliceacceptorgraph.organism_set()):
            return []

        eligable_donors = graphL._splicedonorgraph.get_organism_objects(org)
        eligable_acceptors = graphR._spliceacceptorgraph.get_organism_objects(org)
        orflist = input[org]['orfs'].orfs

        # search for tinyexons
        tinyexonlist = bridge_two_pacbporfs_by_tinyexon(theOrfL,theOrfR,
                preceding_donor_sites=eligable_donors,
                subsequent_acceptor_sites=eligable_acceptors,
                orflist=orflist
                )
        # NOTE(review): the double tinyexon result is not used anywhere in
        # this function; the call is retained because it cannot be verified
        # from here that it is free of side effects.
        doubletinyexons = bridge_two_pacbporfs_by_two_tinyexons(theOrfL,theOrfR,
                preceding_donor_sites=eligable_donors,
                subsequent_acceptor_sites=eligable_acceptors,
                orflist=orflist
                )

        # Order the tinyexons with respect to which orf they are located on.
        # for now, IGNORE tinyexons on the left and right orf themselves!
        orf2tinyexons = {}
        for tinyexon in tinyexonlist:
            if tinyexon.orf.id in (theOrfL.id, theOrfR.id):
                continue
            orf2tinyexons.setdefault(tinyexon.orf.id, []).append(tinyexon)

        # loop over the tinyexons grouped per orf on which they are predicted
        for telist in orf2tinyexons.values():
            # loop over all other organisms (except the organism itself)
            for otherorg in graphL.organism_set():
                if otherorg == org:
                    continue

                # crossdata keys are ordered organism pairs; remember if
                # query (org) and sbjct (otherorg) had to be swapped
                orgkey = tuple(sorted([org, otherorg]))
                orgkey_reversed = orgkey != (org, otherorg)
                if orgkey not in tinyexon_crossdata:
                    tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }

                orfL = graphL.get_orfs_of_graph(organism=otherorg)[0]
                orfR = graphR.get_orfs_of_graph(organism=otherorg)[0]

                # main lists for all similarities on this orfid
                similaritiesL = []
                similaritiesR = []
                # protein query length per tinyexon (keyed on object id);
                # needed because there can be >1 tinyexon on the same orf
                query_lengths = {}
                for tinyexon in telist:
                    # get protein query sequence from tinyorf
                    query_dna = tinyexon.orf.inputgenomicsequence[tinyexon.acceptor.pos:tinyexon.donor.pos]
                    query = dna2proteinbyframe(query_dna, (3 - tinyexon.acceptor.phase) % 3 )
                    query_lengths[id(tinyexon)] = len(query)

                    hitsL = similaritymatrix.scansbjct(query,orfL.protein_sequence,
                            min_bitscore_ratio=min_bitscore_ratio)
                    if orfL.id == orfR.id:
                        # identical sbjct orf; do not scan it twice
                        hitsR = []
                    else:
                        hitsR = similaritymatrix.scansbjct(query,orfR.protein_sequence,
                                min_bitscore_ratio=min_bitscore_ratio)

                    # Append to all similarities on this orfid; append the
                    # tinyexon itself too in order to place the similarity
                    # back to a specific tinyexon.
                    similaritiesL.extend([ (hit,tinyexon) for hit in hitsL ])
                    similaritiesR.extend([ (hit,tinyexon) for hit in hitsR ])

                # re-order the similarities because they can contain data
                # from 2 tinyexons (on the same orf)
                # ordering is performed on ``ratio * bitscore``
                # this - kind of - evalue calculation favours longer matches
                similaritiesL = _order_similarities(similaritiesL)
                similaritiesR = _order_similarities(similaritiesR)

                # Now make pacbporfs of only the BEST tinyexon(s) and their
                # similarity on the other organism
                tinyexons_seen += _store_best_tinyexon_pacbporfs(
                        similaritiesL[0:TAKE_BEST_SIMILARITIES], orfL,
                        orgkey, orgkey_reversed, query_lengths,
                        tinyexon_crossdata)
                tinyexons_seen += _store_best_tinyexon_pacbporfs(
                        similaritiesR[0:TAKE_BEST_SIMILARITIES], orfR,
                        orgkey, orgkey_reversed, query_lengths,
                        tinyexon_crossdata)

    if not tinyexons_seen:
        return []

    # add the nodes/edges from the input graphs as well
    _add_graph_pacbps_to_tinyexon_crossdata(graphL, tinyexon_crossdata)
    _add_graph_pacbps_to_tinyexon_crossdata(graphR, tinyexon_crossdata)

    # make graph, remove too low connected nodes and split in complete graphs
    tinyexonsg = create_pacbpcollectiongraph_from_crossdata(tinyexon_crossdata)
    tinyexonsg.remove_low_connectivity_nodes(min_connectivity=2)
    splitted_tinyexongraphs = tinyexonsg.find_fully_connected_subgraphs(
            edges=4, max_missing_edges=0 )

    # now remove the graphs that are graphL and graphR ;-)
    graphLnodes = graphL.get_nodes()
    graphLnodes.sort()
    graphRnodes = graphR.get_nodes()
    graphRnodes.sort()
    _pop_first_graph_with_nodes(splitted_tinyexongraphs, graphLnodes)
    _pop_first_graph_with_nodes(splitted_tinyexongraphs, graphRnodes)

    # make ListOfCodingBlockGraphs
    cbgList = ListOfCodingBlockGraphs(splitted_tinyexongraphs,
            input=input,
            crossdata=tinyexon_crossdata
            )

    # do all what is needed to create K(s) CBGs of these
    cbgList.harvest_pacbps_from_crossdata()
    cbgList.split_codingblock_on_alternatives_in_pacbps_dict(
            filter_for_msr=True,
            filter_for_omsr=True,
            )
    # remove non-compatible CBGs
    cbgList.remove_incompatible_cbgs(
            minimal_node_count=len(input),
            minimal_edge_count=len(tinyexon_crossdata),
            filter_for_msr=True,
            filter_for_omsr=True
            )

    # get list of accepted TinyExonCbgs
    accepted_tegs = cbgList.codingblockgraphs

    # and update weights by minimal spanning region
    for teg in accepted_tegs:
        teg.update_edge_weights_by_minimal_spanning_range()

    # and check if they can be placed IN BETWEEN graphL and graphR
    # TODO some prints
    final_graphs_with_tinyexons = []
    for teg in accepted_tegs:
        test_codingblock_order, _rejected_graphs = make_consensus_genestructure_from_compatible_pacb_graphs(
                [graphL,graphR,teg],None)
        # single-argument print(): same output on Python 2 and Python 3
        print("checking hypo TEG: %s of %s len of join %s" % (
                teg.get_ordered_nodes(),
                len(accepted_tegs),
                len(test_codingblock_order),
                ))

        # only a join of exactly 3 graphs can have the teg in the middle
        if len(test_codingblock_order) != 3:
            continue
        teg_nodes = teg.get_nodes()
        teg_nodes.sort()
        middle_nodes = test_codingblock_order[1].get_nodes()
        middle_nodes.sort()
        if middle_nodes == teg_nodes:
            # yahoo, this one is 100% okay!
            final_graphs_with_tinyexons.append( teg )

    if len(final_graphs_with_tinyexons) == 1:
        print(final_graphs_with_tinyexons[0].get_nodes())
        return final_graphs_with_tinyexons
    elif len(final_graphs_with_tinyexons) > 1:
        print("### WARNING!!!! more than 1 tinyexon graph is found.")
        print("### WARNING!!!! however, only single one is returned.")
        print("### WARNING!!!! returning >1 can cause errors...")
        return [ final_graphs_with_tinyexons[0] ]
    else:
        return []


def _store_best_tinyexon_pacbporfs(similarities,sbjct_orf,orgkey,orgkey_reversed,query_lengths,tinyexon_crossdata):
    """
    Convert (similarity,tinyexon) tuples into PacbPORFs and store them in
    tinyexon_crossdata[orgkey]['accepted_pacbs']; return the number stored.

    @attention: tinyexon_crossdata[orgkey] must already exist
    @attention: orgkey_reversed True means that the sbjct organism sorts
                before the tinyexon (query) organism; query and sbjct are
                then swapped in the PacbP and its key
    """
    stored = 0
    for ( ( _ratio, sbjct_pos, q_seq, _match, s_seq, bitscore), tinyexon ) in similarities:
        sbjct_aa_pos = sbjct_pos + sbjct_orf.protein_startPY
        # floor division: amino acid positions must stay integers
        query_aa_pos = tinyexon.acceptor.pos // 3
        # length of the query of THIS tinyexon (not of the last one scanned)
        query_length = query_lengths[id(tinyexon)]
        if orgkey_reversed:
            pacbpkey = (bitscore, query_length, sbjct_orf.id, tinyexon.orf.id )
            pacbp = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
            pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,sbjct_orf,tinyexon.orf)
        else:
            pacbpkey = (bitscore, query_length, tinyexon.orf.id, sbjct_orf.id )
            pacbp = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
            pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,sbjct_orf)
        stored += 1
        pacbporf.extend_pacbporf_after_stops()
        tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf
    return stored


def _add_graph_pacbps_to_tinyexon_crossdata(graph,tinyexon_crossdata):
    """
    Copy all pacbps of graph into tinyexon_crossdata (changed in place),
    keyed on the first elements of both nodes of each pacbp
    """
    for key in graph.pacbps.keys():
        ( (a,b,c,d), node1, node2 ) = key
        orgkey = ( node1[0], node2[0] )
        if orgkey not in tinyexon_crossdata:
            tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
        tinyexon_crossdata[orgkey]['accepted_pacbs'][(a,b,c,d)] = graph.pacbps[key]


def _pop_first_graph_with_nodes(graphs,sorted_nodes):
    """
    Remove (in place) the first graph from graphs whose sorted node list
    equals sorted_nodes; at most a single graph is removed
    """
    for pos, graph in enumerate(graphs):
        nodes = graph.get_nodes()
        nodes.sort()
        if nodes == sorted_nodes:
            graphs.pop(pos)
            break