def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,gtg=None,prev=None,next=None,
    max_cbg_gtg_topo_dif=None,
    max_cbg_gtg_abs_dif=None,
    min_cbg_gtg_id_ratio=None):
    """
    Convert a PacbpCollection into a list of accepted, complete CodingBlockGraphs

    @type  pacbpCollection: PacbpCollectionGraph (presumably; TODO confirm)
    @param pacbpCollection: collection that is searched for fully connected
                            subgraphs; a deepcopy of it is used as the source
                            from which the pacbps are harvested

    @type  gtg: GeneTreeGraph or None (presumably; TODO confirm)
    @param gtg: reference genetree; when it evaluates to False none of the
                three genetree thresholds below is applied

    @type  prev: CodingBlockGraph or None (presumably; TODO confirm)
    @param prev: handed over to ordering.reorder_cbgs_on_node_occurrence

    @type  next: CodingBlockGraph or None (presumably; TODO confirm)
    @param next: handed over to ordering.reorder_cbgs_on_node_occurrence

    @type  max_cbg_gtg_topo_dif: number or None
    @param max_cbg_gtg_topo_dif: reject a graph when
                gtg.graphalignmentdifference(graph.genetree()) is larger;
                a value that evaluates to False (None, 0) disables the check

    @type  max_cbg_gtg_abs_dif: number or None
    @param max_cbg_gtg_abs_dif: reject a graph when
                gtg.absolutegraphalignmentdifference(graph.genetree()) is
                larger; a value that evaluates to False disables the check

    @type  min_cbg_gtg_id_ratio: number or None
    @param min_cbg_gtg_id_ratio: reject a graph when
                graph.genetree().identity() / gtg.identity() is smaller;
                a value that evaluates to False disables the check

    @rtype:  list
    @return: accepted graphs, in the order produced by
             ordering.order_graphlist_by_total_weight followed by
             ordering.reorder_cbgs_on_node_occurrence
    """
    # a complete graph has one node per organism; `edges` is requested as
    # (number of nodes - 1) and no missing edges are tolerated
    required_node_count = pacbpCollection.organism_set_size()
    required_edge_count = required_node_count - 1

    # the copy is taken before the subgraph search is started;
    # the pacbps are harvested from this copy later on
    pristine_collection = deepcopy(pacbpCollection)
    candidate_graphs = pacbpCollection.find_fully_connected_subgraphs(
            edges=required_edge_count,
            max_missing_edges=0,
            )

    # keep only complete graphs, provide them with pacbps
    # and update their edge weights
    complete_graphs = []
    for graph in candidate_graphs:
        # skip incomplete graphs, left-over collections and
        # graphs that are not fully saturated
        if (graph.node_count() != required_node_count or
            graph.__class__.__name__ == 'PacbpCollectionGraph' or
            graph.connectivitysaturation() < 1.0):
            continue
        graph.harvest_pacbps_from_pacbpcollection(pristine_collection)
        if graph.has_overall_minimal_spanning_range() and graph.has_all_pacbps():
            graph.update_edge_weights_by_minimal_spanning_range()
            complete_graphs.append(graph)

    # order by total weight first, then re-order on node occurrence:
    # if a neighboring node is incorporated -> more likely!
    ranked_graphs = ordering.reorder_cbgs_on_node_occurrence(
            ordering.order_graphlist_by_total_weight(complete_graphs),
            prev=prev,
            next=next,
            )

    def _rejected_by_genetree(graph):
        """ True when `graph` violates one of the enabled gtg thresholds """
        if not gtg:
            return False
        if max_cbg_gtg_topo_dif and\
        gtg.graphalignmentdifference( graph.genetree() ) > max_cbg_gtg_topo_dif:
            return True
        if max_cbg_gtg_abs_dif and\
        gtg.absolutegraphalignmentdifference( graph.genetree() ) > max_cbg_gtg_abs_dif:
            return True
        if min_cbg_gtg_id_ratio and\
        graph.genetree().identity() / gtg.identity() < min_cbg_gtg_id_ratio:
            return True
        return False

    # every graph that survives the genetree checks is accepted
    return [ graph for graph in ranked_graphs if not _rejected_by_genetree(graph) ]
def construct_final_tiny_cbg(self,
    max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH,
    max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH,
    take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS,
    take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS,
    take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS,
    maximal_current_stopcodongraph_average_weight=0.90,
    minimal_last_vs_new_identity_ratio=0.80,
    maximal_cexpander_cbg_tail_uniformity_aa_length=3,
    elegiable_donor_omsr_nt_offset=21,
    verbose=False):
    """
    Make a tiny final CBG by ``shooting tiny exons into the deep``

    Candidate final exons are gathered per organism in an
    ExonCollectionGraph (ECG), the fully connected subgraphs of the ECG are
    converted to CodingBlockGraphs, and the first candidate that passes the
    interface and score criteria is inserted directly behind the CBG that
    is currently labelled IS_LAST.

    @type  max_exon_nt_length: integer
    @param max_exon_nt_length: passed as max_tailingexon_nt_length to
            find_tailing_exon_on_orf; also the threshold used to decide if
            the current final Orf itself is stored as a FinalExonOnOrf

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: passed as max_tailingexon_intron_nt_length
            to find_tailing_exon_on_orf; added to the offset to obtain
            max_orf_start for get_elegiable_orfs

    @type  take_max_best_acceptors: integer
    @param take_max_best_acceptors: number of objects with the highest
            pssm_score that is kept per organism in the ECG

    @type  take_max_best_ecgs: integer
    @param take_max_best_ecgs: number of (first) last exon graphs that is
            converted to CodingBlockGraphs

    @type  take_max_best_cbgs: integer
    @param take_max_best_cbgs: number of new CBGs (ordered by total weight)
            for which the interface with the current last CBG is checked

    @type  maximal_current_stopcodongraph_average_weight: float
    @param maximal_current_stopcodongraph_average_weight: return False when
            the average weight of the stopcodongraph of the current last CBG
            is greater than or equal to this value

    @type  minimal_last_vs_new_identity_ratio: float
    @param minimal_last_vs_new_identity_ratio: minimal ratio of
            new genetree identity / last genetree identity that is accepted
            when the new identity is not higher than the last one

    @type  maximal_cexpander_cbg_tail_uniformity_aa_length: integer
    @param maximal_cexpander_cbg_tail_uniformity_aa_length: number of
            trailing positions of each transferblock binarystring that is
            checked for the occurrence of a "0"

    @type  elegiable_donor_omsr_nt_offset: integer
    @param elegiable_donor_omsr_nt_offset: number of nt subtracted from the
            OMSR end (in nt) to obtain the offset used as min_orf_end and
            current_donor_pos

    @type  verbose: Boolean
    @param verbose: print progress and debugging messages to STDOUT

    @rtype:  Boolean
    @return: True when a new final CBG was found (and inserted),
             False in all other cases
    """
    # get current last CBG
    last = self.get_final_cbg()

    # check if final tail of this CBG is uniformly alignable:
    # not a single "0" in the tail of any of the transferblock binarystrings
    cxpdrOutput = cexpanderanalyses_omsr2orfend(last)
    IS_UNIFORMLY_ALIGNED = True
    for trf in cxpdrOutput._transferblocks:
        if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"):
            IS_UNIFORMLY_ALIGNED = False
            break

    ############################################################
    if verbose:
        print "Cexpander uniformaly aligned:",
        print maximal_cexpander_cbg_tail_uniformity_aa_length,
        print "->", IS_UNIFORMLY_ALIGNED
        print "omsr: ", last._cexpander.projected_on,
        print last._cexpander.binarystring
        trf = cxpdrOutput.get_transfer_of_projected_on(
                last._cexpander.projected_on)
        if trf and trf != True:
            print "omsr2orfend:", last._cexpander.projected_on,
            print trf.binarystring
    ############################################################

    if IS_UNIFORMLY_ALIGNED:
        # break out of this function. Chance of overpredicting
        # a final tiny exon is bigger than finding a True one!
        return False

    # check if the stopcodongraph is not (very) good already
    if last._stopcodongraph.average_weight() >=\
    maximal_current_stopcodongraph_average_weight:
        # break out of this function. Chance of overpredicting
        # a final tiny exon is bigger than finding a True existing one
        return False

    # start the timer (performance benchmark in verbose mode)
    stw = StopWatch(name='stwFinalECG')
    stw.start()

    # get FinalExons on elegiable Orfs based on distance towards OMSR of
    # current last CBG and minimal acceptor site score
    omsr = last.overall_minimal_spanning_range()
    maxsr = last.maximal_spanning_range()
    ECG = ExonCollectionGraph()

    ################################################################
    if verbose:
        print "currentLAST", last
        print last._stopcodongraph
        print last._stopcodongraph.is_optimal()
        for org in last.organism_set():
            print org, last._stopcodongraph.is_optimal(organism=org)
        for organism in last.organism_set():
            node = last.node_by_organism(organism)
            theorf = last.get_orfs_of_graph(organism=organism)[0]
            print organism, "\t", node, "\t", max(omsr[node]), "\t",
            print max(maxsr[node]), theorf.endPY/3
    ################################################################

    for organism in last.organism_set():
        node = last.node_by_organism(organism)
        # calculate an offset for the acceptor position
        # variable elegiable_donor_omsr_nt_offset is needed to
        # enlarge the OMSR defined offset. When the OMSR is by chance
        # a few nt or aa larger than the actual exon length, the true
        # acceptor position can be erroneously abandoned.
        offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset
        theorf = last.get_orfs_of_graph(organism=organism)[0]

        # check if this final orf itself can serve as a final extension
        remaining_orf_nt_length = (theorf.protein_endPY - max(omsr[node])) * 3
        remaining_maxsr_nt_length = (max(maxsr[node]) - max(omsr[node])) * 3
        remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3
        # FIND_NEW_FINAL_ORFS is never set to False anymore; both assignments
        # below are commented out, so the search for new Orfs is always done
        FIND_NEW_FINAL_ORFS = True
        STORE_CURRENT_ORF_AS_FIOO = False
        if remaining_maxsr_nt_length >= max_exon_nt_length:
            # exceptionally large maxsr on right side of omsr
            # store as FIOO but do NOT search for an orf extension!
            ### FIND_NEW_FINAL_ORFS = False # discarded 17/09/2009; when poos maxsr present, overruled!
            STORE_CURRENT_ORF_AS_FIOO = True
        elif remaining_maxsr_tostop_nt_length <= 18:
            # maxsr is less than 6 AA apart from stop on current orf
            #FIND_NEW_FINAL_ORFS = False
            STORE_CURRENT_ORF_AS_FIOO = True
        elif remaining_orf_nt_length < max_exon_nt_length:
            # final piece of unaligned sequence is a perfect HMM seed
            STORE_CURRENT_ORF_AS_FIOO = True
        else:
            pass

        if STORE_CURRENT_ORF_AS_FIOO:
            cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) )
            # set pssm_score to (very) high; this rewards
            # using the current Orf as the last Orf
            cbs.pssm_score = 20.0
            fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf)
            # from here on `node` is an ECG node tuple, not a CBG node anymore
            node = (organism,theorf.id,fioo.start,fioo.end)
            ECG.add_node_and_object(node,fioo)
            ################################################################
            if verbose:
                print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length
                print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY
            ################################################################

        if not FIND_NEW_FINAL_ORFS:
            # quit here -> no orf extension of this CBG
            # (currently unreachable, see FIND_NEW_FINAL_ORFS above)
            continue

        # get elegiable (new) final orfs
        orflist = self.input[organism]['orfs'].get_elegiable_orfs(
                max_orf_start=offset+max_intron_nt_length,
                min_orf_end=offset
                )
        ################################################################
        if verbose:
            print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3
        ################################################################
        for orf in orflist:
            results = find_tailing_exon_on_orf(
                    theorf,orf,
                    current_donor_pos=offset,
                    max_tailingexon_nt_length=max_exon_nt_length,
                    max_tailingexon_intron_nt_length=max_intron_nt_length,
                    )

            # store each (exon,intron) result only once in the ECG;
            # the intron itself is not used here
            for exon,intron in results:
                node = (organism,orf.id,exon.start,exon.end)
                if node not in ECG.get_nodes():
                    ECG.add_node_and_object(node,exon)
                    if verbose: print organism, node, exon

    if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count()

    # now take only the best `take_max_best_acceptors`
    # because there can be quite some of them!
    for organism in ECG.organism_set():
        objects = ordering.order_list_by_attribute(
                ECG.get_organism_objects(organism),
                order_by='pssm_score',
                reversed=True
                )
        # everything beyond the first `take_max_best_acceptors` is removed
        for obj in objects[take_max_best_acceptors:]:
            node = (organism,obj.orf.id,obj.start,obj.end)
            ECG.del_node(node)
            if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score

    ########################################################################
    if verbose:
        print stw.lap(), ">take_max_best_acceptors DELETED"
        for organism in ECG.organism_set():
            for obj in ordering.order_list_by_attribute(
                    ECG.get_organism_objects(organism),
                    order_by='pssm_score',
                    reversed=True
                    ):
                print "remaining", organism, obj.orf.id, obj.length, obj
    ########################################################################

    # only continue if all organisms are represented in the ECG
    if last.organism_set_size() > ECG.organism_set_size():
        if verbose: print "To few organisms/genes present -> return False"
        return False

    # create edges in the ECG between compatible phases and
    # exon length, then make pacbps for these edges
    ECG.create_edges()
    ECG.make_pacbps_for_edges()
    if verbose: print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps)

    # search for complete graphs in this ECG
    last_exon_graphs = ECG.find_fully_connected_subgraphs()

    ########################################################################
    if verbose:
        print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()",
        print len(last_exon_graphs)
    ########################################################################

    # only continue if there is a perfectly aligned last exon graph;
    # only the first graph in the list is inspected
    # NOTE(review): presumably the list is ordered best-first -- confirm
    if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0):
        ####################################################################
        if verbose: print "no perfect aligned last exon graph -> return False"
        ####################################################################
        return False

    # convert to CodingBlockGraphs
    new_last_cbgs = []
    for leg in last_exon_graphs[0:take_max_best_ecgs]:
        cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last)
        if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size():
            # create cache of CBG and do final check on quality
            cbg.create_cache()
            if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\
            cbg._cexpander.binarystring.find("1") == -1:
                # discard hardly alignable CBGs
                continue
            # if here, then append this cbg as a possible novel final CBG
            new_last_cbgs.append( cbg )
            ################################################################
            if verbose: print "LEGcbg", cbg
            ################################################################

    ########################################################################
    if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs)
    ########################################################################

    if not new_last_cbgs:
        ####################################################################
        if verbose: print "no ecgs convertable to CBGs -> return False"
        ####################################################################
        return False

    # order by total weight, get the optimal CBG and its corresponding ECG
    new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs)
    theNewLastCbg = None
    cbgIF = None

    # check all interfaces between the novel final CBGs and the previous
    # CBG. The best interface is added to the GSG!
    cbgif_accepted_new_last_cbgs = []
    already_checked_node_sets = []
    for newcbg in new_last_cbgs[0:take_max_best_cbgs]:
        # take the ECG from which this CBG was made and detach it from the CBG
        lastExonGraph = newcbg._ExonCollectionGraph
        del( newcbg._ExonCollectionGraph )

        # check if it is not the extension of the current
        # last CBG (identical nodes)
        if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0:
            if verbose: print "newCBG is the extention of current last CBG!!"
            continue

        # check if this combination of nodes (orfs) has not been tried already
        if newcbg.get_ordered_nodes() in already_checked_node_sets:
            ###############################################################
            if verbose:
                print "newCBG node set done earlier:",
                print newcbg.get_ordered_nodes()
            ###############################################################
            continue
        else:
            # append this set of nodes (as a list) to checklist
            already_checked_node_sets.append( newcbg.get_ordered_nodes() )

        # check if this new final tinyexon graph has a compatible interface
        # with the current last one
        cbgIF = CodingBlockGraphInterface(last,newcbg)
        cbgIF.harvest_splice_sites()
        # an intron is allowed only in the organisms for which
        # the exon has a SpliceAcceptor object as its acceptor
        distinct_orgs = []
        for node in lastExonGraph.get_nodes():
            exon = lastExonGraph.get_node_object(node)
            if exon.acceptor.__class__.__name__ == 'SpliceAcceptor':
                distinct_orgs.append( lastExonGraph.organism_by_node(node) )
        cbgIF.allow_intron_in_organisms(distinct_orgs)
        cbgIF.find_conserved_splice_sites()
        # do NOT optimize -> consumes a lot of time and is helpful
        # only in extreme cases...
        #cbgIF.optimize()

        if not cbgIF.is_compatible():
            ################################################################
            if verbose:
                print "newCBG not a is_compatible() cbgIF"
                print newcbg
            ################################################################
            continue

        # append to cbgif_accepted_new_last_cbgs as a sortable tuple:
        # (number of True's in the optimalitycheck, total weight, the CBG)
        newcbg._CBGinterface5p = cbgIF
        cbgif_accepted_new_last_cbgs.append(
                ( cbgIF.optimalitycheck().count(True), newcbg.total_weight(), newcbg )
                )

    ########################################################################
    if verbose:
        print stw.lap(), "cbgIFs checked %s/%s" % (
                len(cbgif_accepted_new_last_cbgs),
                len(new_last_cbgs[0:take_max_best_cbgs])
                )
    ########################################################################

    # now start by adding the highest scoring newcbg first
    # (tuples are sorted on optimality count first, total weight second;
    #  on a full tie the CBG objects themselves are compared)
    cbgif_accepted_new_last_cbgs.sort()
    cbgif_accepted_new_last_cbgs.reverse()

    ########################################################################
    if verbose:
        print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs)
        for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
            print true_cnt,totalwt,newcbg._CBGinterface5p
            print newcbg
    ########################################################################

    for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
        # get the already created cbgIF from the newcbg graph
        cbgIF = newcbg._CBGinterface5p

        # now check 4 criteria (in this order in the list):
        # [0] cbgIF.is_optimal()      [1] >STG.totalweight
        # [2] >GTG.identity           [3] <=STG.stopcodon2omsrdistance
        criteria = []
        criteria.append( cbgIF.is_optimal() )
        criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() )
        criteria.append( newcbg.genetree().identity() > last.genetree().identity() )
        criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() )

        ####################################################################
        if verbose:
            print "TRYING ADDITION of final newcbg", criteria
            print true_cnt,totalwt,newcbg._CBGinterface5p
            print newcbg
        ####################################################################

        # check if there is only a single different node/orf changed in the newcbg
        # this is recognized by a symmetric_difference of size 2
        # in this case, be very strict! This easily causes overprediction (FP) tiny exons
        if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2:
            # check if 4 criteria are valid;
            # a single False results in not accepting this new last tiny cbg
            if False in criteria:
                if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria
                # continue -> no new tiny CBG
                continue

        # now start checking the criteria.
        # if criteria[0] == True, means a fully is_optimal interface!
        # do not perform any additional check, just add!
        if criteria[0] == True:
            theNewLastCbg = newcbg
            break

        # total weight criterion -> new.tw() > last.tw()
        if criteria[1] == False:
            ##########################################################################
            if verbose:
                print "# NOVEL lastTinyExon discarded; to low total weight"
                print "#", newcbg._stopcodongraph
            ##########################################################################
            # continue -> no new tiny CBG
            continue

        # identity criterion -> allow a ratio i.s.o. new.id() > last.id()
        # this strict criterion (>) is applied for single-new-orf-CBGs
        if criteria[2] == False:
            ratio = newcbg.genetree().identity() / last.genetree().identity()
            if ratio < minimal_last_vs_new_identity_ratio:
                ######################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; to low identity"
                    print "#", newcbg._stopcodongraph, newcbg.genetree().identity()
                ######################################################################
                # continue -> no new tiny CBG
                continue

        # stopcodon2omsrdistance criterion -> new.dist() <= last.dist()
        if criteria[3] == False:
            ##########################################################################
            if verbose:
                print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance"
                print "#", newcbg._stopcodongraph
            ##########################################################################
            # continue -> no new tiny CBG
            continue

        # if this point is reached, a new tiny last CBG has been found!
        theNewLastCbg = newcbg
        # break out of the for loop; store into the genestructure
        break

    # all okay -> ready for inserting the new CBG
    if theNewLastCbg and verbose:
        ################################################################################
        print "NEW FINAL TINY EXON FOUND!!"
        print theNewLastCbg
        print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable()
        print cbgIF._optimal_aligned_donor, cbgIF.donor_phase()
        print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase()
        ################################################################################

    # hard-insert into the genestructure
    # using add_codingblock is likely to cause problems
    # because of the tinyness of the CBG
    if theNewLastCbg:
        for pos in range(0,len(self)):
            if self.codingblockgraphs[pos].IS_IGNORED: continue
            if self.codingblockgraphs[pos].IS_LAST:
                thelast = self.codingblockgraphs[pos]
                thelast.IS_LAST = False
                # `newcbg` (loop variable of the loop above) is the same
                # object as theNewLastCbg here: that loop is left with a
                # break directly after theNewLastCbg = newcbg; for the same
                # reason `cbgIF` is the interface of theNewLastCbg
                newcbg.IS_LAST = True
                self.codingblockgraphs.insert(pos+1,theNewLastCbg)
                # set the CBGInterface object in next and prev CBG
                self.codingblockgraphs[pos]._CBGinterface3p = cbgIF
                self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF
                # break out; end of this function
                break
        # done! return a True because newcbg is created & inserted
        # (True is returned even when no IS_LAST CBG was encountered above)
        return True
    else:
        # no newLastCbg found
        return False
def split_final_cbg_on_spanningrange_difference(self,
    sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
    sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
    sprdif_min_gtid_ratio=0.55,
    only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
    only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE
    ):
    """
    Split the final CBG on its right spanningrange difference and try to
    add the resulting (HMM-completed) CBGs behind it in the genestructure

    @type  sprdif_min_aa_length: integer
    @param sprdif_min_aa_length: minimal length of the sprdif in aa's

    @type  sprdif_min_node_count: integer
    @param sprdif_min_node_count: minimal number of nodes in a CBG to be
                                  elegiable for trying a split

    @type  sprdif_min_gtid_ratio: float
    @param sprdif_min_gtid_ratio: minimal ratio of
                splittedCBG.genetree().identity() /
                lastCBG.genetree().identity(); splits with a lower ratio
                are skipped. A value that evaluates to False disables
                this check

    @type  only_perform_if_stopcodon_tw_ratio_lte: float
    @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
                lastCBG.stopcodongraph.total_weight() /
                self.EXACT_SG_EDGE_COUNT <= threshold. A value that
                evaluates to False disables this check

    @type  only_preform_if_cbg_id_gte: float
    @param only_preform_if_cbg_id_gte: run function only when
                lastCBG.genetree.identity() >= threshold. A value that
                evaluates to False disables this check

    @rtype:  Boolean
    @return: True when at least one CBG was added, False otherwise
    """
    # get the CBG that is labelled as IS_LAST=True
    current_last = self.get_final_cbg()

    # check if we are allowed to perform this function
    # for groups of genes with very low identity, this function
    # is more likely to decrease the result than to improve the result

    # make AlignedStopCodonGraph
    current_last.align_stop_codons()
    tw_current = current_last._stopcodongraph.total_weight()
    # NOTE(review): under Python 2 this is an integer division when both
    # operands are integers; presumably total_weight() is a float -- confirm
    ratio = tw_current / self.EXACT_SG_EDGE_COUNT

    # now check if it is allowed to enter the function: only_perform_if_...
    if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
        return False
    if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
        return False

    # check for right sprdif of requested size; if not => return False
    if not current_last.has_rigth_spanningrange_difference(
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count):
        # no right spanningrange difference -> done & return
        return False

    # make a deepcopy and clear cache of the one that will be processed
    last = deepcopy(current_last)
    last.clear_cache()

    # iteratively split
    splits = last.iteratively_split_codingblock_on_spanningrange_difference(
            side='rigth',
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count,
            )

    # was the split successful?
    if len(splits) == 1:
        # no splits => done here!
        return False

    # when here process the sprdif CBGs
    # 1) cbghmmsearch2pacbpcollection
    # 2) pacbpCollection2acceptedcbgs
    all_accepted_cbgs = []
    # loop over the splits; except for the most left one (the input `last` CBG)
    for splittedCBG in splits[1:]:
        if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue
        # complete with cbghmmsearch2pacbpcollection
        # get ratio of the GTG of this CBG
        # (`ratio` is re-used here; the stopcodon ratio is not needed anymore)
        ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()
        # if ratio is bad -> do not perform!
        if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio: continue

        pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                prev=last,
                pacbp_min_length=sprdif_min_aa_length,
                hmmsearch_num_hits=3
                )
        # get list of accepted CBGs
        accepted = conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
        all_accepted_cbgs.extend( accepted )

    # if no accepted ones -> return False
    if not all_accepted_cbgs: return False

    # order graphs by total weight
    all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
    # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
    all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

    # and now try to add the accepted cbgs into the genestructure
    # speedup the process by creating a tinyGSG object of only the last CBG
    # but, set the _GENETREE attribute to the genetree of the main GSG
    # NOTE(review): function-scope import; presumably done to avoid a
    # circular import at module level -- confirm
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    lastGSG = GenestructureOfCodingBlockGraphs(self.input)
    lastGSG.add_codingblock(current_last)
    lastGSG._GENETREE = self._GENETREE
    RETURN_STATUS_CBG_IS_ADDED = False

    for cbgL in all_accepted_cbgs:
        # only Ks CBG graphs are allowed here!
        # (i.e. graphs with as many nodes as the current last CBG)
        if cbgL.node_count() != current_last.node_count(): continue

        # first do a trial addition in the tiny GSG
        if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
            # it is addable; prepare for final addition to the genestructure
            # the status flags are changed BEFORE the actual addition is done
            lsrCBG = None
            cbgL.IS_SPLITTED = False
            cbgL.IS_5P_SPLITTED = False
            cbgL.IS_FIRST = False
            cbgL.IS_LAST = True
            current_last.IS_LAST = False
            # if identical nodes -> create a lsrCBG
            if not cbgL.node_set().difference(current_last.get_nodes()):
                current_last.IS_SPLITTED = True
                current_last.IS_3P_SPLITTED = True
                cbgL.IS_SPLITTED = True
                cbgL.IS_5P_SPLITTED = True
                lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                        current_last, cbgL )
                # an lsrCBG without nodes is of no use
                if not lsrCBG.node_count(): lsrCBG = None
            # now add the new last CBG
            # NOTE(review): the status of the addition to the main GSG is
            # overwritten by the status of the addition to lastGSG; only the
            # latter decides on RETURN_STATUS_CBG_IS_ADDED -- confirm intended
            status = self.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )
            status = lastGSG.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )
            # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
            # NOTE(review): the nesting of the statements below under
            # `if status:` is reconstructed from a flattened source -- confirm
            if status:
                RETURN_STATUS_CBG_IS_ADDED = True
                # NOTE(review): these prints are not guarded by a verbose
                # switch; presumably left-over debugging output -- confirm
                print cbgL
                print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
            else:
                # not placeable in the genestructure
                pass

    # in exceptional cases, 2 CBGs can be added. In case the node_set() is identical,
    # yet another lsrCBG has to be created in between these 2 new CBGs
    # check this in the main GSG (NOT in the lastGSG); when a lsrCBG is added here,
    # splits are added to the surrounding CBGs. Because call-by-reference, these
    # splits are added to the main GSG (self) too, and adding the same lsrCBG
    # will fail (splitted CBGs are skipped!)
    if RETURN_STATUS_CBG_IS_ADDED:
        self.finalize_genestructure()
        if self.join_false_inframe_introns():
            print "EXTRA lsrCBG added!!"
        # recreate interfaces if there is a new one created
        # NOTE(review): the nesting level of this call is reconstructed
        # from a flattened source -- confirm
        self.create_cbginterfaces()

    # return the return status True|False
    return RETURN_STATUS_CBG_IS_ADDED
def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection, gtg=None, prev=None, next=None,
    max_cbg_gtg_topo_dif=None,
    max_cbg_gtg_abs_dif=None,
    min_cbg_gtg_id_ratio=None):
    """
    Convert a PacbpCollection into a list of accepted, complete CodingBlockGraphs

    @type  pacbpCollection: PacbpCollectionGraph (presumably; TODO confirm)
    @param pacbpCollection: collection that is searched for fully connected
                            subgraphs; a deepcopy of it is used as the source
                            from which the pacbps are harvested

    @type  gtg: GeneTreeGraph or None (presumably; TODO confirm)
    @param gtg: reference genetree; when it evaluates to False none of the
                three genetree thresholds below is applied

    @type  prev: CodingBlockGraph or None (presumably; TODO confirm)
    @param prev: handed over to ordering.reorder_cbgs_on_node_occurrence

    @type  next: CodingBlockGraph or None (presumably; TODO confirm)
    @param next: handed over to ordering.reorder_cbgs_on_node_occurrence

    @type  max_cbg_gtg_topo_dif: number or None
    @param max_cbg_gtg_topo_dif: reject a graph when
                gtg.graphalignmentdifference(graph.genetree()) is larger;
                a value that evaluates to False (None, 0) disables the check

    @type  max_cbg_gtg_abs_dif: number or None
    @param max_cbg_gtg_abs_dif: reject a graph when
                gtg.absolutegraphalignmentdifference(graph.genetree()) is
                larger; a value that evaluates to False disables the check

    @type  min_cbg_gtg_id_ratio: number or None
    @param min_cbg_gtg_id_ratio: reject a graph when
                graph.genetree().identity() / gtg.identity() is smaller;
                a value that evaluates to False disables the check

    @rtype:  list
    @return: accepted graphs, in the order produced by
             ordering.order_graphlist_by_total_weight followed by
             ordering.reorder_cbgs_on_node_occurrence
    """
    # a complete graph has one node per organism; `edges` is requested as
    # (number of nodes - 1) and no missing edges are tolerated
    required_node_count = pacbpCollection.organism_set_size()
    required_edge_count = required_node_count - 1

    # the copy is taken before the subgraph search is started;
    # the pacbps are harvested from this copy later on
    pristine_collection = deepcopy(pacbpCollection)
    candidate_graphs = pacbpCollection.find_fully_connected_subgraphs(
            edges=required_edge_count,
            max_missing_edges=0,
            )

    # keep only complete graphs, provide them with pacbps
    # and update their edge weights
    complete_graphs = []
    for graph in candidate_graphs:
        # skip incomplete graphs, left-over collections and
        # graphs that are not fully saturated
        if (graph.node_count() != required_node_count or
            graph.__class__.__name__ == 'PacbpCollectionGraph' or
            graph.connectivitysaturation() < 1.0):
            continue
        graph.harvest_pacbps_from_pacbpcollection(pristine_collection)
        if graph.has_overall_minimal_spanning_range() and graph.has_all_pacbps():
            graph.update_edge_weights_by_minimal_spanning_range()
            complete_graphs.append(graph)

    # order by total weight first, then re-order on node occurrence:
    # if a neighboring node is incorporated -> more likely!
    ranked_graphs = ordering.reorder_cbgs_on_node_occurrence(
            ordering.order_graphlist_by_total_weight(complete_graphs),
            prev=prev,
            next=next,
            )

    def _rejected_by_genetree(graph):
        """ True when `graph` violates one of the enabled gtg thresholds """
        if not gtg:
            return False
        if max_cbg_gtg_topo_dif and\
        gtg.graphalignmentdifference(graph.genetree()) > max_cbg_gtg_topo_dif:
            return True
        if max_cbg_gtg_abs_dif and\
        gtg.absolutegraphalignmentdifference(graph.genetree()) > max_cbg_gtg_abs_dif:
            return True
        if min_cbg_gtg_id_ratio and\
        graph.genetree().identity() / gtg.identity() < min_cbg_gtg_id_ratio:
            return True
        return False

    # every graph that survives the genetree checks is accepted
    return [graph for graph in ranked_graphs if not _rejected_by_genetree(graph)]