def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,gtg=None,prev=None,next=None,
    max_cbg_gtg_topo_dif=None,
    max_cbg_gtg_abs_dif=None,
    min_cbg_gtg_id_ratio=None):
    """
    Convert a PacbpCollection into a list of accepted CodingBlockGraphs

    Fully connected subgraphs are searched in the collection. Only subgraphs
    that have one node per organism, are not of class PacbpCollectionGraph,
    are fully saturated, and have all pacbps plus an overall minimal spanning
    range are kept. The kept graphs are ordered and, when a gtg is given,
    filtered on the thresholds below.

    @type  pacbpCollection: object offering organism_set_size() and
            find_fully_connected_subgraphs()
    @param pacbpCollection: collection to take the subgraphs from; it is
            deepcopied before the subgraph search is started

    @type  gtg: object offering graphalignmentdifference(),
            absolutegraphalignmentdifference() and identity(), or None
    @param gtg: when it evaluates to True, the three thresholds are applied

    @param prev: handed over as-is to ordering.reorder_cbgs_on_node_occurrence
    @param next: handed over as-is to ordering.reorder_cbgs_on_node_occurrence

    @param max_cbg_gtg_topo_dif: reject a graph when
            gtg.graphalignmentdifference(genetree) is larger; falsy = no check
    @param max_cbg_gtg_abs_dif: reject a graph when
            gtg.absolutegraphalignmentdifference(genetree) is larger;
            falsy = no check
    @param min_cbg_gtg_id_ratio: reject a graph when
            genetree.identity() / gtg.identity() is smaller; falsy = no check

    @rtype:  list
    @return: accepted graphs, in the order produced by
            ordering.reorder_cbgs_on_node_occurrence
    """
    # NOTE(review): this function name is defined a second time further down
    # in this file; the later definition replaces this one at import time.
    # Confirm which copy is meant to stay.

    organism_count = pacbpCollection.organism_set_size()

    # Snapshot the collection BEFORE the subgraph search; pacbps are
    # harvested from this snapshot, not from the searched collection
    collection_snapshot = deepcopy(pacbpCollection)
    candidates = pacbpCollection.find_fully_connected_subgraphs(
            edges=organism_count - 1,
            max_missing_edges=0 )

    def _iter_completed(graphs):
        """ yield complete CBGs with harvested pacbps and updated edge weights """
        for graph in graphs:
            # only deal with complete CBGs, not incomplete ones or collections
            if not (graph.node_count() == organism_count
                    and graph.__class__.__name__ != 'PacbpCollectionGraph'
                    and not graph.connectivitysaturation() < 1.0):
                continue
            graph.harvest_pacbps_from_pacbpcollection(collection_snapshot)
            if graph.has_overall_minimal_spanning_range() and graph.has_all_pacbps():
                graph.update_edge_weights_by_minimal_spanning_range()
                yield graph

    # order on total weight first, then re-order on node occurrence
    # (a graph that incorporates a neighboring node is more likely)
    ranked = ordering.reorder_cbgs_on_node_occurrence(
            ordering.order_graphlist_by_total_weight(list(_iter_completed(candidates))),
            prev=prev, next=next )

    accepted_cbgs = []
    for graph in ranked:
        if gtg:
            if (max_cbg_gtg_topo_dif and
                    gtg.graphalignmentdifference( graph.genetree() ) > max_cbg_gtg_topo_dif):
                continue
            if (max_cbg_gtg_abs_dif and
                    gtg.absolutegraphalignmentdifference( graph.genetree() ) > max_cbg_gtg_abs_dif):
                continue
            if (min_cbg_gtg_id_ratio and
                    graph.genetree().identity() / gtg.identity() < min_cbg_gtg_id_ratio):
                continue
        # no threshold violated -> this CBG is accepted
        accepted_cbgs.append(graph)

    return accepted_cbgs
def findmostlikelyCBG2GSGinsert(partialGSG,cbglist,
    order_cbgs_by_total_weight=True,
    reorder_cbgs_on_node_occurrence=True,
    optimizetinyexoninterface=True,
    verbose=False):
    """
    Try to insert the most likely CBG(s) from a list into a partial GSG

    @type  partialGSG: object offering len(), codingblockgraphs,
            create_cbginterfaces() and clear_central_cbginterfaces()
    @param partialGSG: genestructure that is (possibly) extended in place

    @type  cbglist: list
    @param cbglist: candidate CBGs; when empty or None nothing is done

    @type  order_cbgs_by_total_weight: Boolean
    @param order_cbgs_by_total_weight: order the candidates with
            ordering.order_cbgraphlist() first

    @type  reorder_cbgs_on_node_occurrence: Boolean
    @param reorder_cbgs_on_node_occurrence: re-order the candidates with
            ordering.reorder_cbgs_on_node_occurrence(), using the first and
            last CBG of the partialGSG as prev and next

    @type  optimizetinyexoninterface: Boolean
    @param optimizetinyexoninterface: passed on to _place_cbg_in_partialgsg()

    @type  verbose: Boolean
    @param verbose: print the candidate CBGs and pass the flag on

    @rtype:  same object as partialGSG
    @return: the (possibly enlarged) partialGSG that was passed in
    """
    # if no CBG in list -> done!
    if not cbglist: return partialGSG

    # make interfaces in the partialGSG and remember its size, so it can be
    # detected afterwards whether any CBG was inserted
    partialGSG.create_cbginterfaces()
    gsgsize = len(partialGSG)

    if order_cbgs_by_total_weight:
        # order graphs by total weight
        cbglist = ordering.order_cbgraphlist(cbglist)

    if reorder_cbgs_on_node_occurrence:
        # get the flanking CBGs from the partialGSG; local names avoid
        # shadowing the builtin `next`
        if gsgsize >= 2:
            prev_cbg = partialGSG.codingblockgraphs[0]
            next_cbg = partialGSG.codingblockgraphs[-1]
        elif gsgsize == 1:
            prev_cbg, next_cbg = None, partialGSG.codingblockgraphs[0]
        else:
            prev_cbg, next_cbg = None, None
        # re-order on node occurrence.
        cbglist = ordering.reorder_cbgs_on_node_occurrence(
                cbglist, prev=prev_cbg, next=next_cbg )

    ####################################################################
    if verbose and partialGSG and cbglist:
        # single-argument print(...) emits the same text as the Python 2
        # print statement
        print("elegiable CBGs for addition in partialGSG")
        for cbg in cbglist: print(cbg)
    ####################################################################

    # Now try to insert CBGs in the partialGSG, most stringent attempt first:
    # (1) only the top listed CBG, with and without conditional addition
    # (2) all CBGs, conditional addition NOT omitted
    # (3) all CBGs, conditional addition omitted
    # All four attempts are always performed; their return values are unused
    # because success is detected by comparing len(partialGSG) afterwards.
    attempts = (
        ( [ cbglist[0] ], False ),
        ( [ cbglist[0] ], True ),
        ( cbglist, False ),
        ( cbglist, True ),
        )
    for candidates, omit_conditional in attempts:
        _place_cbg_in_partialgsg(candidates,partialGSG,
                omit_conditional_addition=omit_conditional,
                optimizetinyexoninterface=optimizetinyexoninterface,
                verbose=verbose
                )

    # When partialGSG is not enlarged, its cbgIFs are likely distorted in
    # the process of trying novel CBG inserts. Recreate the cbgIFs here.
    # NOTE(review): the nesting of this block is reconstructed from the
    # comments in the original code -- confirm.
    if len(partialGSG) == gsgsize:
        if gsgsize >= 2:
            partialGSG.clear_central_cbginterfaces()
            partialGSG.create_cbginterfaces()
            if gsgsize == 2:
                # most obvious case of 2 CBGs in the GSG
                first_cbg = partialGSG.codingblockgraphs[0]
                second_cbg = partialGSG.codingblockgraphs[1]
                first_cbg.IS_3P_SPLITTED = False
                second_cbg.IS_5P_SPLITTED = False
                if not first_cbg.IS_5P_SPLITTED:
                    first_cbg.IS_SPLITTED = False
                if not second_cbg.IS_3P_SPLITTED:
                    second_cbg.IS_SPLITTED = False
        elif gsgsize == 1:
            # a partialGSG of only a single CBG (first or last CBG).
            # An empty partialGSG is deliberately skipped: indexing
            # codingblockgraphs[0] on it raised an IndexError before.
            only_cbg = partialGSG.codingblockgraphs[0]
            only_cbg._CBGinterface5p = None
            only_cbg._CBGinterface3p = None

    # Done with this function. Return the (incremented) partialGSG
    return partialGSG
def split_final_cbg_on_spanningrange_difference(self,
    sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
    sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
    sprdif_min_gtid_ratio=0.55,
    only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
    only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE ):
    """
    Split the final CBG on its right spanningrange difference and try to
    add the resulting CBG(s) as new final CBG(s) of the genestructure

    @type  sprdif_min_aa_length: integer
    @param sprdif_min_aa_length: minimal length of the sprdif in aa's; also
            used as pacbp_min_length in cbghmmsearch2pacbpcollection()

    @type  sprdif_min_node_count: integer
    @param sprdif_min_node_count: minimal number of nodes in a CBG to be
            eligible for trying a split

    @type  sprdif_min_gtid_ratio: float
    @param sprdif_min_gtid_ratio: minimal ratio between the genetree identity
            of a splitted CBG and the genetree identity of the current last
            CBG; splits below this ratio are skipped. A falsy value disables
            this check

    @type  only_perform_if_stopcodon_tw_ratio_lte: float
    @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
            lastCBG.stopcodongraph.totalweight / self.EXACT_SG_EDGE_COUNT
            <= threshold. A falsy value disables this check

    @type  only_preform_if_cbg_id_gte: float
    @param only_preform_if_cbg_id_gte: run function only when
            lastCBG.genetree.identity() >= threshold. A falsy value
            disables this check

    @rtype:  Boolean
    @return: True when at least one CBG was added, False otherwise
    """
    # get the CBG that is labelled as IS_LAST=True
    current_last = self.get_final_cbg()

    # Check if we are allowed to perform this function.
    # For groups of genes with very low identity, this function
    # is more likely to decrease the result than to improve the result.

    # make AlignedStopCodonGraph
    current_last.align_stop_codons()
    tw_current = current_last._stopcodongraph.total_weight()
    ratio = tw_current / self.EXACT_SG_EDGE_COUNT

    # now check if it is allowed to enter the function: only_perform_if_...
    if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
        return False
    if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
        return False

    # check for a right sprdif of the requested size; if not => return False
    if not current_last.has_rigth_spanningrange_difference(
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count):
        # no right spanningrange difference -> done & return
        return False

    # make a deepcopy and clear cache of the one that will be processed,
    # so the splitting is not done on the CBG that is in the genestructure
    last = deepcopy(current_last)
    last.clear_cache()

    # iteratively split
    splits = last.iteratively_split_codingblock_on_spanningrange_difference(
            side='rigth',
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count,
            )

    # was the split successful?
    if len(splits) == 1:
        # no splits => done here!
        return False

    # when here process the sprdif CBGs
    # 1) cbghmmsearch2pacbpcollection
    # 2) pacbpCollection2acceptedcbgs
    all_accepted_cbgs = []

    # loop over the splits; except for the most left one (the input `last` CBG)
    for splittedCBG in splits[1:]:
        if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue

        # complete with cbghmmsearch2pacbpcollection
        # get ratio of the GTG of this CBG; note that `ratio` is re-used
        # here, the stopcodon ratio computed above is no longer needed
        ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()
        # if ratio is bad -> do not perform!
        if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio: continue

        pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                prev=last,
                pacbp_min_length=sprdif_min_aa_length,
                hmmsearch_num_hits=3
                )

        # get list of accepted CBGs
        accepted = conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
        all_accepted_cbgs.extend( accepted )

    # if no accepted ones -> return False
    if not all_accepted_cbgs: return False

    # order graphs by total weight
    all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
    # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
    all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

    # and now try to add the accepted cbgs into the genestructure
    # speedup the process by creating a tinyGSG object of only the last CBG
    # but, set the _GENETREE attribute to the genetree of the main GSG
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    lastGSG = GenestructureOfCodingBlockGraphs(self.input)
    lastGSG.add_codingblock(current_last)
    lastGSG._GENETREE = self._GENETREE
    RETURN_STATUS_CBG_IS_ADDED = False

    for cbgL in all_accepted_cbgs:
        # only Ks CBG graphs are allowed here: the node count must equal
        # the node count of the current last CBG
        if cbgL.node_count() != current_last.node_count(): continue

        if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
            # it is addable; prepare for final addition to the genestructure
            lsrCBG = None
            cbgL.IS_SPLITTED = False
            cbgL.IS_5P_SPLITTED = False
            cbgL.IS_FIRST = False
            cbgL.IS_LAST = True
            # NOTE(review): these flags are set before the real addition is
            # attempted and are not reset when that addition fails
            current_last.IS_LAST = False

            # if identical nodes -> create a lsrCBG
            if not cbgL.node_set().difference(current_last.get_nodes()):
                current_last.IS_SPLITTED = True
                current_last.IS_3P_SPLITTED = True
                cbgL.IS_SPLITTED = True
                cbgL.IS_5P_SPLITTED = True
                lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                        current_last, cbgL )
                # an lsrCBG without nodes is treated as no lsrCBG at all
                if not lsrCBG.node_count(): lsrCBG = None

            # now add the new last CBG
            # NOTE(review): the status of the addition to the main GSG (self)
            # is overwritten by the status of the addition to lastGSG; only
            # the latter decides the return value -- confirm this is intended
            status = self.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )
            status = lastGSG.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )

            # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
            # NOTE(review): the nesting of the print statements and of the
            # lsrCBG addition below `if status:` is reconstructed from the
            # comments -- confirm
            if status:
                RETURN_STATUS_CBG_IS_ADDED = True
                print cbgL
                print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
        else:
            # not placeable in the genestructure
            pass

    # In exceptional cases, 2 CBGs can be added. In case the node_set() is
    # identical, yet another lsrCBG has to be created in between these 2 new
    # CBGs. Check this in the main GSG (NOT in the lastGSG): when a lsrCBG is
    # added here, splits are added to the surrounding CBGs. Because of
    # call-by-reference, these splits are added to the main GSG (self) too,
    # and adding the same lsrCBG will fail (splitted CBGs are skipped!).
    if RETURN_STATUS_CBG_IS_ADDED:
        self.finalize_genestructure()
        if self.join_false_inframe_introns():
            print "EXTRA lsrCBG added!!"
            # recreate interfaces if there is a new one created
            self.create_cbginterfaces()

    # return the return status True|False
    return RETURN_STATUS_CBG_IS_ADDED
def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection, gtg=None,
        prev=None, next=None,
        max_cbg_gtg_topo_dif=None,
        max_cbg_gtg_abs_dif=None,
        min_cbg_gtg_id_ratio=None):
    """
    Obtain the accepted CodingBlockGraphs from a PacbpCollection

    The collection is searched for fully connected subgraphs. A subgraph is
    completed (pacbps harvested, edge weights updated) when it has one node
    per organism, is not of class PacbpCollectionGraph, has a connectivity
    saturation that is not below 1.0, and has all its pacbps plus an overall
    minimal spanning range. Completed subgraphs are ordered and, when a gtg
    is supplied, compared with it.

    @type  pacbpCollection: object offering organism_set_size() and
            find_fully_connected_subgraphs()
    @param pacbpCollection: source of the subgraphs; a deepcopy of it, taken
            before the search, is the source of the harvested pacbps

    @type  gtg: object offering graphalignmentdifference(),
            absolutegraphalignmentdifference() and identity(), or None
    @param gtg: reference the genetree of each CBG is compared with; when it
            evaluates to False no comparison is done

    @param prev: forwarded to ordering.reorder_cbgs_on_node_occurrence
    @param next: forwarded to ordering.reorder_cbgs_on_node_occurrence

    @param max_cbg_gtg_topo_dif: upper limit for
            gtg.graphalignmentdifference(); a falsy value switches it off
    @param max_cbg_gtg_abs_dif: upper limit for
            gtg.absolutegraphalignmentdifference(); a falsy value switches
            it off
    @param min_cbg_gtg_id_ratio: lower limit for the ratio
            genetree.identity() / gtg.identity(); a falsy value switches
            it off

    @rtype:  list
    @return: the accepted CBGs, ordered as returned by
            ordering.reorder_cbgs_on_node_occurrence
    """
    required_nodes = pacbpCollection.organism_set_size()
    required_edges = required_nodes - 1

    # copy first, search second: the harvest below reads from the copy
    untouched_collection = deepcopy(pacbpCollection)

    complete_cbgs = []
    for subgraph in pacbpCollection.find_fully_connected_subgraphs(
            edges=required_edges, max_missing_edges=0):
        # skip incomplete graphs and plain collections
        if (subgraph.node_count() != required_nodes
                or subgraph.__class__.__name__ == 'PacbpCollectionGraph'
                or subgraph.connectivitysaturation() < 1.0):
            continue

        # harvest pacbps from the deepcopied PacbpCollection
        subgraph.harvest_pacbps_from_pacbpcollection(untouched_collection)
        if subgraph.has_overall_minimal_spanning_range() and subgraph.has_all_pacbps():
            subgraph.update_edge_weights_by_minimal_spanning_range()
            complete_cbgs.append(subgraph)

    # order by total weight, then favor graphs sharing nodes with neighbors
    complete_cbgs = ordering.order_graphlist_by_total_weight(complete_cbgs)
    complete_cbgs = ordering.reorder_cbgs_on_node_occurrence(
        complete_cbgs, prev=prev, next=next)

    def _fits_genetree(cbg):
        """ True when no enabled gtg threshold is violated by this cbg """
        if max_cbg_gtg_topo_dif:
            if gtg.graphalignmentdifference(cbg.genetree()) > max_cbg_gtg_topo_dif:
                return False
        if max_cbg_gtg_abs_dif:
            if gtg.absolutegraphalignmentdifference(cbg.genetree()) > max_cbg_gtg_abs_dif:
                return False
        if min_cbg_gtg_id_ratio:
            if cbg.genetree().identity() / gtg.identity() < min_cbg_gtg_id_ratio:
                return False
        return True

    # without a gtg every completed CBG is accepted
    return [cbg for cbg in complete_cbgs if not gtg or _fits_genetree(cbg)]