def _is_intermediate_overlapping_cbg_a_gsg_scaffold_enrichment(gsg,
        cbgA, cbgB, cbgC, minimal_scaffold_aa_enrichment=5):
    """
    Does the intermediate cbgB enlarge the scaffold spanned by cbgA and cbgC?

    The overall minimal spanning range (OMSR) of a temporary GSG holding
    [ cbgA, cbgB, cbgC ] is compared with the OMSR of the same GSG holding
    only [ cbgA, cbgC ]. For every node shared by cbgA and cbgC, the
    difference in len() of the OMSR entry of its organism is the enrichment
    contributed by cbgB.

    @type  gsg: GSG-like object
    @param gsg: provides `input`, `_GENETREE` and `organism_by_node()`

    @type  cbgA: CBG-like object
    @param cbgA: first CBG; must provide `mutual_nodes()`

    @type  cbgB: CBG-like object
    @param cbgB: the intermediate CBG that is evaluated

    @type  cbgC: CBG-like object
    @param cbgC: last CBG

    @type  minimal_scaffold_aa_enrichment: integer
    @param minimal_scaffold_aa_enrichment: minimal enrichment (difference in
            OMSR length; presumably in aa's, judging by the parameter name)
            required for at least one organism

    @rtype:  Boolean
    @return: True when at least one organism reaches the minimal enrichment;
             False otherwise, or when any argument is false-like, or when
             cbgA and cbgC share no nodes
    """
    if not (gsg and cbgA and cbgB and cbgC):
        # (most likely) cbgA or cbgC is not defined;
        # the function contract in that case is to return False
        return False

    shared_nodes = cbgA.mutual_nodes(cbgC)
    if not shared_nodes:
        # series of CBGs does not represent a suitable
        # gene structure scaffold -> return False
        return False

    # function-scope import, kept exactly where the original had it
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    partGSG = GenestructureOfCodingBlockGraphs(gsg.input)
    partGSG.codingblockgraphs = [cbgA, cbgB, cbgC]
    partGSG._GENETREE = gsg._GENETREE
    omsr_with_b = partGSG.overall_minimal_spanning_range()
    partGSG.codingblockgraphs = [cbgA, cbgC]
    omsr_without_b = partGSG.overall_minimal_spanning_range()

    # one enrichment check per node shared by cbgA and cbgC
    organisms = [gsg.organism_by_node(node) for node in shared_nodes]
    return any(
        len(omsr_with_b[org]) - len(omsr_without_b[org])
        >= minimal_scaffold_aa_enrichment
        for org in organisms
    )
def split_final_cbg_on_spanningrange_difference(self,
        sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
        sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
        sprdif_min_gtid_ratio=0.55,
        only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
        only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE ):
    """
    Try to split the final CBG on its right spanningrange difference (sprdif)
    and add the resulting CBG(s) behind it in the genestructure

    Steps, as performed by the code below:
    1) take the final CBG and test the `only_perform_if_...` thresholds
    2) require a right sprdif of the requested size
    3) split a deepcopy of the final CBG iteratively on its right sprdif
    4) turn each usable split into accepted CBGs
       (cbghmmsearch2pacbpcollection -> pacbpCollection2AcceptedCodingBlockGraphs)
    5) order the accepted CBGs and try to add them, both to a small helper
       GSG holding only the final CBG and to this GSG (self)
    6) when something was added: finalize the genestructure, join false
       inframe introns and recreate the CBG interfaces

    @type  sprdif_min_aa_length: integer
    @param sprdif_min_aa_length: minimal length of the sprdif in aa's

    @type  sprdif_min_node_count: integer
    @param sprdif_min_node_count: minimal number of nodes in a CBG to be
            eligible for trying a split

    @type  sprdif_min_gtid_ratio: float
    @param sprdif_min_gtid_ratio: a split CBG is skipped when its
            genetree().identity() divided by that of the current final CBG
            is below this ratio; a false value disables the check

    @type  only_perform_if_stopcodon_tw_ratio_lte: float
    @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
            (stopcodongraph total_weight of the final CBG /
            self.EXACT_SG_EDGE_COUNT) <= threshold; a false value disables
            the check

    @type  only_preform_if_cbg_id_gte: float
    @param only_preform_if_cbg_id_gte: run function only when
            lastCBG.genetree().identity() >= threshold; a false value
            disables the check

    @rtype:  Boolean
    @return: True when at least one CBG was added, False otherwise
    """
    # get the CBG that is labelled as IS_LAST=True
    current_last = self.get_final_cbg()

    # check if we are allowed to perform this function:
    # for groups of genes with very low identity, this function
    # is more likely to decrease the result than to improve the result

    # make AlignedStopCodonGraph
    current_last.align_stop_codons()
    tw_current = current_last._stopcodongraph.total_weight()
    # NOTE(review): under Python 2 this is an integer division when both
    # operands are ints -- confirm that total_weight() returns a float
    ratio = tw_current / self.EXACT_SG_EDGE_COUNT

    # now check if it is allowed to enter the function: only_perform_if_...
    if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
        return False
    if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
        return False

    # check for right sprdif of requested size; if not => return False
    if not current_last.has_rigth_spanningrange_difference(
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count):
        # no right spanningrange difference -> done & return
        return False

    # make a deepcopy and clear cache of the one that will be processed,
    # so the splitting below works on the copy and not on current_last
    last = deepcopy(current_last)
    last.clear_cache()

    # iteratively split
    splits = last.iteratively_split_codingblock_on_spanningrange_difference(
            side='rigth',
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count,
            )

    # was the split successful?
    if len(splits) == 1:
        # no splits => done here!
        return False

    # when here process the sprdif CBGs
    # 1) cbghmmsearch2pacbpcollection
    # 2) pacbpCollection2acceptedcbgs
    all_accepted_cbgs = []
    # loop over the splits; except for the most left one (the input `last` CBG)
    for splittedCBG in splits[1:]:
        # low similarity regions are not completed by hmmsearch
        if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue

        # complete with cbghmmsearch2pacbpcollection
        # get ratio of the GTG of this CBG (relative to the current final CBG);
        # note that `ratio` is re-used here with a different meaning than above
        ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()
        # if ratio is bad -> do not perform!
        if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio:
            continue

        pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                prev=last,
                pacbp_min_length=sprdif_min_aa_length,
                hmmsearch_num_hits=3
                )

        # get list of accepted CBGs
        accepted = conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
        all_accepted_cbgs.extend( accepted )

    # if no accepted ones -> return False
    if not all_accepted_cbgs: return False

    # order graphs by total weight
    all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
    # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
    all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

    # and now try to add the accepted cbgs into the genestructure
    # speedup the process by creating a tiny GSG object of only the last CBG
    # but, set the _GENETREE attribute to the genetree of the main GSG
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    lastGSG = GenestructureOfCodingBlockGraphs(self.input)
    lastGSG.add_codingblock(current_last)
    lastGSG._GENETREE = self._GENETREE
    RETURN_STATUS_CBG_IS_ADDED = False
    for cbgL in all_accepted_cbgs:
        # only CBGs with as many nodes as the current final CBG are allowed here!
        if cbgL.node_count() != current_last.node_count(): continue

        # dry run on the tiny GSG first (only_try_adding=True)
        if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
            # it is addable; prepare for final addition to the genestructure.
            # The flags are set BEFORE the actual addition below is attempted.
            lsrCBG = None
            cbgL.IS_SPLITTED    = False
            cbgL.IS_5P_SPLITTED = False
            cbgL.IS_FIRST       = False
            cbgL.IS_LAST        = True
            current_last.IS_LAST = False
            # if identical nodes -> create a lsrCBG
            if not cbgL.node_set().difference(current_last.get_nodes()):
                current_last.IS_SPLITTED    = True
                current_last.IS_3P_SPLITTED = True
                cbgL.IS_SPLITTED            = True
                cbgL.IS_5P_SPLITTED         = True
                lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                        current_last, cbgL )
                # an lsrCBG without nodes is of no use -> discard it
                if not lsrCBG.node_count(): lsrCBG = None

            # now add the new last CBG
            # NOTE(review): the status of the addition to the main GSG (self)
            # is overwritten by the status of the addition to lastGSG, so the
            # return value reflects lastGSG only -- confirm this is intended
            status = self.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )
            status = lastGSG.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )

            # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
            if status:
                RETURN_STATUS_CBG_IS_ADDED = True
                # unconditional status output to STDOUT
                print cbgL
                print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
        else:
            # not placeable in the genestructure
            pass

    # in exceptional cases, 2 CBGs can be added. In case the node_set() is identical,
    # yet another lsrCBG has to be created in between these 2 new CBGs.
    # Check this in the main GSG (NOT in the lastGSG): when a lsrCBG is added here,
    # splits are added to the surrounding CBGs. Because of call-by-reference, these
    # splits are added to the main GSG (self) too, and adding the same lsrCBG
    # will fail (splitted CBGs are skipped!)
    if RETURN_STATUS_CBG_IS_ADDED:
        self.finalize_genestructure()
        if self.join_false_inframe_introns():
            print "EXTRA lsrCBG added!!"

        # recreate interfaces if there is a new one created
        self.create_cbginterfaces()

    # return the return status True|False
    return RETURN_STATUS_CBG_IS_ADDED
def replace_scaffold_breaking_cbgs(self,verbose=False):
    """
    (Try) to replace CBGs that break the GSG scaffold by other CBGs

    For each eligible CBG (walking the positions of
    cbgpositerator(reversed=True), skipping the first one), the CBGs behind
    it are scanned for the first one that shares nodes with it. When one or
    more CBGs WITHOUT mutual nodes lie in between, these intermediate CBGs
    break the scaffold and are handled as follows:

    1) if the largest OMSR distance between the two scaffold CBGs is <= 1,
       the intermediate CBGs are removed and the facing CBG interfaces of
       both scaffold CBGs are reset to None
    2) otherwise a temporary GSG of only the two scaffold CBGs is asked to
       create intermediate CBG(s) for the gap; these replace the current
       intermediate CBGs when their summed total_weight is higher

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  Boolean
    @return: Is any CBG replaced (or removed)?
    """
    # Boolean return value
    scaffold_breaking_cbg_replaced = False

    for cbgpos in self.cbgpositerator(reversed=True)[1:]:
        cbg = self.codingblockgraphs[cbgpos]
        if cbg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue
        if cbg.node_count() < self.EXACT_SG_NODE_COUNT:
            continue

        # loop forwards through the GSG and look for mutual nodes;
        # the scan stops at the first CBG that shares nodes with `cbg`
        identical_nodes = []
        for comparepos in range(cbgpos+1, len(self)):
            comparecbg = self.codingblockgraphs[comparepos]
            if cbg.mutual_nodes(comparecbg):
                identical_nodes.append(True)
                break
            identical_nodes.append(False)

        # Only a list like [ False, ..., True ] is of interest. Skip:
        #   []            final CBG
        #   [True]        direct neighbor has mutual nodes
        #   [False, ...]  no mutual nodes at all
        # (True can only occur as the last element because of the break)
        if len(identical_nodes) < 2 or not identical_nodes[-1]:
            continue

        # get total_weights of the intermediate CBGs
        tws = [self.codingblockgraphs[pos].total_weight()
               for pos in range(cbgpos+1, comparepos)]

        ################################################################
        if verbose:
            print("%s %s %s" % (cbgpos, comparepos, identical_nodes))
            print(cbg)
            print(tws)
            print(comparecbg)
        ################################################################

        # first, check if the CBGs are already partially overlapping;
        # if so, get rid of the intermediate CBG(s)
        omsrdist = cbg.omsr_distance_between_codingblocks(comparecbg)
        if max(omsrdist.values()) <= 1:
            # yes, all organisms glue these CBGs perfectly together;
            # just remove the intermediate(s) without further checks
            cbg._CBGinterface3p = None
            comparecbg._CBGinterface5p = None
            # slice assignment replaces the deprecated list.__setslice__()
            self.codingblockgraphs[cbgpos+1:comparepos+1] = [comparecbg]
            scaffold_breaking_cbg_replaced = True
            # go to the next cbg in the list
            continue

        # do a more elaborate check by trying to create a CBG
        # in this large scaffold gap
        from graph_genestructure import GenestructureOfCodingBlockGraphs
        partialGSG = GenestructureOfCodingBlockGraphs(self.input)
        partialGSG.codingblockgraphs = [cbg, comparecbg]
        partialGSG._GENETREE = self._GENETREE
        partialGSG.create_large_intermediate_cbg_for_scaffold_gap(
                sprdif_min_node_count = 2,
                cbg_min_node_count = self.EXACT_SG_NODE_COUNT,
                verbose = verbose )

        if len(partialGSG) == 2:
            # still only the two scaffold CBGs: nothing was created
            ############################################################
            if verbose:
                print("NO scaffold CBGs found!")
            ############################################################
            continue

        new_tws = [_cbg.total_weight() for _cbg in
                   partialGSG.codingblockgraphs[1:-1]]
        do_replace = sum(new_tws) > sum(tws)
        if do_replace:
            # replace the intermediate CBGs by the newly created one(s)
            self.codingblockgraphs[cbgpos+1:comparepos] = \
                    partialGSG.codingblockgraphs[1:-1]
            scaffold_breaking_cbg_replaced = True

        ############################################################
        if verbose:
            if do_replace:
                print("REPLACING scaffold-breaking CBGs!!!")
            else:
                print("MAINTAINING scaffold-breaking CBGs!!!")
            for partcbg in partialGSG:
                print(partcbg)
        ############################################################

    # return if CBGs are removed
    return scaffold_breaking_cbg_replaced