def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False): """ Search CBGs in genestructure for lowsimilarity regions """ ################################################################ if verbose: stw = StopWatch(name='lsrCBGsearch') stw.start() ################################################################ # Loop reversed through genestructure to make sure that once # a CBG is splitted, the positions of the remainder of the # list stay intact. for posinGSG in range(len(self)-1,-1,-1): sg = self.codingblockgraphs[posinGSG] # skip IGNORED, lsrCBG and CBGs that are incomplete (still await HMM completion) if sg.IS_IGNORED: continue if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue if verbose: print stw.lap(), posinGSG, "start" # check for potential aligned intron if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length): ######################################################## if verbose: print stw.lap(), posinGSG, "found" for k,v in sg.getomsrproteinsequences().iteritems(): print ">%s\n%s\n" % (k,v) print "ABOUT TO SPLIT:", sg print sg._cexpander.binarystring, print sg._cexpander.projected_on sg.printmultiplealignment() for k,pacbp in sg.pacbps.iteritems(): print k, pacbp ######################################################## # now actually split by inframe intron res = sg.split_codingblock_by_inframe_intron() if len(res) == 1: # no inframe intron found here pass else: # prepare the CBGs for insertion for pos in range(0,len(res)): splittedCBG = res[pos] splittedCBG.extend_pacbporfs(self.input) splittedCBG.update_edge_weights_by_minimal_spanning_range() splittedCBG.IS_SPLITTED = True if pos > 0: splittedCBG.IS_5P_SPLITTED = True splittedCBG.IS_FIRST = False if pos < len(res)-1: splittedCBG.IS_3P_SPLITTED = True splittedCBG.IS_LAST = False # (re)create the cache for the splitted CBGs splittedCBG.create_cache() ################################################ if verbose: print stw.lap(), posinGSG, "done!" 
print "SUCCESFULLY SPLITTED:", splittedCBG splittedCBG.printmultiplealignment() print splittedCBG._cexpander.binarystring, print splittedCBG._cexpander.projected_on print splittedCBG._omsr for trf in splittedCBG._cexpander._transferblocks: print trf.binarystring, trf.projected_on for k,v in splittedCBG._cexpander.inputsequences.iteritems(): print v,"\t",k for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems(): print orflist[0], _org for pacbp in splittedCBG.pacbps.values(): print pacbp pacbp.print_protein(_linesize=100) ################################################ # create lsrCBGs and cbgIFs between them by looping in reversed # order over all pairs of CBGs (because lsrCBG insertion in list) for pos in range(len(res)-2,-1,-1): cbgL,cbgR = res[pos:pos+2] lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) res.insert(pos+1,lsrCBG) # create cbgIF between the CBGs and the lsrCBG # just create -> cbgIF with lsrCBG is immediately is_optimal() cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG) cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR) # set cbgIF objects to the CBGs and the lsrCBG cbgL._CBGinterface3p = cbgIFa lsrCBG._CBGinterface5p = cbgIFa lsrCBG._CBGinterface3p = cbgIFb cbgR._CBGinterface5p = cbgIFb # update the first and last CBG in this list with the # cbgIFs of the parental CBG (variable sg) res[0]._CBGinterface5p = sg._CBGinterface5p res[-1]._CBGinterface3p = sg._CBGinterface3p # update the original IS_FIRST/IS_LAST status res[0].IS_FIRST = sg.IS_FIRST res[-1].IS_LAST = sg.IS_LAST # and set splittedCBGs to genestructure # by replacing the existing CBG (variable sg) on the # position posinGSG with the list op splitted CBGs self.codingblockgraphs.__setslice__(posinGSG,posinGSG+1,res) else: # nope, no potential inframe intron; just append ###print sg.total_weight(), False pass
def cbg_cexpander_inframe_intron_search(self, min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON, min_intron_nt_length = MIN_INTRON_NT_LENGTH, verbose=False): """ @type self: CodingBlockGraph @param self: CodingBlockGraph instance @type min_total_pssm_score: float @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON @type min_intron_nt_length: integer @param min_intron_nt_length: MIN_INTRON_NT_LENGTH @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list or False @return: list with new (sub)CBGs or False when not splitted """ ######################################################################## if verbose: stw = StopWatch(name="cexpCbgIfIntron") stw.start() ######################################################################## # return variable; list of splitted CBGs. return_cbg_list = [ self ] # create cexpander multiplealignment blocks cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander, verbose=verbose) # In freak-accident cases (one in thousends of times), cexpander produces # unequal amount of 1's in the binarystrings. This is theoretically impossible. # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns # False in these cases. Catch this here by quiting current # cbg_cexpander_inframe_intron_search() function call and return False TODO=True if not cbgMA: return False ######################################################################## if verbose: print stw.lap() blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) print self print "BLOCKS:", blockscnt, self._cexpander.binarystring, print self._cexpander.projected_on for org in cbgMA.keys(): print org, "\t", for blockid in range(0,blockscnt): if cbgMA[org][blockid].count("1") >= 1: print len(cbgMA[org][blockid]), else: print cbgMA[org][blockid], print "" ######################################################################## # loop over the aligned cexpander blocks and check the # non-uniformly aligned blocks for length variation blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) oricbgomsr = self.overall_minimal_spanning_range() for blockid in range(0,blockscnt): # obtain non-uniformly aligned AA lengths for this block lengths = {} for org in cbgMA.keys(): lengths[org] = cbgMA[org][blockid].count("0") # skip the uniformly aligned blocks if list(Set(lengths.values())) == [0]: continue #################################################################### if verbose: print stw.lap(), "lengths:", lengths #################################################################### # obtain coordinates for this area lsrcoords = {} for org in cbgMA.keys(): node = self.node_by_organism(org) coordSta = min(oricbgomsr[node]) # make summation of length of preceeding (non)aligned blocks for i in range(0,blockid): coordSta += cbgMA[org][i].count("1") +\ cbgMA[org][i].count("0") # end coord is start coord + length of current block coordEnd = coordSta + lengths[org] lsrcoords[org] = ( coordSta, coordEnd ) #################################################################### if verbose: print stw.lap(), "lsrcoords:", lsrcoords #################################################################### # translate AA lengths to NT lengths for k in lengths.keys(): lengths[k] = lengths[k]*3 # check lenght discrepancy and assign putative inframe introns putative_inframe_intron_orgs =\ _length_discrepancy_to_potential_inframe_introns(lengths) if not putative_inframe_intron_orgs: # no length discrepancy that can represent an inframe intron continue # organisms/genes for which an inframe intron 
can be an improvement # data dictionary. Keys: 'max_nt_length', 'min_nt_length', # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm' inframe_intron_criteria = {} # find putative inframe introns in assigned genes/organisms putative_inframe_introns = {} for org in putative_inframe_intron_orgs: # assign inframe intron criteria for this organism inframe_intron_criteria[org] = { 'min_nt_length' : min_intron_nt_length, 'min_total_pssm' : min_total_pssm_score, 'min_donor_pos' : (min(lsrcoords[org]) - 5) * 3, 'max_acceptor_pos' : (max(lsrcoords[org]) + 5) * 3, } # search for potential introns that can be responsible for this event theorf = self.get_orfs_of_graph(organism=org)[0] introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf, min_intron_nt_length=min_intron_nt_length ) ################################################################ if verbose: print "introns:", org, len(introns), "raw" ################################################################ # filter introns for all outside the OMSR, to short, to long, # total pssm_score etc introns = _filter_putative_inframe_intron_list( introns,org,inframe_intron_criteria) putative_inframe_introns[org] = introns ################################################################ if verbose: print "introns:", org, len(introns), "filtered" ################################################################ # check if all putative_inframe_intron_orgs have indeed introns # and check if all have at least a single intron phase in common if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]: # no introns in one or more organisms/genes -> continue continue if len( putative_inframe_introns )> 1: # do phase check in all organisms/genes phases = Set([0,1,2]) for org, intronlist in putative_inframe_introns.iteritems(): thisphases = Set([ intron.phase for intron in intronlist ]) phases.intersection_update(thisphases) if len(phases) == 0: ################################################################ if verbose: print "no mutual phase -> no cbgIF.is_optimal()" ################################################################ # no mutual phase -> no cbgIF.is_optimal() possible lateron continue else: pass # if an intron in at least a single organism is still there, # then split the involved pacbps in the `original` cbgL, the last # added CBG element in the return_cbg_list, and make a (virtual) # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps! cbgR = self.deepcopy() cbgL = self.deepcopy() # loop over the organisms/genes with inframe introns split # the Pacbps of these orgs in both to-become L and R CBGs inframe_intron_orgs = putative_inframe_introns.keys() for org in inframe_intron_orgs: ################################################################ if verbose: print "splitting PACBPs for org:", org print "L", cbgL print "R", cbgL ################################################################ node = self.node_by_organism(org) replacementsL = {} replacementsR = {} for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems(): if node in [node1,node2]: # get the pacbp of this pacbporf and split it! pacbp = pacb.conversion.pacbporf2pacbp(pacbporf) org1 = self.organism_by_node(node1) org2 = self.organism_by_node(node2) if org1 in putative_inframe_introns.keys() and\ org2 in putative_inframe_introns.keys() and\ inframe_intron_orgs.index(org) > 0: # already splitted; both orgs are inframe introns! 
continue # make split coordinates relative splitL = lsrcoords[org1][0] - pacbp.query_start splitR = lsrcoords[org1][1] - pacbp.query_start pacbpL = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitL,splitL),returnside='left') pacbpR = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitR,splitR),returnside='rigth') # check if both cbgL and cbgR make sence # if not -> return False! if not pacbpL: return False if not pacbpR: return False ######################################################## if verbose: print "#", node1, node2, lsrcoords[org1], print "L:", splitL, "R:", splitR print pacbp print pacbpL print pacbpR ######################################################## # pacbpL -> extented pacbporfL -> store to replacementsL newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL, pacbporf.orfQ,pacbporf.orfS) newpacbporfL.extend_pacbporf_after_stops() replacementsL[(key,node1,node2)] = newpacbporfL # pacbpR -> extented pacbporfR -> store to replacementsR newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR, pacbporf.orfQ,pacbporf.orfS) newpacbporfR.extend_pacbporf_after_stops() replacementsR[(key,node1,node2)] = newpacbporfR # do the pacbporf replacements in both CBGs statusL = _update_cbg_with_pacbporf_replacements( cbgL,replacementsL) statusR = _update_cbg_with_pacbporf_replacements( cbgR,replacementsR) # check if both cbgL and cbgR make sence if not statusL or not statusR: # return unchanged cbg status -> False return False # Verify the interface between cbgL and cbgR. # Most likely, the sites are nicely alignable. cbgIF = CodingBlockGraphInterface(cbgL,cbgR) cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.harvest_splice_sites() cbgIF.find_conserved_splice_sites() #################################################################### if verbose: print cbgL print cbgIF print cbgR cbgIF.interfaceproperties() #################################################################### # check the properties of the CBGinterface if cbgIF.optimalitycheck().count(True) >= 2: # yes; is_compatible and donor and/or acceptor is optimal cbgL._CBGinterface3p = cbgIF cbgR._CBGinterface5p = cbgIF cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, cbgR ] ################################################################ if verbose: print "INFRAME INTRON CONFIRMED!!" ################################################################ else: # no compatible interface... although intron(s) was/were found! # (at least) two options are now open: # 1. enforce the intron(s) and create cbgIF with _forced_ends # 2. ignore the intron(s) and create an intermediate lsrCBG # 1. is `tricky`. First, how sure is this inframe intron, # what type of criteria do we assume etc etc. # second, how to create a coorect cbgIF? It must be an # IS_SPLITTED interface, of which the boundaries might fall # outside the OMSR's of the CBGs. # 2. 
ignore the intron(s) and create an intermediate lsrCBG lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG) prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR) cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, lsrCBG, cbgR ] ################################################################ if verbose: print "no INFRAME INTRON -> lsrCBG" print cbgL print " ", lsrCBG._CBGinterface5p print " ", lsrCBG print " ", lsrCBG._CBGinterface3p print cbgR self.printmultiplealignment() print cbgL cbgL.printmultiplealignment() print cbgR cbgR.printmultiplealignment() ################################################################ # EOF this function. # return False if this CBG remained intact, list of splits when splitted if len(return_cbg_list) == 1: return False else: return return_cbg_list
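##########################################################################
# Usage sketch for cbg_cexpander_inframe_intron_search(); a minimal example,
# assuming `cbg` is a CodingBlockGraph with its _cexpander cache set and
# that the function is called in function style (its docstring documents
# `self` as a CodingBlockGraph parameter); adapt to a method call if it is
# bound to the CBG class. `gsg` and `pos` below are illustrative names only.
#
#   result = cbg_cexpander_inframe_intron_search(cbg, verbose=False)
#   if not result:
#       pass    # no inframe-intron evidence; cbg remains unchanged
#   else:
#       # result is [ cbgL, cbgR ] or [ cbgL, lsrCBG, cbgR ]; splice it into
#       # the genestructure at the position of the original cbg
#       gsg.codingblockgraphs.__setslice__(pos, pos+1, result)
##########################################################################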
def construct_final_tiny_cbg(self, max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH, max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH, take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS, take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS, take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS, maximal_current_stopcodongraph_average_weight=0.90, minimal_last_vs_new_identity_ratio=0.80, maximal_cexpander_cbg_tail_uniformity_aa_length=3, elegiable_donor_omsr_nt_offset=21, verbose=False): """ Make a tiny final CBG by ``shooting tiny exons into the deep`` """ # get current last CBG last = self.get_final_cbg() # check if final tail of this CBG is uniformaly alignable cxpdrOutput = cexpanderanalyses_omsr2orfend(last) IS_UNIFORMLY_ALIGNED = True for trf in cxpdrOutput._transferblocks: if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"): IS_UNIFORMLY_ALIGNED = False break ############################################################ if verbose: print "Cexpander uniformaly aligned:", print maximal_cexpander_cbg_tail_uniformity_aa_length, print "->", IS_UNIFORMLY_ALIGNED print "omsr: ", last._cexpander.projected_on, print last._cexpander.binarystring trf = cxpdrOutput.get_transfer_of_projected_on( last._cexpander.projected_on) if trf and trf != True: print "omsr2orfend:", last._cexpander.projected_on, print trf.binarystring ############################################################ if IS_UNIFORMLY_ALIGNED: # break out of this function. Chance of overpredicting # a final tiny exon is bigger then finding a True one! return False # check if the stopcodongraph is not (very) good already if last._stopcodongraph.average_weight() >=\ maximal_current_stopcodongraph_average_weight: # break out of this function. Chance of overpredicting # a final tiny exon is bigger then finding a True existing one return False # start the timer (performance benchmark in verbose mode) stw = StopWatch(name='stwFinalECG') stw.start() # get FinalExons on elegiable Orfs based on distance towards OMSR of # current last CBG and minimal acceptor site score omsr = last.overall_minimal_spanning_range() maxsr = last.maximal_spanning_range() ECG = ExonCollectionGraph() ################################################################ if verbose: print "currentLAST", last print last._stopcodongraph print last._stopcodongraph.is_optimal() for org in last.organism_set(): print org, last._stopcodongraph.is_optimal(organism=org) for organism in last.organism_set(): node = last.node_by_organism(organism) theorf = last.get_orfs_of_graph(organism=organism)[0] print organism, "\t", node, "\t", max(omsr[node]), "\t", print max(maxsr[node]), theorf.endPY/3 ################################################################ for organism in last.organism_set(): node = last.node_by_organism(organism) # calculate an offset for the acceptor position # variable elegiable_acceptor_omsr_nt_offset is needed to # enlarge the OMSR definded offset. When the OMSR is by chance # a few nt or aa larger than the actual exon length, the true # acceptor position can be erroneously abandoned. 
offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset theorf = last.get_orfs_of_graph(organism=organism)[0] # check if this final orf is self can serve as a final extension remaining_orf_nt_length = (theorf.protein_endPY - max(omsr[node])) * 3 remaining_maxsr_nt_length = (max(maxsr[node]) - max(omsr[node])) * 3 remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3 FIND_NEW_FINAL_ORFS = True STORE_CURRENT_ORF_AS_FIOO = False if remaining_maxsr_nt_length >= max_exon_nt_length: # exceptionally large maxsr on rigth side of omsr # store as FIOO but to NOT search for an orf extension! ### FIND_NEW_FINAL_ORFS = False # discarded 17/09/2009; when poos maxsr present, overruled! STORE_CURRENT_ORF_AS_FIOO = True elif remaining_maxsr_tostop_nt_length <= 18: # maxsr is less then 6 AA apart from stop on current orf #FIND_NEW_FINAL_ORFS = False STORE_CURRENT_ORF_AS_FIOO = True elif remaining_orf_nt_length < max_exon_nt_length: # final piece of unaligned sequence is a perfect HMM seed STORE_CURRENT_ORF_AS_FIOO = True else: pass if STORE_CURRENT_ORF_AS_FIOO: cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) ) # set pssm_score to (very) high; this rewards # using the current Orf as the last Orf cbs.pssm_score = 20.0 fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf) node = (organism,theorf.id,fioo.start,fioo.end) ECG.add_node_and_object(node,fioo) ################################################################ if verbose: print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY ################################################################ if not FIND_NEW_FINAL_ORFS: # quit here -> no orf extension of this CBG continue # get elegiable (new) final orfs orflist = self.input[organism]['orfs'].get_elegiable_orfs( max_orf_start=offset+max_intron_nt_length, min_orf_end=offset ) ################################################################ if verbose: print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3 ################################################################ for orf in orflist: results = find_tailing_exon_on_orf( theorf,orf, current_donor_pos=offset, max_tailingexon_nt_length=max_exon_nt_length, max_tailingexon_intron_nt_length=max_intron_nt_length, ) for exon,intron in results: node = (organism,orf.id,exon.start,exon.end) if node not in ECG.get_nodes(): ECG.add_node_and_object(node,exon) if verbose: print organism, node, exon if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count() # now take only the best `take_max_best_acceptors` # because there can be quite some of them! 
for organism in ECG.organism_set(): objects = ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True ) for obj in objects[take_max_best_acceptors:]: node = (organism,obj.orf.id,obj.start,obj.end) ECG.del_node(node) if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score ######################################################################## if verbose: print stw.lap(), ">take_max_best_acceptors DELETED" for organism in ECG.organism_set(): for obj in ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True ): print "remaining", organism, obj.orf.id, obj.length, obj ######################################################################## # only continue if all organisms are represented in the ECG if last.organism_set_size() > ECG.organism_set_size(): if verbose: print "To few organisms/genes present -> return False" return False # create edges in the ECG between compatible phases and # exon length, then make pacbps for these edges ECG.create_edges() ECG.make_pacbps_for_edges() if verbose: print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps) # search for complete graphs in this last_exon_graphs = ECG.find_fully_connected_subgraphs() ######################################################################## if verbose: print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()", print len(last_exon_graphs) ######################################################################## # only continue if there is an perfectly aligned last exon graph if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0): #################################################################### if verbose: print "no perfect aligned last exon graph -> return False" #################################################################### return False # convert to CodingBlockGraphs new_last_cbgs = [] for leg in last_exon_graphs[0:take_max_best_ecgs]: cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last) if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size(): # create cache of CBG and do final check on quality cbg.create_cache() if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\ cbg._cexpander.binarystring.find("1") == -1: # discard hardly alignable CBGs continue # if here, then append this cbg as a possible novel final CBG new_last_cbgs.append( cbg ) ################################################################ if verbose: print "LEGcbg", cbg ################################################################ ######################################################################## if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs) ######################################################################## if not new_last_cbgs: #################################################################### if verbose: print "no ecgs convertable to CBGs -> return False" #################################################################### return False # order by total weight, get the optimal CBG and its corresponding ECG new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs) theNewLastCbg = None cbgIF = None # check all interfaces between the novel final CBGs and the previous # CBG. The best interface is added to the GSG! 
cbgif_accepted_new_last_cbgs = [] already_checked_node_sets = [] for newcbg in new_last_cbgs[0:take_max_best_cbgs]: lastExonGraph = newcbg._ExonCollectionGraph del( newcbg._ExonCollectionGraph ) # check if it is not the extention of the current # last CBG (identical nodes) if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0: if verbose: print "newCBG is the extention of current last CBG!!" continue # check if this combination of nodes (orfs) has not been tried already if newcbg.get_ordered_nodes() in already_checked_node_sets: ############################################################### if verbose: print "newCBG node set done earlier:", print newcbg.get_ordered_nodes() ############################################################### continue else: # append this set of nodes (as a list) to checklist already_checked_node_sets.append( newcbg.get_ordered_nodes() ) # check if this new final tinyexon graph has a compatible interface # with the current last one cbgIF = CodingBlockGraphInterface(last,newcbg) cbgIF.harvest_splice_sites() distinct_orgs = [] for node in lastExonGraph.get_nodes(): exon = lastExonGraph.get_node_object(node) if exon.acceptor.__class__.__name__ == 'SpliceAcceptor': distinct_orgs.append( lastExonGraph.organism_by_node(node) ) cbgIF.allow_intron_in_organisms(distinct_orgs) cbgIF.find_conserved_splice_sites() # do NOT optimize -> consumes a lot of time and is helpfull # only in extreme cases... #cbgIF.optimize() if not cbgIF.is_compatible(): ################################################################ if verbose: print "newCBG not a is_compatible() cbgIF" print newcbg ################################################################ continue # append to cbgif_accepted_new_last_cbgs newcbg._CBGinterface5p = cbgIF cbgif_accepted_new_last_cbgs.append( ( cbgIF.optimalitycheck().count(True), newcbg.total_weight(), newcbg ) ) ######################################################################## if verbose: print stw.lap(), "cbgIFs checked %s/%s" % ( len(cbgif_accepted_new_last_cbgs), len(new_last_cbgs[0:take_max_best_cbgs]) ) ######################################################################## # now start by adding the highest scoring newcbg first cbgif_accepted_new_last_cbgs.sort() cbgif_accepted_new_last_cbgs.reverse() ######################################################################## if verbose: print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs) for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs: print true_cnt,totalwt,newcbg._CBGinterface5p print newcbg ######################################################################## for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs: # get the already created cbgIF from the newcbg graph cbgIF = newcbg._CBGinterface5p # now check 4 criteria: # (1) cbgIF.is_optimal() (2) >GTG.identity # (3) >STG.totalweight (4) <STG.distance criteria = [] criteria.append( cbgIF.is_optimal() ) criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() ) criteria.append( newcbg.genetree().identity() > last.genetree().identity() ) criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() ) #################################################################### if verbose: print "TRYING ADDITION of final newcbg", criteria print true_cnt,totalwt,newcbg._CBGinterface5p print newcbg #################################################################### # check if there is only a single different 
node/orf changed in the newcbg # this is recognized by a symmetric_difference of size 2 # in this case, be very strict! This easily causes overprediction (FP) tiny exons if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2: # check if 4 criteria are valid; # a single False results in not accepting this new last tiny cbg if False in criteria: if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria # continue -> no new tiny CBG continue # now start check the criteria. # if criteria[0] == True, means a fully is_optimal interface! # do not perform any additional check, just add! if criteria[0] == True: theNewLastCbg = newcbg break # total weight criterion -> new.tw() > last.tw() if criteria[1] == False: ########################################################################## if verbose: print "# NOVEL lastTinyExon discarded; to low total weight" print "#", newcbg._stopcodongraph ########################################################################## # continue -> no new tiny CBG continue # identity criterion -> allow a ratio i.s.o. new.id() > last.id() # this strict criterion (>) is applied for single-new-orf-CBGs if criteria[2] == False: ratio = newcbg.genetree().identity() / last.genetree().identity() if ratio < minimal_last_vs_new_identity_ratio: ###################################################################### if verbose: print "# NOVEL lastTinyExon discarded; to low identity" print "#", newcbg._stopcodongraph, newcbg.genetree().identity() ###################################################################### # continue -> no new tiny CBG continue if criteria[3] == False: ########################################################################## if verbose: print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance" print "#", newcbg._stopcodongraph ########################################################################## # continue -> no new tiny CBG continue # if this point is reached, a new tiny last CBG has been found! theNewLastCbg = newcbg # break out of the for loop; store into the genestructure break # all okay -> ready for inserting the new CBG if theNewLastCbg and verbose: ################################################################################ print "NEW FINAL TINY EXON FOUND!!" print theNewLastCbg print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable() print cbgIF._optimal_aligned_donor, cbgIF.donor_phase() print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase() ################################################################################ # hard-insert into the genestructure # using add_codingblock is likely to cause problems # because of the tinyness of the CBG if theNewLastCbg: for pos in range(0,len(self)): if self.codingblockgraphs[pos].IS_IGNORED: continue if self.codingblockgraphs[pos].IS_LAST: thelast = self.codingblockgraphs[pos] thelast.IS_LAST = False newcbg.IS_LAST = True self.codingblockgraphs.insert(pos+1,theNewLastCbg) # set the CBGInterface object in next and prev CBG self.codingblockgraphs[pos]._CBGinterface3p = cbgIF self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF # break out; end of this function break # done! return a True because newcbg is created & inserted return True else: # no newLastCbg found return False
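##########################################################################
# Usage sketch for construct_final_tiny_cbg(); a minimal example, assuming
# `gsg` is the genestructure object this method belongs to. All thresholds
# default to the SHORT_TAILINGEXON_* settings referenced in the signature.
#
#   if gsg.construct_final_tiny_cbg(verbose=True):
#       # a novel tiny final CBG was created, hard-inserted after the
#       # previous last CBG, and both cbgIF interfaces were set
#       newlast = gsg.get_final_cbg()
#       print "new final CBG:", newlast
#   else:
#       # nothing added: the last CBG tail was uniformly aligned, its
#       # stopcodongraph was already good enough, or no candidate survived
#       pass
##########################################################################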
def mine(self, identifier, verbose=None): """ """ # (re)set mined results to empty self._data = [] self._loci = [] # start timer stw = StopWatch("dbwarehouseMiner.mine('%s')" % identifier) if verbose: print stw.start() # find the current identifier in the warehouse identifier = identifier.replace("'", "").replace('"', '').strip() if not identifier: return False genomedir = self.identifier2genomedir(identifier) if not genomedir: return False # append the main/central locusdir to the loci locusdir = self.identifier2locusdir(identifier, genomedir=genomedir) if not locusdir: return False self._loci.append(locusdir) if verbose: print stw.lap(), "main locus identified" # now mine in the warehouse if self.SEARCH_METHOD != 'SIMILARITY': # set some column restraints as VERY strict (&&) i.s.o loose (||) column_restrain = "&&" else: column_restrain = "||" ####genomedirtag = os.path.basename(os.path.split(genomedir)[0]) genomedirtag = os.path.basename(genomedir) blastarchpatAB = os.path.join( self.dbwarehouse_path, "_crossblastp", "blast.%s_x_*.symmetrized" % (genomedirtag)) blastarchpatBA = os.path.join( self.dbwarehouse_path, "_crossblastp", "blast.*_x_%s.symmetrized" % (genomedirtag)) basecommand = """ awk -F':' '{ print $1"\\t"$2 }' | awk """ +\ """ '{ if (($5>=%1.3f %s $6>=%1.3f) && ($7>=%1.3f %s $8>=%1.3f) && """ % ( self.MINIMAL_OVERLAP_RATIO, column_restrain, self.MINIMAL_OVERLAP_RATIO, self.MINIMAL_BITSCORE_RATIO, column_restrain, self.MINIMAL_BITSCORE_RATIO, ) +\ """ (($5/$6)<=%1.2f %s ($6/$5)<=%1.2f)) { print $0"\t"(($5+$6)*$4)/2 } }' """ % ( self.MAXIMAL_LENGTH_RATIO, column_restrain, self.MAXIMAL_LENGTH_RATIO, ) +\ """ | sort -gr -k 9 """ # commands with grep and zgrep for *.symmetrized and *.symmetrized.gz files command_grep = """grep "%s" %s %s | sort -u | %s""" % ( identifier, blastarchpatAB, blastarchpatBA, basecommand) command_zgrep = """zgrep "%s" %s %s | sort -u | %s""" % ( identifier, blastarchpatAB + ".gz", blastarchpatBA + ".gz", basecommand) # run the grep command ci, co, ce = os.popen3(command_grep) ci.close() lines = co.readlines() co.close() ce.close() # run the zgrep command ci, co, ce = os.popen3(command_zgrep) ci.close() lines.extend(co.readlines()) co.close() ce.close() seentags = [] ignoretags = [] for line in lines: fname, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB, order = line.strip( ).split("\t") if fname.find(".symmetrized.gz") >= 0: # process the lines obtained with the zgrep command tagA, tagB = fname[0:fname.find(".symmetrized.gz" )][fname.find("/blast.") + 7:].split("_x_") else: # process the lines obtained with the (normal) grep command tagA, tagB = fname[0:fname.find(".symmetrized" )][fname.find("/blast.") + 7:].split("_x_") # ignore the line completely when a limitation on genomedirs is applied and valid if self.genometags_to_use: if not (tagA in self.genometags_to_use and tagB in self.genometags_to_use): continue if self.genometags_to_ignore: if tagA in self.genometags_to_ignore or tagB in self.genometags_to_ignore: continue # ignore this line when one of the tags are (in) ignoretags if tagA in ignoretags: continue if tagB in ignoretags: continue # swap tagA & tagB when the tag's are in reversed order # this is due to the dbwarehouse crossblastp files # blast.B_x_A.symmetrized.gz isa symbolic link to # blast.A_x_B.symmetrized.gz if B > A (in string order) ordered_tags = [tagA, tagB] ordered_tags.sort() if [tagA, tagB] != ordered_tags: # swap tagA & tagB tagA, tagB = tagB, tagA if self.SEARCH_METHOD == 'HOMOLOGS': if self.ALLOW_PARALOGS: pass 
else: if tagA == tagB: continue if tagA in seentags and tagB in seentags: continue elif self.SEARCH_METHOD == 'BDBH': if tagA == tagB and self.ALLOW_PARALOGS: if tagA in [tup[0] for tup in self._data]: continue # there is already a fine hit gathered else: pass else: if tagA == tagB and not self.ALLOW_PARALOGS: continue if tagA in seentags and tagB in seentags: continue elif self.SEARCH_METHOD == 'SAFEORTHOLOGS': if tagA == tagB: # check if there is not a paralog in the identifier's species it self # that is to close nearby this identifier (a hypothetical paralogue) ratioA, ratioB = float(ratioA), float(ratioB) if max([ratioA, ratioB]) > self.SAFEORTHOLOGS_RATIO: # there is in its own genome a hypothetical paralogue! # empty data and break out! self._data = [] break else: continue elif tagA in seentags and tagB in seentags: if idA == identifier: ratio = float(ratioA) thetag = tagB else: ratio = float(ratioB) thetag = tagA maxratio = self._getfromdata(self._data, thetag)[5] if min([ratio / maxratio, maxratio / ratio ]) > self.SAFEORTHOLOGS_RATIO: # remove this tag from data -> ortholog assignment is not 100% shure! self._removefromdata(self._data, thetag) ignoretags.append(thetag) continue else: continue else: pass else: # mode similarity -> all hits are okay pass # append tags to seentags if tagA not in seentags: seentags.append(tagA) if tagB not in seentags: seentags.append(tagB) # if here, a similar protein is mined! # gather locusdir and similarity data bitscore = int(float(bitscore)) overlapA = float(overlapA) overlapB = float(overlapB) ratioA = float(ratioA) ratioB = float(ratioB) #if idA == identifier: # self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB )) # ###print line.strip() # ###print "A", self._data[-1],"\n" #else: # self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA )) # ###print line.strip() # ###print "B", self._data[-1],"\n" if idA == identifier: self._data.append( (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB)) elif idB == identifier: self._data.append( (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA)) elif idA.find(identifier) == 0: self._data.append( (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB)) elif idB.find(identifier) == 0: self._data.append( (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA)) else: print "WHAT ELSE!?::", tagA, tagB, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB # remove the TEMPORARILY element in mode SAFEORTHOLOGS if self.SEARCH_METHOD == 'SAFEORTHOLOGS': self._removefromdata(self._data, genomedirtag) # order _data on bitscore tmpdata = [] for item in self._data: tmpdata.append((item[2], item)) tmpdata.sort() tmpdata.reverse() self._data = [item for (s, item) in tmpdata] print len(self._data), self.maximal_num_loci # remove _data elements when self.maximal_num_loci is exceeded if len(self._data) > self.maximal_num_loci - 1: if (self.verbose and verbose == None) or verbose: # print the removed loci to screen print "# removed loci (%s): --maximal_num_loci (%s) exceeded" % ( len(self._data) - self.maximal_num_loci + 1, self.maximal_num_loci) for tup in self._data[self.maximal_num_loci - 1:]: row = list(tup) row.insert(0, genomedirtag) row.insert(2, identifier) print "\t".join([str(elem) for elem in row]) # now actually remove the rows from _data # minus 1 is for the --identifier locus itself self._data = self._data[0:self.maximal_num_loci - 1] # get the loci belonging to the mined similar proteins for (tagB, idB, bitscore, overlapA, overlapB, ratioA, 
ratioB) in self._data: tagBgenomedir = os.path.join(self.dbwarehouse_path, tagB) locusdir = self.identifier2locusdir(idB, genomedir=tagBgenomedir) if not locusdir: print "HEROOO...." self._loci.append(locusdir) # add genomedirtag and identifier to _data rows for i in range(0, len(self._data)): row = list(self._data[i]) row.insert(0, genomedirtag) row.insert(2, identifier) self._data[i] = tuple(row) if (self.verbose and verbose == None) or verbose: # print the results! print "# main (1th) and mined loci" for locus in self._loci: print locus print "# similarity data" #for ( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ) in self._data: # print "\t".join([ str(elem) for elem in [genomedirtag, tagB, identifier, idB, bitscore, overlapA, overlapB, ratioA, ratioB ]]) for row in self._data: print "\t".join([str(elem) for elem in row]) print "# settings/options" print "seentags: ", seentags print "ignoretags:", ignoretags print "use: ", self.genometags_to_use print "ignore: ", self.genometags_to_ignore print "# timing/performace" print stw.lap() return self._loci, self._data
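##########################################################################
# Usage sketch for dbwarehouseMiner.mine(); a minimal example, assuming
# `miner` is a configured dbwarehouseMiner instance (dbwarehouse_path,
# SEARCH_METHOD, ratio thresholds etc. already set). The identifier string
# shown here is purely illustrative.
#
#   result = miner.mine("examplegene.locus001", verbose=True)
#   if not result:
#       print "identifier not found in the warehouse"
#   else:
#       loci, data = result
#       # loci : list of locus directories (the main/central locus first)
#       # data : per-hit tuples (genomedirtag, tag, identifier, id, bitscore,
#       #        overlapA, overlapB, ratioA, ratioB), ordered by bitscore
#       for row in data:
#           print "\t".join([ str(elem) for elem in row ])
##########################################################################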
def blastanalysescbgjunction( gsg, prevCBG, nextCBG, omit_cbg_orfs=False, omit_non_cbg_orfs=False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org, orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf for org, orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input, prevCBG, nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n")) ############################################################ if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org, orfid = header.split("_orf_") orfid = int(orfid) node = (org, orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del (fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del (fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) ############################################################ if verbose: print stw.lap(), "fasta db (2):", len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa" writeMultiFasta(fastadbmfa, fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ, orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ + "_orf_" + str(orfQ.id) # check if key exists in fastadbmfa. 
In a case where # an Orf is masked out completely, it is absent here! if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS, int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ################################################################ if verbose: print pacbporf, orgQ, orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) 
take the actual pacbps into account) dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print[p.bitscore for p in dpcpacbpcol.pacbps.values()] print "PCG nodes:", dpcpacbpcol.get_ordered_nodes() ################################################################ #### do some transformations on the pacbpcol ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1) ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs( #### edges=gsg.node_count()-1 , max_missing_edges=0 ) ##### convert to list of CBGs and do some transformations ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) ####cbgList.remove_all_but_complete_cbgs() ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) ####cbgList.remove_cbgs_without_omsr() ####cbgList.update_edge_weights_by_minimal_spanning_range() ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2]) pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity) max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3 splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.make_pacbps_for_missing_edges() cbgList.remove_all_but_complete_cbgs() cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight', reversed=True) # and create_cache() for these CBGs for cbg in cbgList: cbg.create_cache() #################################################################### if verbose: print stw.lap(), "CBGs created", len(cbgList) for newcbg in cbgList: print "new:", newcbg #################################################################### # return list with CBGs return cbgList.codingblockgraphs
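##########################################################################
# Usage sketch for blastanalysescbgjunction(); a minimal example, assuming
# `gsg` is the genestructure object and prevCBG / nextCBG are neighbouring
# CBGs in gsg.codingblockgraphs with a problematic interface between them.
#
#   newcbgs = blastanalysescbgjunction(gsg, prevCBG, nextCBG, verbose=True)
#   if not newcbgs:
#       pass    # nothing alignable found in the masked junction sequences
#   else:
#       # candidate CBGs are ordered by total_weight(); the caller decides
#       # which one (if any) to place in between prevCBG and nextCBG
#       for cbg in newcbgs:
#           print cbg
##########################################################################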
def cexpander2multiplealignment(cxpdr, verbose=False): """ This function and its application are still under development. In future version, this cexpander obtained data will replace the (deprecated) PAOC and PASC VISTA-like tracks which were far to computationally expensive to obtain. """ ######################################################################## if verbose: stw = StopWatch(name="cxpdr2multiplealignment") stw.start() ######################################################################## # for each of the _transferblocks (1 for each organism/gene), the # binarystring **should** contain an identical number of 1's # in freak-accident cases (1 in hundreds of thousand of cases), # it is observed that this not the case. Catch this exception here # before it hard-crashes with a raise somewhere later in this function if len(Set([trf.binarystring.count("1") for trf in cxpdr._transferblocks])) > 1: print "WARNING: unequal Cexpander.transferblocks.binarystring 1's count:", print Set( [trf.binarystring.count("1") for trf in cxpdr._transferblocks]) return False # split the cexpander binarystrings on character changes 0->1 and 1->0 substrings = {} orgs = [trf.header for trf in cxpdr._transferblocks] for ipos in range(0, len(orgs)): org = orgs[ipos] trf = cxpdr._transferblocks[ipos] substrings[org] = [ x.group() for x in re.finditer("(1+|0+)", trf.binarystring) ] # maximum number of blocks in the cexpander output # WARNING TODO THIS IS STILL NOT 100% SAFE!! try: maxblocks = max( Set([len(substrings[org]) for org in substrings.keys()])) except: print "ERROR in cexpander2multiplealignment" print substrings.keys() print "inputseqs:", len(cxpdr.sequences) for k, v in substrings.iteritems(): print k, len(v) print v # now raise the error... maxblocks = max( Set([len(substrings[org]) for org in substrings.keys()])) curblock = 0 ######################################################################## if verbose: print "maxblocks:", maxblocks, print[len(substrings[org]) for org in substrings.keys()] if len(Set([len(substrings[org]) for org in substrings.keys()])) > 1: for ipos in range(0, len(orgs)): org = orgs[ipos] print org, print[ Set(substrings[org][block]) for block in range(0, len(substrings[org])) ] trf = cxpdr._transferblocks[ipos] print trf.binarystring, len(trf.binarystring), print trf.binarystring.count("1"), trf.binarystring.count("0") ######################################################################## while curblock < maxblocks: try: # create curblocktypeset curblocktypeset = Set("".join( [substrings[org][curblock] for org in substrings.keys()])) except IndexError: # substrings[org][curblock](s) IndexError # can happen on EOF blocks if some have zeros, others have nothing # append empty block; this will be dealth with in the # curblocktypeset Set("0") for org in substrings.keys(): if len(substrings[org]) == curblock: substrings[org].append("") # recreate curblocktypeset in 2th instance curblocktypeset = Set("".join( [substrings[org][curblock] for org in substrings.keys()])) ######################################################################## if verbose: print "curiter::", curblock, maxblocks, print[len(substrings[org][curblock]) for org in substrings.keys()] ######################################################################## if curblocktypeset == Set("1"): # block of just ones; settle this block by limiting on minimal length # of all organisms of 111-string. 
            curblocklengths = Set(
                [len(substrings[org][curblock]) for org in substrings.keys()])
            if len(curblocklengths) == 1:
                pass  # all normal...
            else:
                minlength = min(curblocklengths)
                for org in substrings.keys():
                    if len(substrings[org][curblock]) > minlength:
                        blocklen = len(substrings[org][curblock])
                        substrings[org][curblock] = substrings[org][curblock][0:minlength]
                        substrings[org].insert(curblock + 1,
                                               "1" * (blocklen - minlength))
                        substrings[org].insert(curblock + 1, "")
                # increase maxblocks counter
                maxblocks = max(
                    Set([len(substrings[org]) for org in substrings.keys()]))
                ####################################################################
                if verbose:
                    print "TRBLOCKS CHANGED!, curblock, maxblocks:", curblock, maxblocks,
                    print [len(substrings[org]) for org in substrings.keys()]
                    for ipos in range(0, len(orgs)):
                        org = orgs[ipos]
                        print org,
                        print [
                            Set(substrings[org][block])
                            for block in range(0, len(substrings[org]))
                        ]
                ####################################################################

        elif curblocktypeset == Set("0"):
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))

        elif curblocktypeset == Set(["0", "1"]):
            # situation where frontal or intermediate zeros complicate
            # the multiple alignment
            for org in substrings.keys():
                if Set(substrings[org][curblock]) == Set(['1']):
                    substrings[org].insert(curblock, "")
            # next, proceed as if curblocktypeset == Set("0") (which it is now!)
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))

        else:
            print "MIXED!!", curblocktypeset, "curblock:", curblock, "maxblocks:", maxblocks
            print "ERROR WILL LIKELY OCCUR QUICKLY AFTER HERE..."
            pass
            import sys
            sys.exit()

        # increase the blocks counter
        curblock += 1

    ########################################################################
    if verbose:
        for org in substrings.keys():
            # print the sequence itself
            for block in range(0, maxblocks):
                offset = sum([
                    substrings[org][i].count("1") + substrings[org][i].count("0")
                    for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset + blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset + blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset + nongaps].lower() + "-" * gaps,
            print org
            for block in range(0, maxblocks):
                print substrings[org][block],
            print org
    ########################################################################
    if verbose:
        for block in range(0, maxblocks):
            if substrings[substrings.keys()[0]][block].count("1") > 0:
                continue
            for org in substrings.keys():
                offset = sum([
                    substrings[org][i].count("1") + substrings[org][i].count("0")
                    for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset + blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset + blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset + nongaps].lower() + "-" * gaps,
                print substrings[org][block],
                print org
    ########################################################################
    if verbose:
        for org in substrings.keys():
            print org, "\t",
            for block in range(0, maxblocks):
                print len(substrings[org][block]),
                if substrings[org][block].count("1") == 0:
                    print "(%s,%s)" % (substrings[org][block].count('0'),
                                       substrings[org][block].count('.')),
            print "\t\t", sum(
                [len(substrings[org][block]) for block in range(0, maxblocks)])
        print stw.lap()
    ########################################################################

    return substrings
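

# --------------------------------------------------------------------------
# Minimal, self-contained sketch (illustration only) of the block-splitting
# step used in cexpander2multiplealignment() above: a cexpander binarystring
# is cut into alternating runs of 1's and 0's with re.finditer, and each run
# becomes one block of the multiple alignment. The binarystring below is a
# hypothetical example; this helper is never called by the module.
def _demo_binarystring_to_blocks():
    import re
    binarystring = "1110001111011"
    blocks = [m.group() for m in re.finditer("(1+|0+)", binarystring)]
    print blocks   # ['111', '000', '1111', '0', '11']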


def blastanalysescbgjunction(gsg, prevCBG, nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    Perform (masked) blastp analyses on the Orfs around the junction of two neighbouring CBGs
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input, prevCBG, nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n")
    )

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################
    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa:
        return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose:
        print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():
        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ:
                continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header):
                continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname,
                extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0:
                continue
            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_")
                if _orgS != orgS:
                    continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose:
        print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol.
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
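

# --------------------------------------------------------------------------
# Tiny self-contained sketch (illustration only; identifiers are
# hypothetical) of the node / FASTA-header convention used in
# blastanalysescbgjunction() above: graph nodes are (organism, orf_id) tuples
# and entries in the masked fasta database are keyed "<organism>_orf_<id>".
# This helper is never called by the module.
def _demo_node_header_convention():
    org, orfid = "organismA", 17
    node = (org, orfid)
    header = org + "_orf_" + str(orfid)
    parsed_org, parsed_orfid = header.split("_orf_")
    assert (parsed_org, int(parsed_orfid)) == node
    print node, header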


def _recrute_pacbporfs_from_parental_cbg(self, parentcbg, create_cache=True,
        ignore_nonexisting_edges=False, verbose=False):
    """
    Harvest PacbPORFs from a (parental) CodingBlockGraph

    @attention: alternative for harvest_pacbps_from_crossdata()
    @attention: required in _place_cbg_in_partialgsg() function
    @attention: use create_cache=False with care!

    @type  parentcbg: CodingBlockGraph
    @param parentcbg: CodingBlockGraph that has to deliver PacbPORFs

    @type  create_cache: Boolean
    @param create_cache: run the create_cache() function on the CBG (self)

    @type  ignore_nonexisting_edges: Boolean
    @param ignore_nonexisting_edges: when False, do not create edges in the
                CBG (self) that are absent (but present in the parentcbg)

    @type  verbose: Boolean
    @param verbose: print debugging information to STDOUT when True
    """
    replacements = {}
    substituted = 0

    ####################################################################
    if verbose:
        stw = StopWatch("recruteParentalPacbps")
        print stw.start()
        print "target:", self
        print "source:", parentcbg
    ####################################################################

    for (node1, node2) in self.pairwisecrosscombinations_node():
        # if this edge is not present in the parent, ignore it
        if not parentcbg.has_edge(node1, node2):
            continue

        # get PacbPORF of the parent
        origpacbporf = parentcbg.get_pacbps_by_nodes(node1=node1, node2=node2)[0]
        curpacbporf = None
        replace_pacbporf = False

        if not self.has_edge(node1, node2):
            if ignore_nonexisting_edges:
                # if ignore_nonexisting_edges -> do not recruit this pacbp
                continue
            else:
                # replace this PacbPORF if it exists and
                # simultaneously create a novel edge
                replace_pacbporf = True
        elif self.has_edge(node1, node2) and not \
             self.get_pacbps_by_nodes(node1=node1, node2=node2):
            replace_pacbporf = True
        else:
            curpacbporf = self.get_pacbps_by_nodes(node1=node1, node2=node2)[0]
            if pacb.comparison.IsIdenticalPacbPORF(origpacbporf, curpacbporf):
                # PacbPORFs are already identical; not relevant to copy
                continue
            if origpacbporf.issuperset(curpacbporf):
                # store to replacements dict
                replacements[(node1, node2)] = curpacbporf
                # remove from the CBG -> replacement in progress
                self.remove_pacbp(curpacbporf, node1, node2)
                replace_pacbporf = True

        # check if replace_pacbporf is set to True
        if replace_pacbporf:
            ################################################################
            if verbose:
                print stw.lap(), "REPLACING PacbPORF Source->Target:"
                print "T:", curpacbporf, "(current)"
                print "S:", origpacbporf
                origpacbporf.print_protein(_linesize=100)
            ################################################################
            newkey = origpacbporf.construct_unique_key(node1, node2)
            self.set_edge_weight(node1, node2, wt=origpacbporf.bitscore)
            self.pacbps[(newkey, node1, node2)] = origpacbporf
            substituted += 1

    # check if substitutions have taken place
    if create_cache and substituted:
        #####################################################################
        if verbose:
            print stw.lap(), "CREATE_CACHE & substituted PacbPORFS:",
            print substituted, "edges:", len(self.weights) / 2,
            print "pacbps:", len(self.pacbps)
            ####for k,pacbporf in self.pacbps.iteritems():
            ####    print k,"\n",pacbporf
        #####################################################################
        self.clear_cache()
        # check if there is an OMSR upon recreation; in very
        # exceptional cases, OMSR can get lost in this step
        if self.has_overall_minimal_spanning_range():
            self.create_cache()
            self.update_edge_weights_by_minimal_spanning_range()
        else:
            #############################################################
            if verbose:
                print stw.lap(), "OMSR got lost!",
                print "replacements:", len(replacements)
                for (n1, n2), curpacbporf in replacements.iteritems():
                    print "REP:", curpacbporf, n1, n2
            #############################################################
            # OMSR got lost! Restore the replacements dict and as such
            # restore the original PacbPs one by one (in random order),
            # and quit as soon as an OMSR is restored.
            for (node1, node2), curpacbporf in replacements.iteritems():
                newkey = curpacbporf.construct_unique_key(node1, node2)
                tobereplpacbporf = self.get_pacbps_by_nodes(node1=node1,
                                                            node2=node2)[0]
                # remove from the CBG
                self.remove_pacbp(tobereplpacbporf, node1, node2)
                # and place back the original one
                self.set_edge_weight(node1, node2, wt=curpacbporf.bitscore)
                self.pacbps[(newkey, node1, node2)] = curpacbporf
                substituted -= 1
                if self.has_overall_minimal_spanning_range():
                    self.create_cache()
                    self.update_edge_weights_by_minimal_spanning_range()
                    #########################################################
                    if verbose:
                        print stw.lap(), "OMSR restored, substitutions:",
                        print substituted
                        print "T:", self
                    ##########################################################
                    # break out of the for loop of PacbP replacement
                    break

    # return number of replaced/added pacbporfs
    return substituted
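

# --------------------------------------------------------------------------
# Abstract, self-contained sketch (illustration only, not the module's API)
# of the substitute-then-rollback pattern used in
# _recrute_pacbporfs_from_parental_cbg(): apply substitutions, and when a
# required invariant (in the method above: the OMSR) gets lost, restore the
# originals one by one until the invariant holds again. Names below are
# hypothetical; this helper is never called by the module.
def _demo_substitute_with_rollback(current, substitutions, invariant_holds):
    replaced = {}
    for key, newval in substitutions.items():
        # remember the original value, then substitute
        replaced[key] = current[key]
        current[key] = newval
    if not invariant_holds(current):
        # invariant broken: restore originals until it holds again
        for key, oldval in replaced.items():
            current[key] = oldval
            if invariant_holds(current):
                break
    return current

# Example (doctest-like):
#   >>> _demo_substitute_with_rollback({"a": 1, "b": 2}, {"b": 99},
#   ...     lambda d: sum(d.values()) < 50)
#   {'a': 1, 'b': 2}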