def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTreeGraph

    Each CBG node is added to the GeneTreeGraph under its organism
    identifier. For each node pair, the first PacbP(ORF) of that edge is
    sliced on the Overall Minimal Spanning Range (OMSR) of the CBG and the
    identityscore of that slice is used as edge weight. The aa identity,
    bitscore ratio and nt identity of the pair are stored on the
    GeneTreeGraph in both (o1,o2) and (o2,o1) orientation.

    @attention: function just converts, error check is not performed here!
    @attention: a node pair that has no edge in the CBG gets a zero-weight
                edge and 0.0 for all its statistics

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    # NOTE(review): this function is defined a second time further down in
    # this file; the later definition is the one that stays bound.
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[node] = org

    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()
    # out-of-range positions are recognised by comparing string
    # representations, exactly as the original code did
    out_of_range = str(pacb.exceptions.CoordinateOutOfRange)

    for (n1, n2) in cbg.pairwisecrosscombinations_node():
        o1, o2 = cbgnode2orgnode[n1], cbgnode2orgnode[n2]

        if not cbg.has_edge(n1, n2):
            # By definition a CBG MUST HAVE ALL EDGES at this stage. There is
            # no pacbp to take statistics from, so do not fall through to the
            # statistics code (that used an undefined or stale pacbp).
            print("WARNING: edge missing in CodingBlockGraph2GeneTreeGraph")
            print(cbg)
            print("%s %s missing: %s" % (
                cbg.node_count(), cbg.edge_count(), (n1, n2)))
            gtg.add_edge(o1, o2, wt=0.0)
            # NOTE(review): 0.0 mirrors the zero edge weight; confirm that
            # consumers of these dicts expect 0.0 for an absent alignment.
            for key in ((o1, o2), (o2, o1)):
                gtg._aa_identity_percentages[key] = 0.0
                gtg._bitscore_ratios[key] = 0.0
                gtg._nt_identity_percentages[key] = 0.0
            continue

        # get pacbp(orf) object
        thepacbp = cbg.get_pacbps_by_nodes(node1=n1, node2=n2)[0]
        # the else branches below are meant for PacbPDNA / PacbPORF objects,
        # hence the exact class name match instead of isinstance()
        is_plain_pacbp = thepacbp.__class__.__name__ == 'PacbP'

        # get relative coordinates of the OMSR part of the alignment
        omsrQs = thepacbp.alignmentposition_by_query_pos(min(omsr[n1]))
        omsrQe = thepacbp.alignmentposition_by_query_pos(max(omsr[n1]))

        # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can
        # occur in freaky cases. They shouldn't, but do without discovered
        # reason. In the majority of cases it is just a 1/few aa offset,
        # which is corrected here by falling back on the alignment borders.
        if str(omsrQs) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_start)
            else:
                # solve by taking original alignment position start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_start().query_pos)

        if str(omsrQe) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_end (no +1 applied here)
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_end)
            else:
                # solve by taking original alignment position end;
                # add +1 to create a python list range coordinate
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_end().query_pos
                ) + 1
        else:
            # omsrQe was nicely an integer;
            # add +1 because max(OMSR) is not a range coord
            omsrQe += 1

        # Wt used is identityscore == Identity + 0.5* Similarity
        identityscore = pacb.calculate_identityscore(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg.add_edge(o1, o2, wt=identityscore)

        # add additional statistics to gtg object.
        # identitypercentage is TRUE aa identity %
        identityperc = pacb.calculate_identity(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg._aa_identity_percentages[(o1, o2)] = identityperc
        gtg._aa_identity_percentages[(o2, o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
            thepacbp.query[omsrQs:omsrQe],
            thepacbp.sbjct[omsrQs:omsrQe],
            matrix=thepacbp.MATRIX)
        gtg._bitscore_ratios[(o1, o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2, o1)] = bitscoreratio

        # ntidentity is nt identity of the unextended aligned dna sequences
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq, dnaSseq)
        gtg._nt_identity_percentages[(o1, o2)] = ntidentity
        gtg._nt_identity_percentages[(o2, o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)

    # and return this new genetree graph
    return gtg
def cexpander_checkCBG4omsrbordergaps(
        cbg, omit5pside=False, omit3pside=False,
        max_bitscoreratio_threshold=(
            CBG_CEXPANDER_OMSRBORDERGAPS_MAX_BITSCORERATIO_THRESHOLD),
        nonuniform_aa_offset=CBG_CEXPANDER_OMSRBORDERGAPS_NONUNIFORM_AA_OFFSET,
        gap_size=CBG_CEXPANDER_OMSRBORDERGAPS_GAP_SIZE,
        verbose=False):
    """
    Check the area directly around the OMSR of a CBG for non-uniform alignments

    For each PacbPORF of the CBG, a slice of nonuniform_aa_offset positions
    at the 5' and/or 3' border of the OMSR is inspected. When the slice
    holds a gap of at least gap_size in query or sbjct, or its bitscore
    ratio is <= max_bitscoreratio_threshold, the pacbp is split at that
    position and only the part on the OMSR side is kept. The CBG is updated
    in place with the shortened PacbPORFs.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize

    @type  omit5pside: Boolean
    @param omit5pside: Do not process the 5' side (left) of the CBG

    @type  omit3pside: Boolean
    @param omit3pside: Do not process the 3' side (right) of the CBG

    @type  nonuniform_aa_offset: integer
    @param nonuniform_aa_offset: area of the nonuniform stretch to check
                                 for gaps

    @type  gap_size: integer
    @param gap_size: continuous gap length to occur in the
                     nonuniform_aa_offset in order to shorten the CBG

    @type  max_bitscoreratio_threshold: float
    @param max_bitscoreratio_threshold: maximal bitscore ratio of Q vs. S
                                 slice to enforce a CBG shortening

    @type  verbose: Boolean
    @param verbose: print debugging/intermediate information to STDOUT

    @rtype:  Boolean
    @return: status whether or not the CBG was shortened

    @raise ZeroUniformlyAlignedPositions: the cexpander binarystring holds
            no single "1", either a priori or after the 5' correction
    @raise NoOverallMinimalSpanningRange: updating the CBG with the
            shortened PacbPORFs returned False
    """
    # NOTE(review): this function is defined a second time further down in
    # this file; the later definition is the one that stays bound.

    # a priori error. CBGs must have at least a single Uniformly Aligned
    # AA position. Checked BEFORE indexing the binarystring, so an empty
    # string raises this exception and not an IndexError.
    if cbg._cexpander.binarystring.count("1") < 1:
        raise ZeroUniformlyAlignedPositions

    has5Pomsrflaw = cbg._cexpander.binarystring[0] == "0"
    PACBPS_CORRECTED = 0
    gapstring = '-' * gap_size

    if not omit5pside and has5Pomsrflaw:
        # start correction on the 5' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}

        if verbose:
            print("STARTING cexpander_checkCBG4omsrbordergaps 5p side")
            print(cbg)
            print("cexp:: %s %s" % (
                cbg._cexpander.binarystring, cbg._cexpander.header))

        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the min(OMSR) query value;
            # the slice ends at the first "1" of the query's transferblock
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            endQpos = min(omsr[nodeQ]) + cexpQstr.find("1")
            staQpos = endQpos - nonuniform_aa_offset

            # get slice of the pacbporf around the min(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            endSpos = min(omsr[nodeS]) + cexpSstr.find("1")
            staSpos = endSpos - nonuniform_aa_offset

            # correct staQpos if < pacbporf.orfQ.protein_startPY
            staQpos = max([pacbporf.orfQ.protein_startPY, staQpos])
            editedQ = staQpos != endQpos - nonuniform_aa_offset
            # correct staSpos if < pacbporf.orfS.protein_startPY
            # (the original code assigned this to staQpos by mistake, which
            #  made editedS always False and corrupted the query window)
            staSpos = max([pacbporf.orfS.protein_startPY, staSpos])
            editedS = staSpos != endSpos - nonuniform_aa_offset

            if editedQ and editedS:
                # NOTE(review): both differences are <= 0 after clipping, so
                # `>=` takes the sbjct when its window was clipped no more
                # than the query's; the original comment stated the
                # opposite. Comparison kept as is - confirm the intent.
                take_sbjct = (
                    (endSpos - (staSpos + nonuniform_aa_offset)) >=
                    (endQpos - (staQpos + nonuniform_aa_offset)))
            else:
                # only sbjct edited -> sbjct; unedited or only query
                # edited -> query
                take_sbjct = editedS

            if take_sbjct:
                (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                    staSpos, endSpos)
            else:
                (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                    staQpos, endQpos)

            # check minval of coords; CBGs at the far 5' end of the
            # input DNA sequence can get negative coords for their
            # non-existing Orf frontal STOPcodon (up to -3)
            if min(coords) < 0:
                continue

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected
            # -> a pacbp split will follow
            if gapstring in q:
                # split directly behind the (complete) gap in the query
                pos = q.find(gapstring)
                while pos + gap_size < len(q) and q[pos + gap_size] == "-":
                    pos += 1
                splitpos, split_on_query = qS + pos + gap_size, True
            elif gapstring in s:
                # split directly behind the (complete) gap in the sbjct
                pos = s.find(gapstring)
                while pos + gap_size < len(s) and s[pos + gap_size] == "-":
                    pos += 1
                splitpos, split_on_query = sS + pos + gap_size, False
            elif bitscoreratio <= max_bitscoreratio_threshold:
                # correct for matches on the right of the match string
                splitpos = qE - (len(m) - m.rfind(" ") - 1)
                split_on_query = True
            else:
                if verbose:
                    print("%s %s '%s' '%s' '%s' %s settings: %s "
                          "bitsratio: %1.3f" % (
                              nodeQ, nodeS, q, m, s, coords,
                              (nonuniform_aa_offset, gap_size),
                              bitscoreratio))
                # not passing the cut-off for splitting this pacbp
                continue

            # convert (back) to pacbp and make splitpos relative to the
            # start of the pacbp on the sequence it was obtained from
            pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
            if split_on_query:
                splitpos = splitpos - pacbp.query_start
            else:
                splitpos = splitpos - pacbp.sbjct_start

            if verbose:
                print("5p, %s %s %s bitsratio: %1.3f (thr:%1.3f)" % (
                    nodeQ, nodeS, (q, m, s, coords),
                    bitscoreratio, max_bitscoreratio_threshold))
                print("%s relative splitpos: %s" % (pacbp, splitpos))
                pacbp.print_protein(_linesize=120)

            # now split the pacbp on this position and recreate the
            # pacbporf; 'rigth' (sic) is the literal value used by the
            # original code and is kept unchanged
            pacbpR = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='rigth')
            if pacbpR:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpR, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf

                if verbose:
                    print(pacbpR)
                    pacbpR.print_protein(_linesize=120)
                    print(newpacbporf)

                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 5' PacbP corrections. Any status other
        # than False (True: still an OMSR, None: nothing done) is fine;
        # the explicit comparison keeps None from raising
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == False:
            print("WARNING: NoOverallMinimalSpanningRange %s" % (cbg,))
            raise NoOverallMinimalSpanningRange(str(cbg))

    # check (again!) if there is any consistency;
    # due to 5' optimization, the complete CBG alignment can have collapsed!
    if cbg._cexpander.binarystring.count("1") < 1:
        raise ZeroUniformlyAlignedPositions

    has3Pomsrflaw = cbg._cexpander.binarystring[-1] == "0"

    if not omit3pside and has3Pomsrflaw:
        # start correction on the 3' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}

        if verbose:
            print("STARTING cexpander_checkCBG4omsrbordergaps 3p side")
            print("%s \ncexp:: %s %s" % (
                cbg, cbg._cexpander.binarystring, cbg._cexpander.header))

        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            staQpos = max(omsr[nodeQ]) - (
                len(cexpQstr) - cexpQstr.rfind("1"))
            endQpos = staQpos + nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            staSpos = max(omsr[nodeS]) - (
                len(cexpSstr) - cexpSstr.rfind("1"))
            endSpos = staSpos + nonuniform_aa_offset

            # correct endQpos if > pacbporf.orfQ.protein_endPY
            endQpos = min([pacbporf.orfQ.protein_endPY, endQpos])
            editedQ = endQpos != staQpos + nonuniform_aa_offset
            # correct endSpos if > pacbporf.orfS.protein_endPY
            endSpos = min([pacbporf.orfS.protein_endPY, endSpos])
            editedS = endSpos != staSpos + nonuniform_aa_offset

            if editedQ and editedS:
                # NOTE(review): both differences are <= 0 after clipping, so
                # `>=` takes the sbjct when its window was clipped no more
                # than the query's; the original comment stated the
                # opposite. Comparison kept as is - confirm the intent.
                take_sbjct = (
                    (endSpos - (staSpos + nonuniform_aa_offset)) >=
                    (endQpos - (staQpos + nonuniform_aa_offset)))
            else:
                # only sbjct edited -> sbjct; unedited or only query
                # edited -> query
                take_sbjct = editedS

            if take_sbjct:
                (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                    staSpos, endSpos)
            else:
                (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                    staQpos, endQpos)

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected
            # -> a pacbp split will follow
            if gapstring in q:
                # split directly in front of the gap in the query
                splitpos, split_on_query = qS + q.find(gapstring), True
            elif gapstring in s:
                # split directly in front of the gap in the sbjct
                splitpos, split_on_query = sS + s.find(gapstring), False
            elif bitscoreratio <= max_bitscoreratio_threshold:
                # correct for matches on the left of the match string
                splitpos, split_on_query = qS + m.find(" "), True
            else:
                if verbose:
                    print("%s %s '%s' '%s' '%s' %s settings: %s "
                          "bitsratio: %1.3f" % (
                              nodeQ, nodeS, q, m, s, coords,
                              (nonuniform_aa_offset, gap_size),
                              bitscoreratio))
                # not passing the cut-off for splitting this pacbp
                continue

            # convert (back) to pacbp and make splitpos relative to the
            # start of the pacbp on the sequence it was obtained from
            pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
            if split_on_query:
                splitpos = splitpos - pacbp.query_start
            else:
                splitpos = splitpos - pacbp.sbjct_start

            if verbose:
                print("3p, %s %s %s bitsratio: %1.3f (thr:%1.3f)" % (
                    nodeQ, nodeS, (q, m, s, coords),
                    bitscoreratio, max_bitscoreratio_threshold))
                print("%s relative splitpos: %s" % (pacbp, splitpos))
                pacbp.print_protein(_linesize=120)

            # now split the pacbp on this position and recreate the pacbporf
            pacbpL = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='left')
            if pacbpL:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpL, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf

                if verbose:
                    print(pacbpL)
                    pacbpL.print_protein(_linesize=120)
                    print(newpacbporf)

                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 3' PacbP corrections. Any status other
        # than False (True: still an OMSR, None: nothing done) is fine;
        # the explicit comparison keeps None from raising
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == False:
            raise NoOverallMinimalSpanningRange(str(cbg))

    if verbose and PACBPS_CORRECTED:
        print("REPLACEMENTS DONE: %s omit5pside: %s omit3pside %s" % (
            PACBPS_CORRECTED, omit5pside, omit3pside))
        print(cbg)
        cbg.printmultiplealignment()

    # return if there is something improved
    return bool(PACBPS_CORRECTED)
def cexpander_checkCBG4omsrbordergaps(
        cbg, omit5pside=False, omit3pside=False,
        max_bitscoreratio_threshold=(
            CBG_CEXPANDER_OMSRBORDERGAPS_MAX_BITSCORERATIO_THRESHOLD),
        nonuniform_aa_offset=CBG_CEXPANDER_OMSRBORDERGAPS_NONUNIFORM_AA_OFFSET,
        gap_size=CBG_CEXPANDER_OMSRBORDERGAPS_GAP_SIZE,
        verbose=False):
    """
    Check the area directly around the OMSR of a CBG for non-uniform alignments

    For each PacbPORF of the CBG, a slice of nonuniform_aa_offset positions
    at the 5' and/or 3' border of the OMSR is inspected. When the slice
    holds a gap of at least gap_size in query or sbjct, or its bitscore
    ratio is <= max_bitscoreratio_threshold, the pacbp is split at that
    position and only the part on the OMSR side is kept. The CBG is updated
    in place with the shortened PacbPORFs.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize

    @type  omit5pside: Boolean
    @param omit5pside: Do not process the 5' side (left) of the CBG

    @type  omit3pside: Boolean
    @param omit3pside: Do not process the 3' side (right) of the CBG

    @type  nonuniform_aa_offset: integer
    @param nonuniform_aa_offset: area of the nonuniform stretch to check
                                 for gaps

    @type  gap_size: integer
    @param gap_size: continuous gap length to occur in the
                     nonuniform_aa_offset in order to shorten the CBG

    @type  max_bitscoreratio_threshold: float
    @param max_bitscoreratio_threshold: maximal bitscore ratio of Q vs. S
                                 slice to enforce a CBG shortening

    @type  verbose: Boolean
    @param verbose: print debugging/intermediate information to STDOUT

    @rtype:  Boolean
    @return: status whether or not the CBG was shortened

    @raise ZeroUniformlyAlignedPositions: the cexpander binarystring holds
            no single "1", either a priori or after the 5' correction
    @raise NoOverallMinimalSpanningRange: updating the CBG with the
            shortened PacbPORFs returned False
    """
    # a priori error. CBGs must have at least a single Uniformly Aligned
    # AA position. Checked BEFORE indexing the binarystring, so an empty
    # string raises this exception and not an IndexError.
    if cbg._cexpander.binarystring.count("1") < 1:
        raise ZeroUniformlyAlignedPositions

    has5Pomsrflaw = cbg._cexpander.binarystring[0] == "0"
    PACBPS_CORRECTED = 0
    gapstring = '-' * gap_size

    if not omit5pside and has5Pomsrflaw:
        # start correction on the 5' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}

        if verbose:
            print("STARTING cexpander_checkCBG4omsrbordergaps 5p side")
            print(cbg)
            print("cexp:: %s %s" % (
                cbg._cexpander.binarystring, cbg._cexpander.header))

        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the min(OMSR) query value;
            # the slice ends at the first "1" of the query's transferblock
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            endQpos = min(omsr[nodeQ]) + cexpQstr.find("1")
            staQpos = endQpos - nonuniform_aa_offset

            # get slice of the pacbporf around the min(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            endSpos = min(omsr[nodeS]) + cexpSstr.find("1")
            staSpos = endSpos - nonuniform_aa_offset

            # correct staQpos if < pacbporf.orfQ.protein_startPY
            staQpos = max([pacbporf.orfQ.protein_startPY, staQpos])
            editedQ = staQpos != endQpos - nonuniform_aa_offset
            # correct staSpos if < pacbporf.orfS.protein_startPY
            # (the original code assigned this to staQpos by mistake, which
            #  made editedS always False and corrupted the query window)
            staSpos = max([pacbporf.orfS.protein_startPY, staSpos])
            editedS = staSpos != endSpos - nonuniform_aa_offset

            if editedQ and editedS:
                # NOTE(review): both differences are <= 0 after clipping, so
                # `>=` takes the sbjct when its window was clipped no more
                # than the query's; the original comment stated the
                # opposite. Comparison kept as is - confirm the intent.
                take_sbjct = (
                    (endSpos - (staSpos + nonuniform_aa_offset)) >=
                    (endQpos - (staQpos + nonuniform_aa_offset)))
            else:
                # only sbjct edited -> sbjct; unedited or only query
                # edited -> query
                take_sbjct = editedS

            if take_sbjct:
                (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                    staSpos, endSpos)
            else:
                (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                    staQpos, endQpos)

            # check minval of coords; CBGs at the far 5' end of the
            # input DNA sequence can get negative coords for their
            # non-existing Orf frontal STOPcodon (up to -3)
            if min(coords) < 0:
                continue

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected
            # -> a pacbp split will follow
            if gapstring in q:
                # split directly behind the (complete) gap in the query
                pos = q.find(gapstring)
                while pos + gap_size < len(q) and q[pos + gap_size] == "-":
                    pos += 1
                splitpos, split_on_query = qS + pos + gap_size, True
            elif gapstring in s:
                # split directly behind the (complete) gap in the sbjct
                pos = s.find(gapstring)
                while pos + gap_size < len(s) and s[pos + gap_size] == "-":
                    pos += 1
                splitpos, split_on_query = sS + pos + gap_size, False
            elif bitscoreratio <= max_bitscoreratio_threshold:
                # correct for matches on the right of the match string
                splitpos = qE - (len(m) - m.rfind(" ") - 1)
                split_on_query = True
            else:
                if verbose:
                    print("%s %s '%s' '%s' '%s' %s settings: %s "
                          "bitsratio: %1.3f" % (
                              nodeQ, nodeS, q, m, s, coords,
                              (nonuniform_aa_offset, gap_size),
                              bitscoreratio))
                # not passing the cut-off for splitting this pacbp
                continue

            # convert (back) to pacbp and make splitpos relative to the
            # start of the pacbp on the sequence it was obtained from
            pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
            if split_on_query:
                splitpos = splitpos - pacbp.query_start
            else:
                splitpos = splitpos - pacbp.sbjct_start

            if verbose:
                print("5p, %s %s %s bitsratio: %1.3f (thr:%1.3f)" % (
                    nodeQ, nodeS, (q, m, s, coords),
                    bitscoreratio, max_bitscoreratio_threshold))
                print("%s relative splitpos: %s" % (pacbp, splitpos))
                pacbp.print_protein(_linesize=120)

            # now split the pacbp on this position and recreate the
            # pacbporf; 'rigth' (sic) is the literal value used by the
            # original code and is kept unchanged
            pacbpR = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='rigth')
            if pacbpR:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpR, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf

                if verbose:
                    print(pacbpR)
                    pacbpR.print_protein(_linesize=120)
                    print(newpacbporf)

                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 5' PacbP corrections. Any status other
        # than False (True: still an OMSR, None: nothing done) is fine;
        # the explicit comparison keeps None from raising
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == False:
            print("WARNING: NoOverallMinimalSpanningRange %s" % (cbg,))
            raise NoOverallMinimalSpanningRange(str(cbg))

    # check (again!) if there is any consistency;
    # due to 5' optimization, the complete CBG alignment can have collapsed!
    if cbg._cexpander.binarystring.count("1") < 1:
        raise ZeroUniformlyAlignedPositions

    has3Pomsrflaw = cbg._cexpander.binarystring[-1] == "0"

    if not omit3pside and has3Pomsrflaw:
        # start correction on the 3' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}

        if verbose:
            print("STARTING cexpander_checkCBG4omsrbordergaps 3p side")
            print("%s \ncexp:: %s %s" % (
                cbg, cbg._cexpander.binarystring, cbg._cexpander.header))

        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            staQpos = max(omsr[nodeQ]) - (
                len(cexpQstr) - cexpQstr.rfind("1"))
            endQpos = staQpos + nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            staSpos = max(omsr[nodeS]) - (
                len(cexpSstr) - cexpSstr.rfind("1"))
            endSpos = staSpos + nonuniform_aa_offset

            # correct endQpos if > pacbporf.orfQ.protein_endPY
            endQpos = min([pacbporf.orfQ.protein_endPY, endQpos])
            editedQ = endQpos != staQpos + nonuniform_aa_offset
            # correct endSpos if > pacbporf.orfS.protein_endPY
            endSpos = min([pacbporf.orfS.protein_endPY, endSpos])
            editedS = endSpos != staSpos + nonuniform_aa_offset

            if editedQ and editedS:
                # NOTE(review): both differences are <= 0 after clipping, so
                # `>=` takes the sbjct when its window was clipped no more
                # than the query's; the original comment stated the
                # opposite. Comparison kept as is - confirm the intent.
                take_sbjct = (
                    (endSpos - (staSpos + nonuniform_aa_offset)) >=
                    (endQpos - (staQpos + nonuniform_aa_offset)))
            else:
                # only sbjct edited -> sbjct; unedited or only query
                # edited -> query
                take_sbjct = editedS

            if take_sbjct:
                (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                    staSpos, endSpos)
            else:
                (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                    staQpos, endQpos)

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected
            # -> a pacbp split will follow
            if gapstring in q:
                # split directly in front of the gap in the query
                splitpos, split_on_query = qS + q.find(gapstring), True
            elif gapstring in s:
                # split directly in front of the gap in the sbjct
                splitpos, split_on_query = sS + s.find(gapstring), False
            elif bitscoreratio <= max_bitscoreratio_threshold:
                # correct for matches on the left of the match string
                splitpos, split_on_query = qS + m.find(" "), True
            else:
                if verbose:
                    print("%s %s '%s' '%s' '%s' %s settings: %s "
                          "bitsratio: %1.3f" % (
                              nodeQ, nodeS, q, m, s, coords,
                              (nonuniform_aa_offset, gap_size),
                              bitscoreratio))
                # not passing the cut-off for splitting this pacbp
                continue

            # convert (back) to pacbp and make splitpos relative to the
            # start of the pacbp on the sequence it was obtained from
            pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
            if split_on_query:
                splitpos = splitpos - pacbp.query_start
            else:
                splitpos = splitpos - pacbp.sbjct_start

            if verbose:
                print("3p, %s %s %s bitsratio: %1.3f (thr:%1.3f)" % (
                    nodeQ, nodeS, (q, m, s, coords),
                    bitscoreratio, max_bitscoreratio_threshold))
                print("%s relative splitpos: %s" % (pacbp, splitpos))
                pacbp.print_protein(_linesize=120)

            # now split the pacbp on this position and recreate the pacbporf
            pacbpL = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='left')
            if pacbpL:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpL, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf

                if verbose:
                    print(pacbpL)
                    pacbpL.print_protein(_linesize=120)
                    print(newpacbporf)

                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 3' PacbP corrections. Any status other
        # than False (True: still an OMSR, None: nothing done) is fine;
        # the explicit comparison keeps None from raising
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == False:
            raise NoOverallMinimalSpanningRange(str(cbg))

    if verbose and PACBPS_CORRECTED:
        print("REPLACEMENTS DONE: %s omit5pside: %s omit3pside %s" % (
            PACBPS_CORRECTED, omit5pside, omit3pside))
        print(cbg)
        cbg.printmultiplealignment()

    # return if there is something improved
    return bool(PACBPS_CORRECTED)
def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTreeGraph

    Each CBG node is added to the GeneTreeGraph under its organism
    identifier. For each node pair, the first PacbP(ORF) of that edge is
    sliced on the Overall Minimal Spanning Range (OMSR) of the CBG and the
    identityscore of that slice is used as edge weight. The aa identity,
    bitscore ratio and nt identity of the pair are stored on the
    GeneTreeGraph in both (o1,o2) and (o2,o1) orientation.

    @attention: function just converts, error check is not performed here!
    @attention: a node pair that has no edge in the CBG gets a zero-weight
                edge and 0.0 for all its statistics

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[node] = org

    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()
    # out-of-range positions are recognised by comparing string
    # representations, exactly as the original code did
    out_of_range = str(pacb.exceptions.CoordinateOutOfRange)

    for (n1, n2) in cbg.pairwisecrosscombinations_node():
        o1, o2 = cbgnode2orgnode[n1], cbgnode2orgnode[n2]

        if not cbg.has_edge(n1, n2):
            # By definition a CBG MUST HAVE ALL EDGES at this stage. There is
            # no pacbp to take statistics from, so do not fall through to the
            # statistics code (that used an undefined or stale pacbp).
            print("WARNING: edge missing in CodingBlockGraph2GeneTreeGraph")
            print(cbg)
            print("%s %s missing: %s" % (
                cbg.node_count(), cbg.edge_count(), (n1, n2)))
            gtg.add_edge(o1, o2, wt=0.0)
            # NOTE(review): 0.0 mirrors the zero edge weight; confirm that
            # consumers of these dicts expect 0.0 for an absent alignment.
            for key in ((o1, o2), (o2, o1)):
                gtg._aa_identity_percentages[key] = 0.0
                gtg._bitscore_ratios[key] = 0.0
                gtg._nt_identity_percentages[key] = 0.0
            continue

        # get pacbp(orf) object
        thepacbp = cbg.get_pacbps_by_nodes(node1=n1, node2=n2)[0]
        # the else branches below are meant for PacbPDNA / PacbPORF objects,
        # hence the exact class name match instead of isinstance()
        is_plain_pacbp = thepacbp.__class__.__name__ == 'PacbP'

        # get relative coordinates of the OMSR part of the alignment
        omsrQs = thepacbp.alignmentposition_by_query_pos(min(omsr[n1]))
        omsrQe = thepacbp.alignmentposition_by_query_pos(max(omsr[n1]))

        # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can
        # occur in freaky cases. They shouldn't, but do without discovered
        # reason. In the majority of cases it is just a 1/few aa offset,
        # which is corrected here by falling back on the alignment borders.
        if str(omsrQs) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_start)
            else:
                # solve by taking original alignment position start
                omsrQs = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_start().query_pos)

        if str(omsrQe) == out_of_range:
            if is_plain_pacbp:
                # solve by taking thepacbp.query_end (no +1 applied here)
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp.query_end)
            else:
                # solve by taking original alignment position end;
                # add +1 to create a python list range coordinate
                omsrQe = thepacbp.alignmentposition_by_query_pos(
                    thepacbp._get_original_alignment_pos_end().query_pos
                ) + 1
        else:
            # omsrQe was nicely an integer;
            # add +1 because max(OMSR) is not a range coord
            omsrQe += 1

        # Wt used is identityscore == Identity + 0.5* Similarity
        identityscore = pacb.calculate_identityscore(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg.add_edge(o1, o2, wt=identityscore)

        # add additional statistics to gtg object.
        # identitypercentage is TRUE aa identity %
        identityperc = pacb.calculate_identity(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg._aa_identity_percentages[(o1, o2)] = identityperc
        gtg._aa_identity_percentages[(o2, o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
            thepacbp.query[omsrQs:omsrQe],
            thepacbp.sbjct[omsrQs:omsrQe],
            matrix=thepacbp.MATRIX)
        gtg._bitscore_ratios[(o1, o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2, o1)] = bitscoreratio

        # ntidentity is nt identity of the unextended aligned dna sequences
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq, dnaSseq)
        gtg._nt_identity_percentages[(o1, o2)] = ntidentity
        gtg._nt_identity_percentages[(o2, o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)

    # and return this new genetree graph
    return gtg