def multiplealignment(self):
    """
    Return a ClustalW multiple alignment as multi-line fasta text

    The sequences and coordinates returned by
    get_maxsr_proteinsequences_and_coords() are re-keyed from Node to
    Organism identifier, aligned with clustalw() and trimmed with
    strip_alignment_for_exterior_gaps().

    @rtype:  string
    @return: one ">ORGANISM_orf_X" header line plus aligned sequence line
             per organism, joined by newlines; X is element [1] of the
             node returned by node_by_organism(ORGANISM)
    """
    node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

    # rewrite Node keys to Organism keys; coords become [min, max+1]
    coords = {}
    for node, positions in node_coords.items():
        coords[self.organism_by_node(node)] = [min(positions),
                                               max(positions) + 1]
    seqs = {}
    for node, sequence in node_seqs.items():
        seqs[self.organism_by_node(node)] = sequence

    # align sequences with ClustalW
    alignedseqs, alignment = clustalw(seqs=seqs)

    # trim alignment for leading & trailing gaps; coords are only
    # carried along here, they are not part of the returned text
    alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
        alignedseqs, alignment, coords)

    # build a single string of multilined fasta
    records = []
    for organism, alignedseq in alignedseqs.items():
        orfid = self.node_by_organism(organism)[1]
        records.append(">%s_orf_%s\n%s" % (organism, orfid, alignedseq))
    return "\n".join(records)
def multiplealignment(self):
    """
    Return a ClustalW multiple alignment as multi-line fasta text

    The sequences and coordinates returned by
    get_maxsr_proteinsequences_and_coords() are re-keyed from Node to
    Organism identifier, aligned with clustalw() and trimmed with
    strip_alignment_for_exterior_gaps().

    @rtype:  string
    @return: one ">ORGANISM_orf_X" header line plus aligned sequence line
             per organism, joined by newlines; X is element [1] of the
             node returned by node_by_organism(ORGANISM)
    """
    node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

    # rewrite Node keys to Organism keys; coords become [min, max+1]
    coords = {}
    for node, positions in node_coords.items():
        coords[self.organism_by_node(node)] = [min(positions),
                                               max(positions) + 1]
    seqs = {}
    for node, sequence in node_seqs.items():
        seqs[self.organism_by_node(node)] = sequence

    # align sequences with ClustalW
    alignedseqs, alignment = clustalw(seqs=seqs)

    # trim alignment for leading & trailing gaps; coords are only
    # carried along here, they are not part of the returned text
    alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
        alignedseqs, alignment, coords)

    # build a single string of multilined fasta
    records = []
    for organism, alignedseq in alignedseqs.items():
        orfid = self.node_by_organism(organism)[1]
        records.append(">%s_orf_%s\n%s" % (organism, orfid, alignedseq))
    return "\n".join(records)
def get_unguided_nt_identity(self):
    """
    Get identity% of UNGUIDED DNA alignment

    The aligned DNA sequences of this object are stripped of their gap
    symbols, re-aligned with clustalw() and the fraction of alignment
    columns that hold the same character in both sequences is returned.

    @rtype:  float
    @return: identical columns / length of the ClustalW alignment;
             0.0 when self.length == 0 or when ClustalW returns an
             empty alignment
    """
    # if zerosized -> return 0.0
    if self.length == 0:
        return 0.0

    # get DNA sequences and drop the gaps of the existing alignment
    dnaQ, dnaS = self.get_aligned_dna_sequences()
    dnaQ, dnaS = dnaQ.replace("-", ""), dnaS.replace("-", "")

    # make (semi) unique headers
    uniqueid = get_random_string_tag()
    (qs, qe, ss, se) = self.barcode()[0:4]
    headerQ = "query%s%s%s" % (qs, qe, uniqueid)
    headerS = "sbjct%s%s%s" % (ss, se, uniqueid)

    # prepare & run clustalw; the match string itself is not needed
    out, _matchstring = clustalw(seqs={headerQ: dnaQ, headerS: dnaS})
    alignedQ, alignedS = out[headerQ], out[headerS]

    # an empty alignment has no identity at all; returning 0.0 here
    # prevents a ZeroDivisionError in the ratio below
    if not alignedQ:
        return 0.0

    # get id% on aligned dna sequences
    cnt = 0
    for ntQ, ntS in zip(alignedQ, alignedS):
        if ntQ == ntS:
            cnt += 1

    # return relative ratio
    return float(cnt) / len(alignedQ)
def get_unguided_nt_identity(self):
    """
    Get identity% of UNGUIDED DNA alignment

    The aligned DNA sequences of this object are stripped of their gap
    symbols, re-aligned with clustalw() and the fraction of alignment
    columns that hold the same character in both sequences is returned.

    @rtype:  float
    @return: identical columns / length of the ClustalW alignment;
             0.0 when self.length == 0 or when ClustalW returns an
             empty alignment
    """
    # if zerosized -> return 0.0
    if self.length == 0:
        return 0.0

    # get DNA sequences and drop the gaps of the existing alignment
    dnaQ, dnaS = self.get_aligned_dna_sequences()
    dnaQ, dnaS = dnaQ.replace("-", ""), dnaS.replace("-", "")

    # make (semi) unique headers
    uniqueid = get_random_string_tag()
    (qs, qe, ss, se) = self.barcode()[0:4]
    headerQ = "query%s%s%s" % (qs, qe, uniqueid)
    headerS = "sbjct%s%s%s" % (ss, se, uniqueid)

    # prepare & run clustalw; the match string itself is not needed
    out, _matchstring = clustalw(seqs={headerQ: dnaQ, headerS: dnaS})
    alignedQ, alignedS = out[headerQ], out[headerS]

    # an empty alignment has no identity at all; returning 0.0 here
    # prevents a ZeroDivisionError in the ratio below
    if not alignedQ:
        return 0.0

    # get id% on aligned dna sequences
    cnt = 0
    for ntQ, ntS in zip(alignedQ, alignedS):
        if ntQ == ntS:
            cnt += 1

    # return relative ratio
    return float(cnt) / len(alignedQ)
def make_pacbps_for_edges(gra,aa_extra_offset=1,verbose=False): """ """ coordsandseqs = {} # create dummy omsr attribute! # omsr is filled with Exon.acceptor and Exon.donor positions # recalculate nt positions to aa positions! for node in gra.get_ordered_nodes(): accep = gra._node_object[node].acceptor donor = gra._node_object[node].donor aaStart = accep.pos / 3 if donor.pos - accep.pos % 3 == 0: aaEnd = donor.pos / 3 else: aaEnd = (donor.pos / 3) +1 # get orf, seequence coordinates and sequence itself theorg = gra._organism_from_node(node) theorf = gra.get_orfs_of_graph(node=node)[0] aaStart -= aa_extra_offset aaEnd += aa_extra_offset # correct end coordinates when falling outside of Orf if aaEnd > theorf.protein_endPY: aaEnd = theorf.protein_endPY if aaStart < theorf.protein_startPY: aaStart = theorf.protein_startPY theseq = theorf.getaas(abs_pos_start=aaStart,abs_pos_end=aaEnd) # store to dict coordsandseqs[node] = (theseq,theorg,theorf,aaStart,aaEnd) for (node1,node2) in gra.pairwisecrosscombinations_node(): # check if these are nodes present as an edge if not gra.has_edge(node1,node2): continue # start makeing a Pacbp from clustalw (seq1,org1,orf1,aa1start,aa1end) = coordsandseqs[node1] (seq2,org2,orf2,aa2start,aa2end) = coordsandseqs[node2] # create headers and fetch sequences from Orf objects header1 = "%s_orf_%s_%s_%s" % (org1,orf1.id,aa1start,aa1end) header2 = "%s_orf_%s_%s_%s" % (org2,orf2.id,aa2start,aa2end) # check if sequences exist/ at least 1 AA if not seq1 and not seq2: print "Warning: ZeroProteinSequenceLengthException", "S1", aa1start, aa1end, node1, node2, orf1 print "Warning: ZeroProteinSequenceLengthException", "S2", aa2start, aa2end, node1, node2, orf2 continue elif not seq2: print "Warning: ZeroProteinSequenceLengthException", "S2", aa2start, aa2end, node1, node2, orf2 continue elif not seq1: print "Warning: ZeroProteinSequenceLengthException", "S1", aa1start, aa1end, node1, node2, orf1 continue else: pass # align the sequences with clustalw 
seqs = { header1: seq1, header2: seq2 } (alignedseqs,alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacb.conversion.pacbp_from_clustalw( alignment=( alignedseqs[header1], alignment, alignedseqs[header2] ), coords=(aa1start,aa1end,aa2start,aa2end) ) if pacbp: # make & extend PacbPORF pacbporf = pacb.PacbPORF(pacbp,orf1,orf2) pacbporf.extend_pacbporf_after_stops() # update edge weight #new_wt = pacbporf.bitscore # wt was sum(PSSM) * distance ratio # now multiply with identityscore (0.0-1.0) float too new_wt = pacbporf.identityscore * gra.get_edge_weight(node1,node2) gra.set_edge_weight(node1,node2,wt=new_wt) # add pacbporf to CBG key = pacbporf.construct_unique_key(node1,node2) gra.pacbps[(key,node1,node2)] = pacbporf else: # pacbp.conversion.pacbp_from_clustalw did # not yield any proper alignment if verbose: print "NO PACBP!!", node1,node2, seq1,seq2 pass
def update_PCG_with_signalpexons(signalpexonseqs, PCG, OPTIONS, min_pacbporf_identityscore=0.20, verbose=True): """ """ if not signalpexonseqs.has_key(OPTIONS.target): return False is_any_pacbporf_added = False for targetSPexon in signalpexonseqs[OPTIONS.target]: target = OPTIONS.target for informant, infSPlist in signalpexonseqs.iteritems(): if informant == OPTIONS.target: continue # check if informant has been deleted in the meanwhile if informant not in PCG.organism_set(): continue # list to store signalp exons into signalpexon_pacbp_list = [] # get ordered pacbporfs fromt he PCG thepacbporfs = order_pacbporf_list( PCG.get_pacbps_by_organisms(OPTIONS.target, informant)) if not thepacbporfs: # no alignments present for this organism (can happen!) continue for informantSPexon in infSPlist: coords = [ targetSPexon.protein_start(), targetSPexon.protein_end(), informantSPexon.protein_start(), informantSPexon.protein_end(), ] # prior to making ClustalW-PacbP, check PacbPCOORD placeability # into the list of pacbporfs pacbpCoordsObj = PacbPCOORDS(input=( targetSPexon.proteinsequence(), informantSPexon.proteinsequence(), targetSPexon.protein_start(), informantSPexon.protein_start(), )) if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]: # *NOT* placable in current ordered list of PacbPORFS continue dist = pacbpCoordsObj.distance_towards(thepacbporfs[0]) if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3: # WAY TO FAR in front of current gene structure parts. # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS continue elif dist == 0: # NOT placeable in front of the rest of the PacbPORFS. 
continue else: pass # perform ClustalW alignment on the SP exons (alignedseqs,alignment) =\ clustalw( seqs= { OPTIONS.target: targetSPexon.proteinsequence(), informant: informantSPexon.proteinsequence() } ) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=(alignedseqs[OPTIONS.target], alignment, alignedseqs[informant]), coords=coords) # is there any alignment constructed? if not pacbp: continue # ignore (very) poor identyscore alignments if pacbp.identityscore < min_pacbporf_identityscore: continue # if here make extended pacbpORF signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf, informantSPexon.orf) signalpexonPacbpORF.extend_pacbporf_after_stops() # and store in signalpexon_pacbp_list signalpexon_pacbp_list.append(signalpexonPacbpORF) ################################################################ if verbose: print alignedseqs[OPTIONS.target], OPTIONS.target print alignment print alignedseqs[informant], informant if pacbp: print pacbp, (OPTIONS.target, targetSPexon.orf.id), print(informant, informantSPexon.orf.id), print "DISTANCE::", dist pacbp.print_protein() print "" ################################################################ # If there are signalpexon-guided pacbporfs found, store the one # with the highest bitscore if signalpexon_pacbp_list: signalpexon_pacbp_list = order_list_by_attribute( signalpexon_pacbp_list, order_by='bits', reversed=True) # store best bitscoring pacbporf to PCG signalp_pacbporf = signalpexon_pacbp_list[0] pacbporf2PCG(signalp_pacbporf, OPTIONS.target, informant, PCG, source='SignalP-ClustalW') is_any_pacbporf_added = True #################################################################### if verbose: print "SignalP Exon added to PCG:", signalp_pacbporf, informant #################################################################### else: pass # return pointer is_any_pacbporf_added return is_any_pacbporf_added
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None, strip_nonaligned_residues=False, verbose=False, **kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1)) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, 
verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1)) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del (coords[node]) else: raise "WHAT ELSE!?" ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(prevcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])]) end = max(coords[nodeCbg]) + 1 coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # decrease coord range by nextcbg if applicable if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(nextcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodeNext = 
nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1]) coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in [ "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "RIGTHORFEND" ]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in [ "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF" ]: maxlength = max([len(vlist) for vlist in coords.values()]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k, seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del (coords[k]) del (fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 
1: return None, {} # rewrite coords to (min,max) tuple coords = dict([(key, [min(vlist), max(vlist) + 1]) for key, vlist in coords.iteritems()]) # perform clustalw multiple alignment (alignedseqs, alignment) = clustalw(seqs=fastaseqs) # strip exterior gaps in case of OMSR/MINSR area if area in ["OMSR", "MINSR"]: alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs, alignment, coords = strip_poorly_supported_tails( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs, alignment, coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None for node, algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs, fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def hmmhit2pacbp(queryorf, queryorg, querycoords, sbjctorf, sbjctorg, hmmhit,
                 verbose=False):
    """
    Convert a single HMM search hit into a PacbPORF

    @type  queryorf: Orf object
    @param queryorf: Orf of the query; its getaas(), protein_sequence,
                     protein_startPY, protein_endPY and id are used

    @type  queryorg: *
    @param queryorg: organism identifier of the query (first Node element)

    @type  querycoords: list or tuple
    @param querycoords: only element [0] is read; it is used as offset of
                        the HMM query start on the queryorf

    @type  sbjctorf: Orf object
    @param sbjctorf: Orf of the sbjct; its protein_startPY and id are used

    @type  sbjctorg: *
    @param sbjctorg: organism identifier of the sbjct (first Node element)

    @type  hmmhit: tuple
    @param hmmhit: ( sbjct_header, sbjct_start, sbjct_end, query_start,
                     query_end, query, match, sbjct, score, expect )

    @type  verbose: Boolean
    @param verbose: print debugging/intermediate information to STDOUT

    @rtype:  tuple
    @return: ( (pacbpkey, queryNode, sbjctNode), pacbporf ) on success,
             ( False, None ) when no PacbPORF could be created
    """
    # trim hmmhit for unmatched characters
    (sbjct_header, sbjct_start, sbjct_end, query_start, query_end, query,
     match, sbjct, score, expect) = hmmhit

    # remove leading positions without a match character; all three
    # strings are shortened in lockstep and both starts shift by one
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start += 1
        query_start += 1

    # idem for trailing positions without a match character
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end -= 1
        query_end -= 1

    # get orf, node and AA and DNA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY
    sbjctNode = (sbjctorg, sbjctorf.id)
    # '.' symbols are rewritten to the '-' gap symbol used from here on
    query = query.replace(".", "-").upper()
    sbjct = sbjct.replace(".", "-").upper()

    ############################################################################
    if verbose:
        print "hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
            sbjctorg, sbjctorf.id)
        print "hmmhit2pacbp Q '%s'" % query
        print "hmmhit2pacbp m '%s'" % match
        print "hmmhit2pacbp S '%s'" % sbjct
        print "hmmQ:", query, query_start, query_end, "gaps:",
        print query.count('-'), len(query)
        print "hmmM:", match
        print "hmmS:", sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
        print "len:", sbjct_aa_end - sbjct_aa_start, len(sbjct)
    ############################################################################

    # get Node and sequence of the query
    queryNode = (queryorg, queryorf.id)
    queryseq = deepcopy(query)

    # calculate query sequence position on queryorf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-')

    ############################################################################
    if verbose:
        print "hmmq:", queryseq, queryNode, query_aa_start, query_aa_end,
        print "len:", query_aa_end - query_aa_start, len(queryseq)
    ############################################################################

    # make a deepcopy; sbjct is needed unchanged for the next iteration
    # in the for loop, but here we want to trim of gap sequences
    sbjctseq = deepcopy(sbjct)
    sbjctaastart = deepcopy(sbjct_aa_start)
    sbjctaaend = deepcopy(sbjct_aa_end)

    # trim exterior gaps. A gap in one sequence is opposed by a residue in
    # the other one, so the coordinate of the *other* sequence is shifted
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart += 1
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start += 1
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend -= 1
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end -= 1

    # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM
    # profiles are build from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). Problem with HMM is, that in the result file
    # no information is written on where in the constructed HMM this hit
    # starts. This **sucks** because special care was taken in ABFGP code to
    # make sure the exact aa-coordinates of the applied sequences to ClustalW
    # are known. Hmmbuild here nullifies this effort by not giving start
    # coordinates. Therefore, we have to check the exact start position
    # of the HMM match on the queryorf.
    if queryseq.replace("-", "") != queryorf.getaas(query_aa_start,
                                                    query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-", "X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-", ""))
        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
        }
        # make coords dictionary for remapping
        coords = {
            'query_hmm': [0, seqlen],
            'query_orf': [queryorf.protein_startPY, queryorf.protein_endPY],
        }
        # perform clustalw multiple alignment
        (alignedseqs, alignment) = clustalw(seqs=seqdict)
        # strip exterior gaps
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

        if alignedseqs['query_hmm'].count("-") > 0:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjctseq_as_list = list(sbjctseq)
            for pos in range(0, len(alignedseqs['query_hmm'])):
                if alignedseqs['query_hmm'][pos] == "-":
                    sbjctseq_as_list.insert(pos, "-")
                # stop as soon as no gaps are left from this position on
                if alignedseqs['query_hmm'].find("-", pos) == -1:
                    break
            sbjctseq = "".join(sbjctseq_as_list)

        ########################################################################
        if verbose:
            print "\t", "FALSE::", sbjctseq, "[ WITH GAPS,SBJCT ]"
            print "\t", "FALSE::", queryseq, "[ WITH GAPS ]"
            for k, algseq in alignedseqs.iteritems():
                print "\t", "FALSE::", algseq, k, coords[k], len(algseq)
            print "\t", "FALSE::", sbjctseq, "SBJCT", len(sbjctseq)
            print "\t", "FALSE::", alignment, "ALMNT", len(alignment)
            print "\t", "SOLVED:", len(
                alignedseqs['query_orf']) == len(sbjctseq)
        ########################################################################

        # update query sequence & coordinates
        if len(alignedseqs['query_orf']) == len(sbjctseq):
            queryseq = alignedseqs['query_orf']
            query_aa_start = coords['query_orf'][0]
            query_aa_end = coords['query_orf'][1]
        else:
            # still not identical lengths. ClustalW recovery of HMM hit
            # failed miserably. For now: omit
            # TODO: resolve this case!!
            # example: --filewithloci examples/bilal/CFU_830450.bothss.csv
            # ## HMM clustalw input profile: False MAXSR True
            # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598]
            # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388]
            # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1)
            # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD'
            # hmmhit2pacbp m '+ ka + F W k + nLG Wl E d'
            # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID'
            # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34
            # hmmM: + ka + F W k + nLG Wl E d
            # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34
            # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34
            # FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ]
            # FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ]
            # FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70
            # FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70
            # FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71
            # FALSE:: **:*** *.: ::*:: * .* :.:*: * *: : :: ALMNT 70
            # SOLVED: False
            # Pacbp creation failed!
            return False, None

    if queryseq and sbjctseq:
        ################################################################
        if len(queryseq) != len(sbjctseq):
            # this will result in a exception to be raised:
            # pacb.exceptions.InproperlyAppliedArgument
            # print data here about what went wrong, then
            # just let the error be raised
            print queryseq, len(queryseq), sbjctseq, len(sbjctseq)
            print hmmhit
            print "Q:", query_aa_start, query_aa_end,
            print query_aa_end - query_aa_start, "len:", len(queryseq)
            print "S:", sbjctaastart, sbjctaaend,
            print sbjctaaend - sbjctaastart, "len:", len(sbjctseq)
        ################################################################

        pacbpinput = (queryseq, sbjctseq, query_aa_start, sbjctaastart)
        pacbp = PacbP(input=pacbpinput)
        # remove consistent internal gaps caused by HMM profile search
        pacbp.strip_consistent_internal_gaps()
        pacbp.source = 'hmmsearch'
        pacbporf = PacbPORF(pacbp, queryorf, sbjctorf)
        pacbporf.strip_unmatched_ends()
        if pacbporf.length == 0:
            # Pacbp creation failed!
            return False, None
        else:
            pacbporf.extend_pacbporf_after_stops()
            pacbpkey = pacbporf.construct_unique_key(queryNode, sbjctNode)
            # return unique key and pacbporf
            return (pacbpkey, queryNode, sbjctNode), pacbporf
    else:
        # Pacbp creation failed!
        return False, None
def improvealignment( cbg, verbose=False, allow_3p_optimization=True, allow_5p_optimization=True, maximal_cbg_identity=CBG_OPTIMIZE_MAXIMAL_IDENTITY, clustalw_gap_size=CBG_OPTIMIZE_CLUSTALW_GAP_SIZE, optimization_bitscore_ratio=CBG_OPTIMIZE_MINIMAL_BITSCORE_RATIO, optimization_identity_ratio=CBG_OPTIMIZE_MINIMAL_IDENTITY_RATIO): """ (Try to) Improve the multiple alignment of this CBG with clustalw @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph instance to optimize @type verbose: Boolean @param verbose: print debugging/intermediate information to STDOUT @type allow_3p_optimization: Boolean @param allow_3p_optimization: allow optimization(extension!) on the 3p side @type allow_5p_optimization: Boolean @param allow_5p_optimization: allow optimization(extension!) on the 5p side @type maximal_cbg_identity: float @param maximal_cbg_identity: do not optimize CBG when its GTG.identity() > this number @type clustalw_gap_size: integer @param clustalw_gap_size: split ClustalW-multiplealignment obtained PacbPs on gap size @type optimization_bitscore_ratio: float @param optimization_bitscore_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP @type optimization_identity_ratio: float @param optimization_identity_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP @attention: when a CBG is flanked by a lsrCBG in the GSG, it advised to set allow_*p_optimization to False @rtype: Boolean @return: is the CBG optimized or not """ IS_IMPROVED = False # if both allow_*p_optimization are False -> no optimization! 
if not allow_3p_optimization and not allow_5p_optimization: return False # check if there is a likely chance that we can optimize this cbg # This chance is defined by parameter if cbg.get_genetree().identity() > maximal_cbg_identity: return False # gather data of the current cbg to compare before/after clustalw optimization current_cbg_total_weight = cbg.total_weight() current_cbg_string = str(cbg) current_cbg_omsr = cbg.overall_minimal_spanning_range() current_cbg_maxsr = cbg.maximal_spanning_range() # get the orf's sequences in a dict and do clustalw seqs = cbg.getorfproteinsequences() (_algseqs, _algm) = clustalw(seqs=seqs) # check if there is at least a single aligned position if len(_algm) == _algm.count(' '): return False # get the position of the first and last aligned AA in the clustalw alignment firstalignedpos = 0 finalalignedpos = len(_algm) - 1 while _algm[firstalignedpos] == ' ': firstalignedpos += 1 while _algm[finalalignedpos] == ' ': finalalignedpos -= 1 # increase finalalignedpos+=1 for compatibility asa list slice finalalignedpos += 1 # translate clustalw multiple alignment start & end to OMSR coordinates # While doing this, check if the current OMSR is fully covered by the # ClustalW OMSR. In case of long orf sequences and small CBGs, # ClustalW is likely to produce out-of-range alignments! 
newomsr = {} OMSR_IS_COMPLETELY_COVERED = True for org in seqs.keys(): orf = cbg.get_orfs_of_graph(organism=org)[0] node = cbg.node_by_organism(org) omsrstart = orf.protein_startPY + ( firstalignedpos - _algseqs[org][0:firstalignedpos].count('-')) omsrend = omsrstart + ( finalalignedpos - firstalignedpos - _algseqs[org][firstalignedpos:finalalignedpos].count('-')) newomsr[org] = (omsrstart, omsrend) omsrunion = current_cbg_omsr[node].intersection( Set(range(omsrstart, omsrend + 1))) if len(omsrunion) < len(current_cbg_omsr[node]): OMSR_IS_COMPLETELY_COVERED = False if verbose: print org, len(omsrunion), " < ", len(current_cbg_omsr[node]) continue if verbose: print org, min(current_cbg_omsr[node]), max( current_cbg_omsr[node]), "new:", (omsrstart, omsrend), print "maxsr:", min(current_cbg_maxsr[node]), max( current_cbg_maxsr[node]), print node, orf, orf.protein_startPY, orf.protein_endPY, print len(_algseqs[org]), len( _algseqs[org]) - _algseqs[org].count('-'), orf.length / 3 # Check if current CBG OMSR is overlapping with clustalw OMSR if not OMSR_IS_COMPLETELY_COVERED: if verbose: print "NO improvement, ClustalW out-of-range-alignment" return False ####################################################################### if verbose: linesize = 100 print "<ClustalW obtained multiple alignment>" for offset in range(0, len(_algm), linesize): start = firstalignedpos + offset end = start + linesize if end > finalalignedpos: end = finalalignedpos if offset == 0 and finalalignedpos - firstalignedpos < linesize: end = finalalignedpos for org in seqs.keys(): print _algseqs[org][start:end], org print _algm[start:end] print "" if end == finalalignedpos: break print current_cbg_string cbg.printmultiplealignment() ####################################################################### # loop over the pairwise organism combinations and make new pacbps # but only if the new OMSR extends the known OMSR. 
# In this process, split the ClustalW PacbpOrfs for gaps # of size clustalw_gap_size for orgA, orgB in cbg.pairwisecrosscombinations_organism(): # get the current/original pacbporf pacbporf = cbg.get_pacbp_by_organisms(orgA, orgB) # are the new multiplealignment OMSR coords bigger as the current ones? spos = pacbporf._get_original_alignment_pos_start() epos = pacbporf._get_original_alignment_pos_end() isextended5p = (newomsr[orgA][0] < spos.query_pos, newomsr[orgB][0] < spos.sbjct_pos) isextended3p = (newomsr[orgA][1] - 1 > epos.query_pos, newomsr[orgB][1] - 1 > epos.sbjct_pos) # check if there is novel extention and on which side extention = None if isextended5p == (True, True) and isextended3p == (True, True): extention = 'both' # extention on both sides elif isextended5p == (True, True): extention = '5p' # extention on 5p side alone elif isextended3p == (True, True): extention = '3p' # extention on 3p side alone else: # no extention at all -> continue continue # Check if extention is alowed in this side # This check is recommended to be included for CBGs # that are neigbored/delimited/separated by lsrCBG(s) if not allow_3p_optimization and extention in ['both', '3p']: continue # not alowed! if not allow_5p_optimization and extention in ['both', '5p']: continue # not alowed! 
# get orf objects and aligned sequence parts orfA = cbg.get_orfs_of_graph(organism=orgA)[0] orfB = cbg.get_orfs_of_graph(organism=orgB)[0] seqA = _algseqs[orgA][firstalignedpos:finalalignedpos] seqB = _algseqs[orgB][firstalignedpos:finalalignedpos] nodeQ = cbg.node_by_organism(orgA) nodeS = cbg.node_by_organism(orgB) # make pacbp from this clustalw alignment and extend it alignment = (seqA, _algm[firstalignedpos:finalalignedpos], seqB) alignment = _remove_gaps_from_clustalw_alignment(alignment) coords = (newomsr[orgA][0], newomsr[orgA][1], newomsr[orgB][0], newomsr[orgB][1]) newpacbp = pacb.conversion.pacbp_from_clustalw(alignment=alignment, coords=coords) # check for gaps in the clustalw alignment; if so, split them and select the # pacbp that overlaps with the omsr if newpacbp.alignment_has_gaps(gap_size=clustalw_gap_size): splitted, status = pacb.splitting.split_pacb_on_gaps( newpacbp, gapsize=clustalw_gap_size) if not status: # pacbp cannot be splitted for some reason. # Ignore it and continue with the next orgA/orgB comparison continue split_is_compatible = False for splittedpacbp in splitted: if splittedpacbp.query_start <= min(current_cbg_omsr[nodeQ]) and\ splittedpacbp.query_end >= max(current_cbg_omsr[nodeQ]) and\ splittedpacbp.sbjct_start <= min(current_cbg_omsr[nodeS]) and\ splittedpacbp.sbjct_end >= max(current_cbg_omsr[nodeS]): newpacbp = splittedpacbp split_is_compatible = True # check - again - if this clustalw-obtained pacbp is an extention newomsr[orgA] = (newpacbp.query_start, newpacbp.query_end) newomsr[orgB] = (newpacbp.sbjct_start, newpacbp.sbjct_end) isextended5p = (newomsr[orgA][0] < spos.query_pos, newomsr[orgB][0] < spos.sbjct_pos) isextended3p = (newomsr[orgA][1] - 1 > epos.query_pos, newomsr[orgB][1] - 1 > epos.sbjct_pos) # check if there is novel extention and on which side extention = None if isextended5p == (True, True) and isextended3p == (True, True): extention = 'both' # extention on both sides elif isextended5p == (True, True): 
extention = '5p' # extention on 5p side alone elif isextended3p == (True, True): extention = '3p' # extention on 3p side alone else: split_is_compatible = False # break out of looping over the splits break # check if the split was compatible with the OMSR of the current CBG if not split_is_compatible: # pacbp splits rigth through the OMSR region we are interested in. # Ignore it and continue with the next orgA/orgB comparison continue # convert (splitted) pacbp into pacbporf newpacbporf = pacb.conversion.pacbp2pacbporf(newpacbp, orfA, orfB) # now merge the clustalw pacbporf with the existing blast pacbporf status3p, status5p = False, False if extention in ['3p', 'both']: merged, status3p = pacb.merging.merge_pacbporfs(pacbporf, newpacbporf, 'rigth', verbose=verbose) if extention in ['5p', 'both']: if extention == 'both': # do not merge `pacbporf` but `merged` -> it is changed 4 lines higher up! merged, status5p = pacb.merging.merge_pacbporfs( merged, newpacbporf, 'left', verbose=verbose) else: merged, status5p = pacb.merging.merge_pacbporfs( pacbporf, newpacbporf, 'left', verbose=verbose) if float(pacbporf.bitscore) == 0.0: print "ZeroDivisionError in creation!" # Only reset the old (pacbporf) by the new (merged) if: # True in (status3p, status5p) AND # orf.bitscore ratio >= optimization_bitscore_ratio AND # orf.identityscore >= optimization_identity_ratio # Be aware of a potential ZeroDivisionError in the bitscore ratio try: bitscore_ratio_check = (float(merged.bitscore) / float( pacbporf.bitscore)) >= optimization_bitscore_ratio except ZeroDivisionError: # do not take ratio, just check if bigger. # by default, optimization_bitscore_ratio < 1.0, so # checking for gte is even a more stringent check bitscore_ratio_check = merged.bitscore >= pacbporf.bitscore # ZeroDivisionError in the identityscore can not/hardly be possible. # identityscore == 0 means nothing that is alignable at all! 
# But, just be certain becasue bitscore ratio ZeroDivisionError occurred as well try: identity_ratio_check = ( merged.identityscore / pacbporf.identityscore) >= optimization_identity_ratio except ZeroDivisionError: # do not take ratio, just check if bigger. # by default, optimization_identity_ratio < 1.0, so # checking for gte is even a more stringent check identity_ratio_check = merged.identityscore >= pacbporf.identityscore if True in (status3p, status5p ) and bitscore_ratio_check and identity_ratio_check: # reset 'old' pacbporf by 'merged' nodeQ = cbg.node_by_organism(orgA) nodeS = cbg.node_by_organism(orgB) cbg.remove_pacbp(pacbporf, nodeQ, nodeS) # and reset the pacbporf into the cbg merged.extend_pacbporf_after_stops() merged.source = "clustalw-OPTIMIZED" newkey = merged.construct_unique_key(nodeQ, nodeS) cbg.pacbps[(newkey, nodeQ, nodeS)] = merged IS_IMPROVED = True if verbose: print "IMPROVEMENT", orgA, orgB ###merged.print_protein(_linesize=150) else: if verbose: print "DISCARDED", orgA, orgB continue if IS_IMPROVED: cbg.clear_cache() cbg.update_edge_weights_by_minimal_spanning_range() cbg.create_cache() if verbose: print "### OPTIMIZED CBG", cbg cbg.printmultiplealignment() # return status True -> this CBG is optimized! return True else: if verbose: print "### no CBG optimization" # return status False -> no CBG optimized! return False
def improvealignment(cbg,verbose=False, allow_3p_optimization=True, allow_5p_optimization=True, maximal_cbg_identity=CBG_OPTIMIZE_MAXIMAL_IDENTITY, clustalw_gap_size=CBG_OPTIMIZE_CLUSTALW_GAP_SIZE, optimization_bitscore_ratio=CBG_OPTIMIZE_MINIMAL_BITSCORE_RATIO, optimization_identity_ratio=CBG_OPTIMIZE_MINIMAL_IDENTITY_RATIO): """ (Try to) Improve the multiple alignment of this CBG with clustalw @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph instance to optimize @type verbose: Boolean @param verbose: print debugging/intermediate information to STDOUT @type allow_3p_optimization: Boolean @param allow_3p_optimization: allow optimization(extension!) on the 3p side @type allow_5p_optimization: Boolean @param allow_5p_optimization: allow optimization(extension!) on the 5p side @type clustalw_gap_size: integer @param clustalw_gap_size: split ClustalW-multiplealignment obtained PacbPs on this AA gap size @type maximal_cbg_identity: float @param maximal_cbg_identity: do not optimize CBG when its GTG.identity() > this number @type optimization_bitscore_ratio: float @param optimization_bitscore_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP @type optimization_identity_ratio: float @param optimization_identity_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP @attention: when a CBG is flanked by a lsrCBG in the GSG, it advised to set allow_*p_optimization to False @rtype: Boolean @return: is the CBG optimized or not """ IS_IMPROVED = False # if both allow_*p_optimization are False -> no optimization! 
if not allow_3p_optimization and not allow_5p_optimization: return False # check if there is a likely chance that we can optimize this cbg # This chance is defined by parameter if cbg.get_genetree().identity() > maximal_cbg_identity: return False # gather current CBG data to compare before/after ClustalW optimization current_cbg_total_weight = cbg.total_weight() current_cbg_string = str(cbg) current_cbg_omsr = cbg.overall_minimal_spanning_range() current_cbg_maxsr = cbg.maximal_spanning_range() # get the orf's sequences in a dict and do clustalw seqs = cbg.getorfproteinsequences() (_algseqs,_algm) = clustalw(seqs=seqs) # check if there is at least a single aligned position if len(_algm) == _algm.count(' '): return False # get position of the first and last aligned AA in the clustalw alignment firstalignedpos = 0 finalalignedpos = len(_algm)-1 while _algm[firstalignedpos] == ' ': firstalignedpos+=1 while _algm[finalalignedpos] == ' ': finalalignedpos-=1 # increase finalalignedpos+=1 for compatibility asa list slice finalalignedpos+=1 # translate ClustalW multiple alignment start & end to OMSR coordinates # While doing this, check if the current OMSR is fully covered by the # ClustalW OMSR. In case of long orf sequences and small CBGs, # ClustalW is likely to produce out-of-range alignments! 
newomsr = {} OMSR_IS_COMPLETELY_COVERED = True for org in seqs.keys(): orf = cbg.get_orfs_of_graph(organism=org)[0] node = cbg.node_by_organism(org) omsrstart = orf.protein_startPY +\ ( firstalignedpos - _algseqs[org][0:firstalignedpos].count('-') ) omsrend = omsrstart + ( finalalignedpos - firstalignedpos -\ _algseqs[org][firstalignedpos:finalalignedpos].count('-') ) newomsr[org] = (omsrstart,omsrend) # get the union between cirrent CBGs OMSR and this novel OMSR omsrunion = current_cbg_omsr[node].intersection( Set(range( omsrstart, omsrend+1 ) ) ) if len(omsrunion) < len(current_cbg_omsr[node]): # no, OMSR shrunk in stead of increased OMSR_IS_COMPLETELY_COVERED = False #################################################################### if verbose: print org, len(omsrunion), " < ", len(current_cbg_omsr[node]) #################################################################### # continue here; for this Organism identifier no improvement continue ######################################################################## if verbose: print org, min(current_cbg_omsr[node]), max(current_cbg_omsr[node]), print "new:", (omsrstart,omsrend), print "maxsr:", min(current_cbg_maxsr[node]), print max(current_cbg_maxsr[node]), node, orf , print orf.protein_startPY, orf.protein_endPY, len(_algseqs[org]), print len(_algseqs[org])-_algseqs[org].count('-'), orf.length/3 ######################################################################## # Check if current CBG OMSR is overlapping with clustalw OMSR if not OMSR_IS_COMPLETELY_COVERED: ######################################################################## if verbose: print "NO improvement, ClustalW out-of-range-alignment" ######################################################################## return False ############################################################################ if verbose: linesize=100 print "<ClustalW obtained multiple alignment>" for offset in range(0,len(_algm),linesize): start = firstalignedpos + offset 
end = start + linesize if end > finalalignedpos: end = finalalignedpos if offset==0 and finalalignedpos-firstalignedpos < linesize: end = finalalignedpos for org in seqs.keys(): print _algseqs[org][start:end], org print _algm[start:end] print "" if end == finalalignedpos: break print current_cbg_string cbg.printmultiplealignment() ############################################################################ # loop over the pairwise organism combinations and make new pacbps # but only if the new OMSR extends the known OMSR. # In this process, split the ClustalW PacbpOrfs for gaps # of size clustalw_gap_size for orgA,orgB in cbg.pairwisecrosscombinations_organism(): # get the current/original pacbporf pacbporf = cbg.get_pacbp_by_organisms(orgA,orgB) # check if there is novel extention and on which side extention = _does_clustalw_omsr_extend_pacbporf_omsr( pacbporf,orgA,orgB,newomsr) # Check if extention is alowed in this side. This check is recommended # to be included for CBGs # that are neigbored/delimited by lsrCBG(s) if not allow_3p_optimization and extention in ['both','3p']: continue # not alowed! if not allow_5p_optimization and extention in ['both','5p']: continue # not alowed! if extention == None: continue # not alowed! 
# get orf objects and aligned sequence parts orfA = cbg.get_orfs_of_graph(organism=orgA)[0] orfB = cbg.get_orfs_of_graph(organism=orgB)[0] seqA = _algseqs[orgA][firstalignedpos:finalalignedpos] seqB = _algseqs[orgB][firstalignedpos:finalalignedpos] nodeQ = cbg.node_by_organism(orgA) nodeS = cbg.node_by_organism(orgB) # make pacbp from this clustalw alignment and extend it alignment = ( seqA, _algm[firstalignedpos:finalalignedpos], seqB ) coords = ( newomsr[orgA][0], newomsr[orgA][1], newomsr[orgB][0], newomsr[orgB][1] ) newpacbp = pacb.conversion.pacbp_from_clustalw( alignment=alignment,coords=coords) # check for gaps in the clustalw alignment; if so, split them and # select the PacbP that overlaps with the OMSR if newpacbp.alignment_has_gaps(gap_size=clustalw_gap_size): splitted,status = pacb.splitting.split_pacb_on_gaps( newpacbp,gapsize=clustalw_gap_size) if not status: # pacbp cannot be splitted for some reason. # Ignore it and continue with the next orgA/orgB comparison continue split_is_compatible = False for splittedpacbp in splitted: if splittedpacbp.query_start <= min(current_cbg_omsr[nodeQ]) and\ splittedpacbp.query_end >= max(current_cbg_omsr[nodeQ]) and\ splittedpacbp.sbjct_start <= min(current_cbg_omsr[nodeS]) and\ splittedpacbp.sbjct_end >= max(current_cbg_omsr[nodeS]): # this is the splitted PacbP that overlaps with current OMSR newpacbp = splittedpacbp split_is_compatible = True # update the clustalwOMSR coords (newomsr) newomsr[orgA] = (newpacbp.query_start,newpacbp.query_end) newomsr[orgB] = (newpacbp.sbjct_start,newpacbp.sbjct_end) # check - again - if this clustalw OMSR is an extention # check with the ORIGINAL pacbporf! 
extention = _does_clustalw_omsr_extend_pacbporf_omsr( pacbporf,orgA,orgB,newomsr) # if no extention, set split as incompatible if extention == None: split_is_compatible = False # break out of looping over the splits break # check if the split was compatible with the OMSR of the current CBG if not split_is_compatible: # pacbp splits rigth through the relevant OMSR region. # Ignore it and continue with the next orgA/orgB comparison continue # If here, convert the (splitted) pacbp into pacbporf newpacbporf = pacb.conversion.pacbp2pacbporf(newpacbp,orfA,orfB) # now merge the clustalw pacbporf with the existing blast pacbporf status3p, status5p = False,False if extention in ['3p','both']: merged, status3p = pacb.merging.merge_pacbporfs( pacbporf,newpacbporf,'rigth',verbose=verbose) if extention in ['5p','both']: if extention == 'both': # take `merged` as input pacbporf, not `pacbporf` -> # it is changed 4 lines higher up! merged, status5p = pacb.merging.merge_pacbporfs( merged,newpacbporf,'left',verbose=verbose) else: merged, status5p = pacb.merging.merge_pacbporfs( pacbporf,newpacbporf,'left',verbose=verbose) # Only reset the old (pacbporf) by the new (merged) if: # True in (status3p, status5p) AND # orf.bitscore ratio >= optimization_bitscore_ratio AND # orf.identityscore >= optimization_identity_ratio try: # Be aware of a potential ZeroDivisionError in the bitscore ratio bitscore_ratio_check = ( float(merged.bitscore) /\ float(pacbporf.bitscore) ) >= optimization_bitscore_ratio except ZeroDivisionError: # do not take ratio, just check if bigger. # by default, optimization_bitscore_ratio < 1.0, so # checking for gte is even a more stringent check bitscore_ratio_check = merged.bitscore >= pacbporf.bitscore try: # ZeroDivisionError in the identityscore can not/hardly be possible. # Identityscore == 0 means nothing that is alignable at all! # But, safety first ... 
identity_ratio_check = ( merged.identityscore /\ pacbporf.identityscore ) >= optimization_identity_ratio except ZeroDivisionError: # do not take ratio, just check if bigger. # by default, optimization_identity_ratio < 1.0, so # checking for gte is even a more stringent check identity_ratio_check = merged.identityscore >=pacbporf.identityscore if True in (status3p, status5p) and\ bitscore_ratio_check and identity_ratio_check: # reset 'old' pacbporf by 'merged' nodeQ = cbg.node_by_organism(orgA) nodeS = cbg.node_by_organism(orgB) cbg.remove_pacbp(pacbporf,nodeQ,nodeS) # and reset the pacbporf into the cbg merged.extend_pacbporf_after_stops() merged.source="clustalw-OPTIMIZED" newkey = merged.construct_unique_key(nodeQ,nodeS) cbg.pacbps[(newkey,nodeQ,nodeS)] = merged IS_IMPROVED = True #################################################################### if verbose: print "IMPROVEMENT", orgA, orgB #################################################################### else: #################################################################### if verbose: print "DISCARDED", orgA, orgB #################################################################### continue if IS_IMPROVED: # CBG is succesfully changed. Recreate cache etc. cbg.clear_cache() cbg.update_edge_weights_by_minimal_spanning_range() cbg.create_cache() ############################################################ if verbose: print "### OPTIMIZED CBG", cbg cbg.printmultiplealignment() ############################################################ # return status True -> this CBG is optimized! return True else: ############################################################ if verbose: print "### no CBG optimization" ############################################################ # return status False -> no CBG optimized! return False
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA, verbose=False,**kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ,intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase) ######################################################################## if verbose: print (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, kwargs['max_nt_offset'] 
######################################################################## if abs(distDnt-distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append( ( intQ, intS ) ) ############################################################################ if verbose: for intQ,intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ,intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + 
aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart= qEnd - max([distA,distD]) sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA,distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA,distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart= sEnd + min([distA,distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq) headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW #################################################### # 
align the sequences with clustalw seqs = { headerQ: qSeq, headerS: sSeq } (alignedseqs,alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=( alignedseqs[headerQ], alignment, alignedseqs[headerS] ), coords=(qStart,qEnd,sStart,sEnd) ) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt-distAnt) intS._distance = abs(distDnt-distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA) succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf) succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [ cig_pacbporf ] intS._linked_to_pacbporfs = [ cig_pacbporf ] # append to found_cig_list found_cig_list.append( ( intQ, intS, cig_pacbporf ) ) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS, min_pacbporf_identityscore=0.20,verbose=True): """ """ if not signalpexonseqs.has_key(OPTIONS.target): return False is_any_pacbporf_added = False for targetSPexon in signalpexonseqs[OPTIONS.target]: target = OPTIONS.target for informant,infSPlist in signalpexonseqs.iteritems(): if informant == OPTIONS.target: continue # check if informant has been deleted in the meanwhile if informant not in PCG.organism_set(): continue # list to store signalp exons into signalpexon_pacbp_list = [] # get ordered pacbporfs fromt he PCG thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(OPTIONS.target,informant)) if not thepacbporfs: # no alignments present for this organism (can happen!) continue for informantSPexon in infSPlist: coords = [ targetSPexon.protein_start(), targetSPexon.protein_end(), informantSPexon.protein_start(), informantSPexon.protein_end(), ] # prior to making ClustalW-PacbP, check PacbPCOORD placeability # into the list of pacbporfs pacbpCoordsObj = PacbPCOORDS(input=( targetSPexon.proteinsequence(), informantSPexon.proteinsequence(), targetSPexon.protein_start(), informantSPexon.protein_start(), ) ) if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]: # *NOT* placable in current ordered list of PacbPORFS continue dist = pacbpCoordsObj.distance_towards(thepacbporfs[0]) if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3: # WAY TO FAR in front of current gene structure parts. # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS continue elif dist == 0: # NOT placeable in front of the rest of the PacbPORFS. 
continue else: pass # perform ClustalW alignment on the SP exons (alignedseqs,alignment) =\ clustalw( seqs= { OPTIONS.target: targetSPexon.proteinsequence(), informant: informantSPexon.proteinsequence() } ) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=( alignedseqs[OPTIONS.target], alignment, alignedseqs[informant] ), coords=coords ) # is there any alignment constructed? if not pacbp: continue # ignore (very) poor identyscore alignments if pacbp.identityscore < min_pacbporf_identityscore: continue # if here make extended pacbpORF signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf,informantSPexon.orf) signalpexonPacbpORF.extend_pacbporf_after_stops() # and store in signalpexon_pacbp_list signalpexon_pacbp_list.append( signalpexonPacbpORF ) ################################################################ if verbose: print alignedseqs[OPTIONS.target], OPTIONS.target print alignment print alignedseqs[informant], informant if pacbp: print pacbp, (OPTIONS.target, targetSPexon.orf.id), print (informant, informantSPexon.orf.id), print "DISTANCE::", dist pacbp.print_protein() print "" ################################################################ # If there are signalpexon-guided pacbporfs found, store the one # with the highest bitscore if signalpexon_pacbp_list: signalpexon_pacbp_list = order_list_by_attribute( signalpexon_pacbp_list,order_by='bits',reversed=True) # store best bitscoring pacbporf to PCG signalp_pacbporf = signalpexon_pacbp_list[0] pacbporf2PCG(signalp_pacbporf,OPTIONS.target,informant,PCG,source='SignalP-ClustalW') is_any_pacbporf_added = True #################################################################### if verbose: print "SignalP Exon added to PCG:", signalp_pacbporf, informant #################################################################### else: pass # return pointer is_any_pacbporf_added return is_any_pacbporf_added
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None, strip_nonaligned_residues=False, verbose=False,**kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) ) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = 
_remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) ) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node])+1,theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del(coords[node]) else: raise "WHAT ELSE!?" ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( prevcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] ) end = max(coords[nodeCbg])+1 coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # decrease coord range by nextcbg if applicable if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( nextcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] 
nodeNext = nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] ) coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF", "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF"]: maxlength = max([ len(vlist) for vlist in coords.values() ]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k,seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del(coords[k]) del(fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 1: 
return None, {} # rewrite coords to (min,max) tuple coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ]) # perform clustalw multiple alignment (alignedseqs,alignment) = clustalw( seqs= fastaseqs ) # strip exterior gaps in case of OMSR/MINSR area if area in ["OMSR","MINSR"]: alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs,alignment,coords = strip_poorly_supported_tails( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 ) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs,alignment,coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None for node,algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs,fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile ) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def hmmhit2pacbp(queryorf,queryorg,querycoords,sbjctorf,sbjctorg,hmmhit,verbose=False): """ """ # trim hmmhit for unmatched characters ( sbjct_header, sbjct_start, sbjct_end, query_start, query_end, query, match, sbjct, score, expect ) = hmmhit while match and match[0] == ' ': query = query[1:] match = match[1:] sbjct = sbjct[1:] sbjct_start+=1 query_start+=1 while match and match[-1] == ' ': query = query[0:-1] match = match[0:-1] sbjct = sbjct[0:-1] sbjct_end-=1 query_end-=1 # get orf, node and AA and DNA coordinates of this sbjct hit; # correct for -1 offset in start coordinate!! sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY sbjctNode = (sbjctorg,sbjctorf.id) query = query.replace(".","-").upper() sbjct = sbjct.replace(".","-").upper() ############################################################################ if verbose: print "hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % ( sbjctorg,sbjctorf.id) print "hmmhit2pacbp Q '%s'" % query print "hmmhit2pacbp m '%s'" % match print "hmmhit2pacbp S '%s'" % sbjct print "hmmQ:", query, query_start, query_end, "gaps:", print query.count('-'), len(query) print "hmmM:", match print "hmmS:", sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end, print "len:", sbjct_aa_end-sbjct_aa_start , len(sbjct) ############################################################################ # get Node and sequence of the query queryNode = (queryorg,queryorf.id) queryseq = deepcopy(query) # calculate query sequence position on queryorf query_aa_start = querycoords[0] + query_start - 1 query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-') ############################################################################ if verbose: print "hmmq:", queryseq, queryNode, query_aa_start, query_aa_end, print "len:", query_aa_end-query_aa_start, len(queryseq) ############################################################################ # make a deepcopy; sbjct is needed 
unchanged for the next iteration # in the for loop, but here we want to trim of gap sequences sbjctseq = deepcopy(sbjct) sbjctaastart = deepcopy(sbjct_aa_start) sbjctaaend = deepcopy(sbjct_aa_end) while queryseq and queryseq[0] == '-': queryseq = queryseq[1:] sbjctseq = sbjctseq[1:] sbjctaastart+=1 while sbjctseq and sbjctseq[0] == '-': queryseq = queryseq[1:] sbjctseq = sbjctseq[1:] query_aa_start+=1 while queryseq and queryseq[-1] == '-': queryseq = queryseq[0:-1] sbjctseq = sbjctseq[0:-1] sbjctaaend-=1 while sbjctseq and sbjctseq[-1] == '-': queryseq = queryseq[0:-1] sbjctseq = sbjctseq[0:-1] query_aa_end-=1 # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM # profiles are build from clustalw alignments which have loosely aligned # tails (SPRDIF sequences). Problem with HMM is, that in the result file # no information is written on where in teh constructed HMM this hit # starts. This **sucks** because special care was taken in ABFGP code to # make shure the exact aa-coordinates of the applied sequences to ClustalW # are known. Hmmbuild here nullifies this effort by not giving start # coordinates. Therefore, we have to check the exact start position # of the HMM match on the queryorf. if queryseq.replace("-","") != queryorf.getaas(query_aa_start,query_aa_end): # obtain (search) query sequence, replace gaps by X symbol searchqueryseq = queryseq.upper().replace("-","X") # count length of the query sequence; here IGNORE THE GAPS!! 
seqlen = len(queryseq.upper().replace("-","")) # make fasta sequence dictionary seqdict = { 'query_hmm': searchqueryseq, 'query_orf': queryorf.protein_sequence, } # make coords dictionary for remapping coords = { 'query_hmm':[0,seqlen], 'query_orf':[queryorf.protein_startPY,queryorf.protein_endPY], } # perform clustalw multiple alignment (alignedseqs,alignment) = clustalw( seqs= seqdict ) # strip exterior gaps alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) if alignedseqs['query_hmm'].count("-") > 0: # in (very) exceptional cases, gaps can be introduced in the # clustalw alignment in the HMM seq. This normally does not # occur! Fix this here by placing gaps in sbjctseq too. sbjctseq_as_list = list(sbjctseq) for pos in range(0,len(alignedseqs['query_hmm'])): if alignedseqs['query_hmm'][pos] == "-": sbjctseq_as_list.insert(pos,"-") if alignedseqs['query_hmm'].find("-",pos) == -1: break sbjctseq = "".join(sbjctseq_as_list) ######################################################################## if verbose: print "\t", "FALSE::", sbjctseq, "[ WITH GAPS,SBJCT ]" print "\t", "FALSE::", queryseq, "[ WITH GAPS ]" for k,algseq in alignedseqs.iteritems(): print "\t", "FALSE::", algseq, k, coords[k], len(algseq) print "\t", "FALSE::", sbjctseq, "SBJCT", len(sbjctseq) print "\t", "FALSE::", alignment, "ALMNT", len(alignment) print "\t", "SOLVED:", len(alignedseqs['query_orf']) == len(sbjctseq) ######################################################################## # update query sequence & coordinates if len(alignedseqs['query_orf']) == len(sbjctseq): queryseq = alignedseqs['query_orf'] query_aa_start = coords['query_orf'][0] query_aa_end = coords['query_orf'][1] else: # still not identical lengths. ClustalW recovery of HMM hit # failed miserably. For now: omit # TODO: resolve this case!! 
# example: --filewithloci examples/bilal/CFU_830450.bothss.csv # ## HMM clustalw input profile: False MAXSR True # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598] # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388] # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1) # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD' # hmmhit2pacbp m '+ ka + F W k + nLG Wl E d' # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID' # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34 # hmmM: + ka + F W k + nLG Wl E d # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34 # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34 # FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ] # FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ] # FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70 # FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70 # FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71 # FALSE:: **:*** *.: ::*:: * .* :.:*: * *: : :: ALMNT 70 # SOLVED: False # Pacbp creation failed! 
return False, None if queryseq and sbjctseq: ################################################################ if len(queryseq) != len(sbjctseq): # this will result in a exception to be raised: # pacb.exceptions.InproperlyAppliedArgument # print data here about what went wrong, then # just let the error be raised print queryseq, len(queryseq), sbjctseq, len(sbjctseq) print hmmhit print "Q:", query_aa_start, query_aa_end, print query_aa_end - query_aa_start, "len:", len(queryseq) print "S:", sbjctaastart, sbjctaaend, print sbjctaaend - sbjctaastart, "len:",len(sbjctseq) ################################################################ pacbpinput = (queryseq,sbjctseq,query_aa_start,sbjctaastart) pacbp = PacbP(input=pacbpinput) # remove consistent internal gaps caused hy HMM profile search pacbp.strip_consistent_internal_gaps() pacbp.source = 'hmmsearch' pacbporf = PacbPORF(pacbp,queryorf,sbjctorf) pacbporf.strip_unmatched_ends() if pacbporf.length==0: # Pacbp creation failed! return False, None else: pacbporf.extend_pacbporf_after_stops() pacbpkey = pacbporf.construct_unique_key(queryNode,sbjctNode) # return unique key and pacbporf return (pacbpkey,queryNode,sbjctNode), pacbporf else: # Pacbp creation failed! return False, None
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs[ 'cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=verbose, **kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[ 'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ, intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) ######################################################################## if verbose: print(intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, 
kwargs['max_nt_offset'] ######################################################################## if abs(distDnt - distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append((intQ, intS)) ############################################################################ if verbose: for intQ, intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ, intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = 
(aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart = qEnd - max([distA, distD]) sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA, distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA, distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart = sEnd + min([distA, distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq) headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, ( distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW 
#################################################### # align the sequences with clustalw seqs = {headerQ: qSeq, headerS: sSeq} (alignedseqs, alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment, alignedseqs[headerS]), coords=(qStart, qEnd, sStart, sEnd)) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ, pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ, pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt - distAnt) intS._distance = abs(distDnt - distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf) succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [cig_pacbporf] intS._linked_to_pacbporfs = [cig_pacbporf] # append to found_cig_list found_cig_list.append((intQ, intS, cig_pacbporf)) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list