def make_gaps_consistent(self):
    """make_gaps_consistent() : Impose the master's gap pattern on the other entries.

    For every entry at index 1 and above (index 0 is skipped -- presumably
    it is the master entry itself; confirm against the class):

      * columns where the master sequence has a gap are set to '-';
      * columns where the master has a residue but the entry has a gap
        are set to '?'.

    Each entry's ``seq`` attribute is replaced with the rewritten string.
    An entry shorter than the master sequence raises IndexError.
    """
    master_seq = self.master.seq
    for entry_idx in xrange(1, len(self)):
        entry = self[entry_idx]
        # Mutable character buffer so single positions can be overwritten.
        chars = array('c', entry.seq)
        for pos, master_char in enumerate(master_seq):
            if isGap(master_char):
                chars[pos] = '-'
                continue
            # Master has a residue here; flag a gap in the entry.
            if isGap(chars[pos]):
                chars[pos] = '?'
        entry.seq = chars.tostring()
def remove_gaps(self, template_seq=None):
    """remove_gaps([template_seq]) : Remove alignment columns gapped in the template.

    Only applicable if sequences are aligned (at least within each
    EntryGroup).

    For each group iterated from ``self``, the template is either
    ``template_seq`` (when given and non-empty) or the sequence of the
    group's first entry (used only when ``hasMasterEntry()`` is true;
    groups without a master entry are skipped).  Every column in which the
    template has a gap is deleted from all entries of that group.  Columns
    where only a non-template entry has a gap are kept.

    If template_seq is given, all entries must be aligned to template_seq.

    Note that without template_seq each group uses its own template, so
    this function will unalign entries belonging to different groups!

    Raises AssertionError if an entry's length differs from the template's.
    """
    if template_seq:
        assert isinstance(template_seq[0], str)

    for eg in self:
        if template_seq:
            template = template_seq
        elif eg.hasMasterEntry():
            template = eg[0].seq
        else:
            continue

        # Check the whole group before touching any entry, so a misaligned
        # group is left completely unmodified.
        for entry in eg:
            assert len(entry.seq) == len(template)

        keep = [i for i, c in enumerate(template) if not isGap(c)]
        for entry in eg:
            seq = entry.seq
            entry.seq = ''.join([seq[i] for i in keep])
def remove_gapped_columns(seqs, master=0):
    """Keep only those columns where the master sequence has no gap.

    ``seqs`` is modified in place: each element is replaced by the result
    of ``__stringRemoveIndeces(seq, gap_columns)``, where ``gap_columns``
    lists the indices at which ``seqs[master]`` has a gap (presumably the
    helper returns the string without those positions -- confirm against
    its definition).

    Raises AssertionError ("Sequences not aligned") if the sequences do
    not all have the same length.  The check runs before any element is
    replaced, so ``seqs`` is left untouched on failure.
    """
    width = len(seqs[master])
    # Validate up front; checking inside the rewrite loop would leave
    # ``seqs`` partially modified when a later sequence is misaligned.
    for seq in seqs:
        assert len(seq) == width, "Sequences not aligned"

    gap_columns = [i for i, c in enumerate(seqs[master]) if isGap(c)]
    for i, seq in enumerate(seqs):
        seqs[i] = __stringRemoveIndeces(seq, gap_columns)
def find_aligned_residues(seq1, seq2, equiv=None, eqchars=":."):
    """Return the ungapped residue indices of the paired columns.

    ``seq1`` and ``seq2`` must be aligned (same length).  The result is a
    pair of equally long lists ``(numbering_list1, numbering_list2)``:
    for every paired column, the 0-based index of that column's residue
    counted along the ungapped ``seq1`` and ``seq2`` respectively.

    Which columns count as paired:

      * ``equiv`` is None -- columns where neither sequence has a gap;
      * ``equiv`` given   -- columns whose ``equiv`` character is one of
        ``eqchars``, regardless of gaps in the sequences.

    Raises AssertionError if the lengths of ``seq1``, ``seq2`` and (when
    given) ``equiv`` differ.
    """
    # Identity test: ``None == equiv`` would call equiv's own __eq__, which
    # misbehaves for array-like objects with elementwise comparison.
    gaps_decide = equiv is None

    assert len(seq1) == len(seq2)
    assert gaps_decide or len(seq1) == len(equiv)

    numbering_list1 = []
    numbering_list2 = []
    n1 = 0  # residues of seq1 seen so far
    n2 = 0  # residues of seq2 seen so far

    for pos, (res1, res2) in enumerate(zip(seq1, seq2)):
        present1 = not isGap(res1)
        present2 = not isGap(res2)

        if gaps_decide:
            paired = present1 and present2
        else:
            paired = equiv[pos] in eqchars

        # Record the indices before advancing the counters.
        if paired:
            numbering_list1.append(n1)
            numbering_list2.append(n2)
        if present1:
            n1 += 1
        if present2:
            n2 += 1

    return numbering_list1, numbering_list2
def __get_pid_counts(seq1, seq2):
    """Count the raw quantities used for percent-identity calculations.

    ``seq1`` and ``seq2`` must be aligned (same length); otherwise an
    AssertionError showing both sequences is raised.

    Returns the tuple ``(l1, l2, cov, idn)``:
      l1  -- ungapped length of seq1
      l2  -- ungapped length of seq2
      cov -- number of columns where both sequences have a residue
      idn -- number of those columns where the two residues are equal
    """
    assert len(seq1) == len(seq2), "\n>seq1\n%s\n>seq2\n%s\n" % (seq1, seq2)

    ungapped1 = 0
    ungapped2 = 0
    paired = 0
    identical = 0

    for res1, res2 in zip(seq1, seq2):
        present1 = not isGap(res1)
        present2 = not isGap(res2)
        if present1:
            ungapped1 += 1
        if present2:
            ungapped2 += 1
        if present1 and present2:
            paired += 1
            if res1 == res2:
                identical += 1

    return (ungapped1, ungapped2, paired, identical)
def consensus(seqs):
    """Tabulate the residue composition of every alignment column.

    ``seqs`` is a sequence of aligned (equal-length) sequences.

    Returns ``(cons, totals)``, two lists with one element per column:
      cons[i]   -- list of ``(residue, count)`` tuples for column i,
                   sorted by residue (not by count); gaps are excluded
      totals[i] -- number of non-gap residues in column i

    An empty ``seqs`` yields ``([], [])``.

    Raises AssertionError if the sequences differ in length.
    """
    # Nothing to tabulate; also avoids indexing seqs[0] on empty input.
    if len(seqs) == 0:
        return [], []

    length = len(seqs[0])
    for s in seqs:
        assert len(s) == length, "Input sequences must be aligned (of the same length)"

    cons = []
    totals = []
    # Lengths are verified equal, so zip(*seqs) yields exactly the columns.
    for column in zip(*seqs):
        counts = {}
        for c in column:
            if not isGap(c):
                counts[c] = counts.get(c, 0) + 1
        # Keys are unique, so sorting the items orders them by residue.
        cons.append(sorted(counts.items()))
        totals.append(sum(counts.values()))
    return cons, totals