def align_consensus(cons_file_1, cons_file_2):
    """Align two consensus sequences with needle and map their gaps.

    Runs EMBOSS ``needle`` on the two inputs, writing the result to
    ``map_cons.needle`` in the current working directory (overwriting
    whatever the needle wrapper overwrites), then reads back the first
    alignment found in that file.

    Args:
        cons_file_1: first sequence input handed to ``needle``.
        cons_file_2: second sequence input handed to ``needle``.

    Returns:
        A list with one integer per alignment column:
        ``1`` if the first sequence has a gap in that column,
        ``2`` if the second sequence has a gap,
        ``0`` if neither has one.

    Raises:
        AssertionError: if a column holds a gap in both sequences.
    """
    from pythonlib import EmbossStandalone
    from pythonlib.MarkxIO import Markx10Iterator

    needle_exe = 'needle'
    out_file = 'map_cons.needle'
    EmbossStandalone.needle(needle_exe, cons_file_1, cons_file_2,
                            out=out_file, aglobal3='False')

    # Read everything we need while the file is open, then close it
    # deterministically (the handle used to be leaked).
    with open(out_file) as handle:
        alignment = Markx10Iterator(handle).next()
        cons_1 = alignment.get_seq_by_num(0).tostring().upper()
        cons_2 = alignment.get_seq_by_num(1).tostring().upper()

    # Named gap_map (not `map`) so the builtin is not shadowed.
    gap_map = []
    for base_1, base_2 in zip(cons_1, cons_2):
        assert not (base_1 == '-' and base_2 == '-'), 'not two gaps'
        if base_1 == '-':
            gap_map.append(1)
        elif base_2 == '-':
            gap_map.append(2)
        else:
            gap_map.append(0)
    return gap_map
length += float(len_seq) length2 += float(len_seq * len_seq) # readdict[read.] = [seq,len_seq] n += 1. meanlr = length / n stdlr = math.sqrt((n * length2 - length * length) / (n * n - n)) allowed_length = [meanlr - acclength * stdlr, meanlr + (1 + acclength) * stdlr] print >> sys.stderr, 'Allowed interval for length is', allowed_length if not os.path.isfile('tmp_align_f.needle'): print >> sys.stderr, 'needle forward' EmbossStandalone.needle(needle_exe, options.ref, f_fasta_forward_filename, out='tmp_align_f.needle', gapopen=6.0, gapext=3.0, aglobal3='False') """ else: print >>sys.stderr, 'The alignment file tmp_align_f.needle is already present' statinfo = os.stat('tmp_align_f.needle') age_sec = time.time() - statinfo.st_mtime if age_sec > 3600: print >>sys.stderr, 'Warning: it was modified more than an hour ago' age = time.gmtime(age_sec) print >>sys.stderr, 'If you want to run the alignment again, remove it' print >>sys.stderr, "using existing 'tmp_align_f.needle'..." """
def _merge_read_with_reference(refseq, readseq):
    """Collapse one pairwise alignment into a gap-free read.

    Both arguments are the aligned (gapped) rows of the same alignment.
    Inside the region where both rows have started and neither has ended:
    columns where the read has a gap are filled with the reference
    character, columns where only the reference has a gap are dropped,
    and all other columns keep the read character.

    Returns:
        ``[read_string, align_start]`` where ``align_start`` is the 0-based
        alignment column at which the overlapping region begins.
    """
    q_align_start = len(readseq) - len(readseq.lstrip('-'))
    q_align_end = len(readseq.rstrip('-'))
    m_align_start = len(refseq) - len(refseq.lstrip('-'))
    m_align_end = len(refseq.rstrip('-'))
    align_start = max(m_align_start, q_align_start)
    align_end = min(m_align_end, q_align_end)

    # NOTE(review): align_end is already an exclusive index (a length after
    # rstrip), so the "+ 1" below also takes the first column past the
    # overlap. Kept as in the original because downstream code may rely on
    # it -- confirm whether this extra column is intended.
    columns = zip(refseq[align_start:align_end + 1],
                  readseq[align_start:align_end + 1])
    merged = []
    for ref_base, read_base in columns:
        if read_base == '-':
            # Gap in the read: borrow the reference character.
            merged.append(ref_base)
        elif ref_base != '-':
            merged.append(read_base)
        # else: only the reference has a gap -> the column is discarded.
    return [''.join(merged), align_start]


def align_reads(filename):
    """Align every read, forward and reverse-complemented, to the reference.

    The reads in the FASTA file *filename* are written to ``tmp_reads_f.fas``
    (as they are) and ``tmp_reads_r.fas`` (reverse-complemented) and each
    set is aligned with EMBOSS ``needle`` against the module-level
    ``ref_genome``. The alignments are cached in ``tmp_align_f.needle`` and
    ``tmp_align_r.needle``: if one of these files already exists in the
    current directory it is reused and needle is not run again.

    For every read, the orientation with the strictly higher ``sw_score``
    is kept (ties go to the reverse orientation) and the alignment is
    collapsed by ``_merge_read_with_reference``.

    Args:
        filename: path of the FASTA file holding the reads.

    Returns:
        A dictionary mapping each read id to ``[sequence, align_start]``.

    Raises:
        AssertionError: if the forward and reverse alignment files do not
            list the reads in the same order.
    """
    from pythonlib import EmbossStandalone
    from pythonlib.MarkxIO import Markx10Iterator

    needle_exe = 'needle'
    aligned_reads = {}

    with open(filename) as f_fasta:
        seqlist = list(SeqIO.parse(f_fasta, 'fasta'))
    countreads = len(seqlist)

    # forward...
    f_fasta_forward_filename = 'tmp_reads_f.fas'
    with open(f_fasta_forward_filename, 'w') as f_fasta_forward:
        SeqIO.write(seqlist, f_fasta_forward, 'fasta')

    # ...and reverse (the records are modified in place)
    for seq in seqlist:
        seq.seq = seq.seq.reverse_complement()
    f_fasta_reverse_filename = 'tmp_reads_r.fas'
    with open(f_fasta_reverse_filename, 'w') as f_fasta_reverse:
        SeqIO.write(seqlist, f_fasta_reverse, 'fasta')

    print >> sys.stderr, 'Found', countreads, 'reads'

    if not os.path.isfile('tmp_align_f.needle'):
        print >> sys.stderr, 'needle forward'
        EmbossStandalone.needle(needle_exe, ref_genome,
                                f_fasta_forward_filename,
                                out='tmp_align_f.needle',
                                gapopen=6.0, gapext=3.0,
                                aglobal3='False', adesshow3='True')
    if not os.path.isfile('tmp_align_r.needle'):
        print >> sys.stderr, 'needle backward'
        EmbossStandalone.needle(needle_exe, ref_genome,
                                f_fasta_reverse_filename,
                                out='tmp_align_r.needle',
                                gapopen=6.0, gapext=3.0,
                                aglobal3='False', adesshow3='True')

    with open('tmp_align_f.needle') as f_forward:
        with open('tmp_align_r.needle') as f_reverse:
            forwardaligniter = Markx10Iterator(f_forward)
            reversealigniter = Markx10Iterator(f_reverse)
            while True:
                try:
                    f_align = forwardaligniter.next()
                    r_align = reversealigniter.next()
                except StopIteration:
                    break
                except Exception as err:
                    # Still best-effort (keep what was parsed so far), but
                    # no longer silent and no longer eating Ctrl-C.
                    print >> sys.stderr, \
                        'Warning: stopped parsing alignments:', err
                    break
                if f_align is None or r_align is None:
                    break

                this_id = f_align.get_all_seqs()[1].id
                assert this_id == r_align.get_all_seqs()[1].id, \
                    'same seq back and forward'

                forward_score = float(f_align._annotations['sw_score'])
                reverse_score = float(r_align._annotations['sw_score'])
                if forward_score > reverse_score:
                    best = f_align
                else:
                    best = r_align

                tmp = best.get_seq_by_num(1).tostring().upper()
                refseq = best.get_seq_by_num(0).tostring().upper()
                aligned_reads[this_id] = _merge_read_with_reference(refseq,
                                                                    tmp)
    return aligned_reads
def align_to_ref(al_exe, ref_file, reads_file, gen_length): """ Calls water standalone program to align reads to reference genome """ from pythonlib import EmbossStandalone import MyAlignIO import time max_read_length = 300 format = 'markx10' align_file = '%s.needle' % reads_file.rstrip('.fas') out_reads = {} cov_prof = [0] * (2 * gen_length + max_read_length) if not os.path.isfile(align_file): print 'Aligning reads via Needleman-Wunsch algorithm' EmbossStandalone.needle(al_exe, ref_file, reads_file, out=align_file, gapopen=go_default, gapext=ge_default, aglobal3='False') else: print 'The alignment file', align_file, 'is already present' statinfo = os.stat(align_file) age_sec = time.time() - statinfo.st_mtime if age_sec > 3600: print 'Warning: it was modified more than an hour ago' age = time.gmtime(age_sec) print 'If you want to run the alignment again, remove it' assert os.path.isfile(align_file), 'File %s not found' % align_file handle = open(align_file, 'rU') print 'Parsing alignment output' for alin in MyAlignIO.parse(handle, format): assert len(alin.get_all_seqs()) == 2, "Should be pairwise!" alength = int(alin.get_alignment_length()) # print 'Alignment is', alength, 'bases long' record = iter(alin) # These are the information of the query sequence, i.e. the reference query_rec = record.next() assert query_rec.name == 'query', 'This should be the query' qstart = int(query_rec.annotations['al_start']) qstop = int(query_rec.annotations['al_stop']) gaps_query = 0 qst = query_rec.seq.tostring() qls = list(qst) for c in qst.strip('-'): if c == '-': gaps_query = gaps_query + 1 # These are for the matching sequences, i.e. 
the reads match_rec = record.next() assert match_rec.name == 'match', 'This should be the match' mst = match_rec.seq.tostring() mls = list(mst) for c in mls: if c != '-': mstart = mls.index(c) + 1 break mstop = len(mst.rstrip('-')) # counts the gaps in the read (no flanking gaps) gaps_match = 0 for c in mst.strip('-'): if c == '-': gaps_match = gaps_match + 1 match_length = len(mst.strip('-')) if gaps_query + gaps_match > round(tolerance * match_length): # print 'too many indels,', (gaps_query + gaps_match) continue out_reads[match_rec.id] = [None, None, None, None, []] out_reads[ match_rec.id][0] = qstart # is this really useful at this time? out_reads[ match_rec.id][1] = qstop # is this really useful at this time? out_reads[match_rec.id][2] = mstart # this is out_reads[match_rec.id][3] = mstop # this too for i in range(mstart, mstop + 1): try: cov_prof[i] = cov_prof[i] + 1 except IndexError: print 'out of coverage', i this_q = qls[mstart - 1:mstop] this_m = list(mst.strip('-')) assert len(this_q) == len(this_m), 'Length must be the same %d %d' % ( len(this_q), len(this_m)) amb_calls = 0 # There are three possibilities: insertions, deletions, no in-dels for i in range(len(this_m)): if this_m[i] == '-' and this_q[i] != '-': out_reads[match_rec.id][4].append('-') if this_m[i] != '-' and this_q[i] == '-': pass if this_m[i] != '-' and this_q[i] != '-': out_reads[match_rec.id][4].append(this_m[i]) # This should never happen if this_m[i] == '-' and this_q[i] == '-': print 'Should this happen?' sys.exit() if this_m[i] == 'N': amb_calls = amb_calls + 1 if verbose: print >> sys.stderr, 'Found an N in', match_rec.id if amb_calls > amb_thresh: if verbose: print 'Read', match_rec.id, 'has too many Ns' del out_reads[match_rec.id] cp = open('./%s.covprof' % reads_file.rstrip('_reads.fas'), 'w') for i in range(1, gen_length): cp.write('%i\t%i\n' % (i, cov_prof[i])) cp.close() return out_reads