def test_raise_exception(self):
    '''open_file_read() and open_file_write() should raise utils.Error when the file cannot be opened'''
    # Reading: a plain and a .gz filename, neither of which is expected to exist.
    for missing_file in ('this_file_is_not_here_so_throw_error',
                         'this_file_is_not_here_so_throw_error.gz'):
        with self.assertRaises(utils.Error):
            utils.open_file_read(missing_file)

    # Writing: the same two names, inside a directory that is expected not to exist.
    for basename in ('this_file_is_not_here_so_throw_error',
                     'this_file_is_not_here_so_throw_error.gz'):
        unwritable = os.path.join('not_a_directory', basename)
        with self.assertRaises(utils.Error):
            utils.open_file_write(unwritable)
def barplot_by_input_data(stat_to_plot, prefix, plot_width=21, plot_height=7):
    '''Write an R script that draws a bar plot of one statistic, then run it.

    One bar is made for every combination of test_data_types and scaffolders
    (both module-level), with the height taken from results and the colour
    from r_colours. The script is written to prefix + '.R' and handed to
    run_r_script().

    stat_to_plot -- key of the statistic to plot; also used as the y axis label
    prefix -- prefix of the R script filename
    plot_width, plot_height -- written into the png()/pdf() device calls
    '''
    bar_heights = []
    colours = []
    for t in test_data_types:
        for scaff in scaffolders:
            bar_heights.append(results[t].results[scaff][stat_to_plot])
            colours.append(r_colours[scaff])

    heights_r = ','.join(bar_heights)
    colours_r = ','.join(['"' + x + '"' for x in colours])

    r_script = prefix + '.R'
    f = utils.open_file_write(r_script)
    try:
        for device in ['png', 'pdf']:
            # NOTE(review): 'outprefix' is neither a parameter nor a local of
            # this function; unless a module-level 'outprefix' exists this
            # raises NameError. 'prefix' may have been intended - confirm.
            print(device + '("' + outprefix + '.' + device + '", width=',
                  plot_width, ', height=', plot_height, ')', file=f)
            print('barplot(c(' + heights_r, '), ',
                  ' ylab="', stat_to_plot, '", ',
                  'col=c(', colours_r, ') ',
                  ')', sep='', file=f)
            print('dev.off()', file=f)
    finally:
        # close the script even if writing fails part way through
        utils.close(f)

    run_r_script(r_script)
def plot_scatter(self, stat1, stat2, outprefix, legend=False, main=''):
    '''Write an R script that plots stat1 against stat2, then run it.

    One point is plotted for each entry of the module-level scaffolders whose
    (self.data_type, scaffolder) pair is not in bad_runs. Values are converted
    with int(). The script is written to outprefix + '.R' and draws the same
    plot to outprefix.pdf, outprefix.png and outprefix.svg.

    stat1, stat2 -- keys into self.results[scaffolder]; also the axis labels
    outprefix -- prefix of the R script and of the image files
    legend -- if True, the output of r_legend('topleft') is added to each plot
    main -- plot title
    '''
    good_scaffs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]
    x_coords = ','.join([str(int(self.results[s][stat1])) for s in good_scaffs])
    y_coords = ','.join([str(int(self.results[s][stat2])) for s in good_scaffs])
    # Only the "vector" forms of the symbols and colours are used here.
    r_syms_v, _, r_cols_v, _ = self.get_r_vectors()

    r_script = outprefix + '.R'
    f = utils.open_file_write(r_script)
    try:
        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=f)
            print('plot(c(' + x_coords, '), ',
                  'c(', y_coords, '), ',
                  'xlab="', stat1, '", ',
                  'ylab="', stat2, '", ',
                  'main="', main, '",',
                  'col=', r_cols_v, ', ',
                  'pch=', r_syms_v, ', ',
                  'bg=', r_cols_v,
                  ')', sep='', file=f)
            if legend:
                print(r_legend('topleft'), file=f)
            print('dev.off()', file=f)
    finally:
        # close the script even if writing fails part way through
        utils.close(f)

    run_r_script(r_script)
def interleave(infile_1, infile_2, outfile):
    '''Write the sequences of two files to one file, alternating between them.

    For each sequence read from infile_1, the next sequence from infile_2 is
    read and the pair is written to outfile (first file's sequence first).

    Raises Error if the two files do not contain the same number of sequences.
    The output file is closed in every case.
    '''
    seq_reader_1 = file_reader(infile_1)
    seq_reader_2 = file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except StopIteration:
                # Only "second file ran out" means a missing mate; any other
                # error from the reader is left to propagate unchanged.
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # Anything still left in the second file has no mate in the first.
        seq_2 = next(seq_reader_2, None)
        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
    finally:
        utils.close(f_out)
def barplot_of_one_stat_sorted(self, stat, outprefix, main='', stat2=None):
    '''Write an R script that draws a bar plot sorted by bar height, then run it.

    One bar is made for each entry of the module-level scaffolders whose
    (self.data_type, scaffolder) pair is not in bad_runs. Bars are sorted by
    (int(height), scaffolder name, colour). The script is written to
    outprefix + '.R' and draws to outprefix.pdf, outprefix.png and
    outprefix.svg.

    stat -- key into self.results[scaffolder]; also the y axis label
    outprefix -- prefix of the R script and of the image files
    main -- plot title
    stat2 -- optional second key; if given its value is added to that of stat
    '''
    # Filter once so heights, names and colours are guaranteed to line up.
    good_scaffs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]

    if stat2 is None:
        heights = [self.results[s][stat] for s in good_scaffs]
    else:
        # NOTE(review): '+' is applied to the stored values before int(); if
        # they are stored as strings this concatenates them instead of
        # summing - confirm the type held in self.results.
        heights = [self.results[s][stat] + self.results[s][stat2] for s in good_scaffs]

    all_data = sorted(zip([int(x) for x in heights],
                          good_scaffs,
                          [r_colours[s] for s in good_scaffs]))
    bar_heights = [str(height) for height, _, _ in all_data]
    names = 'c(' + ','.join(['"' + name + '"' for _, name, _ in all_data]) + ')'
    cols = ['"' + colour + '"' for _, _, colour in all_data]

    r_script = outprefix + '.R'
    f = utils.open_file_write(r_script)
    try:
        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=f)
            print('par(mar=c(10,4,4,2) + 0.1)', file=f)
            print('barplot(c(' + ','.join(bar_heights), '), ',
                  'names.arg=', names, ', ',
                  'main="', main, '",',
                  ' ylab="', stat, '", ',
                  'col=c(' + ','.join(cols) + '), ',
                  'las=2',
                  ')', sep='', file=f)
            print('dev.off()', file=f)
    finally:
        # close the script even if writing fails part way through
        utils.close(f)

    run_r_script(r_script)
def print_dict_as_tsv(d, filename):
    '''Write a dictionary of interval lists to a tab-separated file.

    d -- maps a name to an iterable of objects with 'start' and 'end' attributes
    filename -- name of the output file

    One line is written per interval: name, start + 1, end + 1
    (presumably converting 0-based coordinates to 1-based - confirm).
    '''
    f = utils.open_file_write(filename)
    try:
        for name, intervals in d.items():
            for interval in intervals:
                print(name, interval.start+1, interval.end+1, sep='\t', file=f)
    finally:
        # close the file even if an entry cannot be written
        utils.close(f)
def write_gff(self, filename):
    '''Write every mutation in self.mutations to filename, one to_gff() line each.

    Lines are written in sorted order of the keys of self.mutations
    (presumably reference name then position - confirm against how the
    keys are built).
    '''
    f = utils.open_file_write(filename)
    try:
        for key in sorted(self.mutations):
            print(self.mutations[key].to_gff(), file=f)
    finally:
        # close the file even if a mutation cannot be written
        utils.close(f)
def replace_bases(infile, outfile, old, new):
    '''Copy the sequences of infile to outfile after calling
    seq.replace_bases(old, new) on each of them.

    The output file is closed even if reading or writing fails.
    '''
    seq_reader = file_reader(infile)
    f_out = utils.open_file_write(outfile)
    try:
        for seq in seq_reader:
            seq.replace_bases(old, new)
            print(seq, file=f_out)
    finally:
        utils.close(f_out)
def reverse_complement(infile, outfile):
    '''Copy the sequences of infile to outfile after calling seq.revcomp()
    on each of them.

    The output file is closed even if reading or writing fails.
    '''
    seq_reader = file_reader(infile)
    fout = utils.open_file_write(outfile)
    try:
        for seq in seq_reader:
            seq.revcomp()
            print(seq, file=fout)
    finally:
        utils.close(fout)
def test_file_reader_mpileup(self):
    '''Printing every record of a pileup file should reproduce that file'''
    source = 'mpileup_unittest.mpileup'
    copy_name = 'tmp.mpileup'

    out_handle = utils.open_file_write(copy_name)
    # Records are printed one at a time, as they come out of the reader.
    for record in mpileup.file_reader(source):
        print(record, file=out_handle)
    utils.close(out_handle)

    files_match = filecmp.cmp(source, copy_name)
    self.assertTrue(files_match)
    os.unlink(copy_name)
def fastn_to_quasr_primers(infile, outfile):
    '''Write each sequence of infile and its reverse complement to outfile.

    One tab-separated line is written per sequence: the sequence string,
    then the sequence string of a copy on which revcomp() was called.
    The output file is closed even if reading or writing fails.
    '''
    seq_reader = file_reader(infile)
    f_out = utils.open_file_write(outfile)
    try:
        for seq in seq_reader:
            # revcomp() is applied to a copy so the forward sequence stays available
            seq2 = copy.copy(seq)
            seq2.revcomp()
            print(seq.seq, seq2.seq, sep='\t', file=f_out)
    finally:
        utils.close(f_out)
def test_file_reader(self):
    '''Printing every hit of a nucmer coords file should give the expected output file'''
    expected = 'nucmer_unittest.coords.out'
    produced = 'nucmer_unittest.coords.tmp'

    out_handle = utils.open_file_write(produced)
    # Hits are printed one at a time, as they come out of the reader.
    for nucmer_hit in nucmer.file_reader('nucmer_unittest.coords'):
        print(nucmer_hit, file=out_handle)
    utils.close(out_handle)

    files_match = filecmp.cmp(expected, produced)
    self.assertTrue(files_match)
    os.unlink(produced)
def trim(infile, outfile, start, end):
    '''Copy the sequences of infile to outfile after calling
    seq.trim(start, end) on each of them.

    Sequences whose length is zero after trimming are not written.
    The output file is closed even if reading or writing fails.
    '''
    seq_reader = file_reader(infile)
    fout = utils.open_file_write(outfile)
    try:
        for seq in seq_reader:
            seq.trim(start, end)
            if len(seq) > 0:
                print(seq, file=fout)
    finally:
        utils.close(fout)
def test_file_reader_sam(self):
    '''Printing every record read from a BAM file should give the expected SAM file'''
    expected = 'sam_unittest.sam'
    produced = 'tmp.sam'

    out_handle = utils.open_file_write(produced)
    # Records are printed one at a time, as they come out of the reader.
    for record in sam.file_reader('sam_unittest.bam'):
        print(record, file=out_handle)
    utils.close(out_handle)

    files_match = filecmp.cmp(expected, produced)
    self.assertTrue(files_match)
    os.unlink(produced)
def test_file_reader(self):
    '''Printing every hit of a blast file should reproduce that file'''
    source = 'blast_unittest.m8'
    produced = 'blast_unittest.m8.tmp'

    # NOTE(review): the loop variable is never used - both passes read and
    # compare against 'blast_unittest.m8', so 'blast_unittest.m8.with_lengths'
    # is never exercised. Confirm whether the second fixture should be read.
    for _unused_fixture in ['blast_unittest.m8', 'blast_unittest.m8.with_lengths']:
        hits = blast.file_reader('blast_unittest.m8')
        out_handle = utils.open_file_write(produced)
        # Hits are printed one at a time, as they come out of the reader.
        for blast_hit in hits:
            print(blast_hit, file=out_handle)
        utils.close(out_handle)

        files_match = filecmp.cmp(source, produced)
        self.assertTrue(files_match)
        os.unlink(produced)
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Split an interleaved file of sequence pairs into two files.

    Sequences are read two at a time from infile: the first of each pair is
    written to outfile_1 and the second to outfile_2.

    fasta_out -- if True, each sequence is written as Fasta(seq.id, seq.seq)
                 instead of being printed as read

    Raises Error if infile holds an odd number of sequences.
    Both output files are closed in every case.
    '''
    seq_reader = file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    try:
        f_2 = utils.open_file_write(outfile_2)
        try:
            for seq in seq_reader:
                # The first mate must be written before the reader is advanced.
                print(Fasta(seq.id, seq.seq) if fasta_out else seq, file=f_1)

                try:
                    # Use the record returned by next(). The previous code
                    # printed 'seq' again, which is only the second mate if
                    # the reader reuses and mutates one object - TODO confirm
                    # against file_reader.
                    mate = next(seq_reader)
                except StopIteration:
                    raise Error('Error getting mate for sequence. Cannot continue')

                print(Fasta(mate.id, mate.seq) if fasta_out else mate, file=f_2)
        finally:
            utils.close(f_2)
    finally:
        utils.close(f_1)
def add_sequence_lengths(infile, ref_fai, qry_fai, outfile):
    '''Copy the hits of infile to outfile with sequence lengths added.

    ref_fai, qry_fai -- files passed to fastn.lengths_from_fai() to fill the
                        reference and query length dictionaries
    Each hit gets add_sequence_lengths(ref_lengths, qry_lengths) called on it
    before it is printed. The output file is closed even if a hit fails.
    '''
    ref_lengths = {}
    qry_lengths = {}
    # lengths_from_fai() fills the dictionary it is given
    fastn.lengths_from_fai(ref_fai, ref_lengths)
    fastn.lengths_from_fai(qry_fai, qry_lengths)

    f = utils.open_file_write(outfile)
    try:
        for hit in file_reader(infile):
            hit.add_sequence_lengths(ref_lengths, qry_lengths)
            print(hit, file=f)
    finally:
        utils.close(f)
def add_sequence_lengths(infile, ref_fai, qry_fai, outfile):
    '''Copy the hits of infile to outfile with sequence lengths added.

    ref_fai, qry_fai -- files passed to fastn.lengths_from_fai() to fill the
                        reference and query length dictionaries
    Each hit gets add_sequence_lengths(ref_lengths, qry_lengths) called on it
    before it is printed. The output file is closed even if a hit fails.
    '''
    ref_lengths = {}
    qry_lengths = {}
    # lengths_from_fai() fills the dictionary it is given
    fastn.lengths_from_fai(ref_fai, ref_lengths)
    fastn.lengths_from_fai(qry_fai, qry_lengths)

    f = utils.open_file_write(outfile)
    try:
        for hit in file_reader(infile):
            hit.add_sequence_lengths(ref_lengths, qry_lengths)
            print(hit, file=f)
    finally:
        utils.close(f)
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by
    number of bases.

    Puts <= max_bases in each split file. The exception is that a single
    sequence longer than max_bases is put in its own file. This does not
    split sequences. Output files are named outfiles_prefix.1,
    outfiles_prefix.2, ...

    max_seqs -- optional limit on the number of sequences per split file
                (None means no limit)

    No output file is made if infile has no sequences.
    '''
    seq_reader = file_reader(infile)
    base_count = 0
    seq_count = 0
    file_count = 1
    fout = None   # currently open split file, or None if there is none

    if max_seqs is None:
        max_seqs = float('inf')

    try:
        for seq in seq_reader:
            seq_length = len(seq)

            # The open file is finished once this sequence would overflow it.
            # "Is a file open" is tracked by fout itself, not by
            # base_count == 0, so zero-length sequences cannot leak a handle.
            if fout is not None and (base_count + seq_length > max_bases or seq_count >= max_seqs):
                utils.close(fout)
                fout = None

            if fout is None:
                fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0
                seq_count = 0

            print(seq, file=fout)
            base_count += seq_length
            seq_count += 1
    finally:
        # fout is still None if the input was empty
        if fout is not None:
            utils.close(fout)
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Combine a fasta file and a quality scores file into one fastq file.

    Sequences and quality records are read in step. The quality record's
    whitespace-separated scores are converted to integers and passed to
    seq.to_Fastq(), and the result is written to outfile.

    Raises Error if the names of a sequence and its quality record differ,
    or if the quality file has fewer records than the fasta file.
    The output file is closed in every case.
    '''
    fa_reader = file_reader(fasta_in)
    qual_reader = file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in fa_reader:
            try:
                qual = next(qual_reader)
            except StopIteration:
                # report a short qual file as an Error, not a bare StopIteration
                raise Error('No quality scores found in qual file for sequence', seq.id)

            if seq.id != qual.id:
                raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

            scores = [int(x) for x in qual.seq.split()]
            print(seq.to_Fastq(scores), file=f_out)
    finally:
        utils.close(f_out)
def make_tags_file(filename, seqs_for_tagging, tag_length):
    '''Make a tag from every sequence, write the tags to a file and return them.

    filename -- name of the file the tags are written to, via tag.to_fasta()
    seqs_for_tagging -- dictionary whose values are the sequences to tag
    tag_length -- passed to make_tag_from_fastn()

    Returns a dictionary of sequence id -> copy of the tag of that sequence.
    If a tag cannot be written, a message is printed and the program exits
    with status 1. The file is closed in every case.
    '''
    tags = {}
    f = utils.open_file_write(filename)

    try:
        # Only the values are needed: tags are keyed on seq.id, not the dict key.
        for seq in seqs_for_tagging.values():
            tag = make_tag_from_fastn(seq, tag_length)

            try:
                print(str(tag.to_fasta()), file=f)
            except Exception:
                # 'Exception' rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not reported as a tag error
                print('Error! str(tag.to_fasta()), tag=', tag)
                sys.exit(1)

            tags[seq.id] = copy.copy(tag)
    finally:
        utils.close(f)

    return tags
def test_write_and_read(self):
    '''A file written with open_file_write() should read back the same with open_file_read(), whether gzipped or not'''
    for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
        # Write the numbers 0, 1, 2, one per line.
        out_handle = utils.open_file_write(filename)
        for number in (0, 1, 2):
            print(number, file=out_handle)
        utils.close(out_handle)

        # Each line read back must hold its own zero-based line number.
        in_handle = utils.open_file_read(filename)
        for expected, line in enumerate(in_handle):
            self.assertEqual(expected, int(line.strip()))
        utils.close(in_handle)

        os.unlink(filename)
def fastq_to_mira_xml(infile, outfile):
    '''Write an XML <trace_volume> file with one <trace> entry per sequence of infile.

    Each entry holds the sequence id as the trace name, the sequence length
    as clip_quality_right and a fixed clip_vector_left of 1.
    The output file is closed even if reading or writing fails.
    '''
    seq_reader = file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=fout)

        for seq in seq_reader:
            print(' <trace>',
                  ' <trace_name>' + seq.id + '</trace_name>',
                  ' <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
                  ' <clip_vector_left>1</clip_vector_left>',
                  ' </trace>', sep='\n', file=fout)

        print('</trace_volume>', file=fout)
    finally:
        utils.close(fout)
def barplot_beside(self, stat1, stat2, y_label, outprefix, main=''):
    '''Write an R script that draws two statistics as side-by-side bars, then run it.

    One pair of bars is made for each entry of the module-level scaffolders
    whose (self.data_type, scaffolder) pair is not in bad_runs. Pairs are
    sorted by int(value of stat1). The script is written to outprefix + '.R'
    and draws to outprefix.pdf, outprefix.png and outprefix.svg.

    stat1, stat2 -- keys into self.results[scaffolder]; the stored values are
                    joined with ',' as they are, so they must be strings
    y_label -- y axis label
    outprefix -- prefix of the R script and of the image files
    main -- plot title
    '''
    # Only the colour vector is used from get_r_vectors().
    _, _, r_cols_v, _ = self.get_r_vectors()

    # Build each row from the scaffolder itself. Indexing pre-filtered height
    # lists with positions in the unfiltered scaffolders list dropped or
    # mislabelled bars whenever a run was in bad_runs.
    good_scaffs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]
    all_data = sorted([(int(self.results[s][stat1]),
                        self.results[s][stat1],
                        self.results[s][stat2],
                        s) for s in good_scaffs])

    bar_heights1 = [row[1] for row in all_data]
    bar_heights2 = [row[2] for row in all_data]
    names = 'c(' + ','.join(['"' + row[3] + '"' for row in all_data]) + ')'

    r_script = outprefix + '.R'
    f = utils.open_file_write(r_script)
    try:
        # R helper definition, one statement per line so that it parses in R.
        print('',
              'col2trans = function(colour) {',
              '    x=as.vector(col2rgb(colour))',
              '    return(rgb(x[1],x[2],x[3],20,maxColorValue=255))',
              '}',
              '', sep='\n', file=f)
        print('cols=', r_cols_v, file=f)
        print('trans_cols=sapply(cols, col2trans)', file=f)

        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=f)
            print('par(mar=c(10,4,4,2) + 0.1)', file=f)
            print('bar_heights1 = c(' + ','.join(bar_heights1) + ')',
                  'bar_heights2 = c(' + ','.join(bar_heights2) + ')',
                  'bar_heights = t(as.matrix(data.frame(bar_heights1, bar_heights2)))',
                  sep='\n', file=f)
            print('barplot(bar_heights, names.arg=', names, ', ',
                  ' ylab="', y_label, '", ',
                  'col=c("black", "gray"), ',
                  'beside=T,',
                  'main="', main, '",',
                  'las=2)', sep='', file=f)
            print('dev.off()', file=f)
    finally:
        # close the script even if writing fails part way through
        utils.close(f)

    run_r_script(r_script)
def test_print_line_length(self):
    '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
    # (value of Fasta.line_length, file the output must be identical to)
    cases = [
        (0, 'fastn_unittest_one-per-line.fa'),
        (3, 'fastn_unittest_3-per-line.fa'),
    ]
    tmp_out = 'tmp.line_length_test.fa'

    try:
        for line_length, correct_file in cases:
            seq_reader = fastn.file_reader('fastn_unittest_one-per-line.fa')
            fastn.Fasta.line_length = line_length

            f = utils.open_file_write(tmp_out)
            try:
                for s in seq_reader:
                    print(s, file=f)
            finally:
                utils.close(f)

            self.assertTrue(filecmp.cmp(correct_file, tmp_out))
            os.unlink(tmp_out)
    finally:
        # line_length is class-level state shared with every other test, so
        # put it back even when an assertion above fails. 60 is the value the
        # test has always reset to (presumably the class default - confirm).
        fastn.Fasta.line_length = 60
def barplot_of_one_stat_coloured(self, stat, outprefix, main=''):
    '''Write an R script that draws a coloured bar plot of one statistic, then run it.

    One bar is made, in the order of the module-level scaffolders, for each
    scaffolder whose (self.data_type, scaffolder) pair is not in bad_runs.
    Colours come from self.get_r_vectors(). The script is written to
    outprefix + '.R' and draws to outprefix.pdf, outprefix.png and
    outprefix.svg.

    stat -- key into self.results[scaffolder]; also the y axis label. The
            stored values are joined with ',' as they are, so they must be
            strings
    outprefix -- prefix of the R script and of the image files
    main -- plot title
    '''
    # Only the colour vector is used from get_r_vectors().
    _, _, r_cols_v, _ = self.get_r_vectors()

    # Filter once so that heights and names are guaranteed to line up.
    good_scaffs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]
    bar_heights = [self.results[s][stat] for s in good_scaffs]
    names = 'c(' + ','.join(['"' + s + '"' for s in good_scaffs]) + ')'

    r_script = outprefix + '.R'
    f = utils.open_file_write(r_script)
    try:
        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=f)
            # NOTE(review): the call is emitted as '... col=<colours>, )',
            # with a trailing comma before the closing bracket - confirm
            # that R accepts the empty final argument.
            print('barplot(c(' + ','.join(bar_heights), '), ',
                  'names.arg=', names, ', ',
                  'main="', main, '",',
                  ' ylab="', stat, '", ',
                  'col=', r_cols_v, ', ',
                  ')', sep='', file=f)
            print('dev.off()', file=f)
    finally:
        # close the script even if writing fails part way through
        utils.close(f)

    run_r_script(r_script)
#!/usr/bin/env python3.3

# Writes the ID of every sequence in a fasta/q file to an output file,
# one ID per line.

import argparse
import fastn
import utils

parser = argparse.ArgumentParser(
    description='Gets all IDs from a fasta or fastq file',
    usage='%(prog)s <infile> <outfile>')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.infile)
f_out = utils.open_file_write(options.outfile)

try:
    for seq in seq_reader:
        print(seq.id, file=f_out)
finally:
    # close the output even if the input cannot be read to the end
    utils.close(f_out)
hit_start = sam_record.cigar.operations[0].number if sam_record.cigar.operations[-1].operator == 'S': hit_end = len(sam_record.seq) - sam_record.cigar.operations[-1].number if sam_record.id not in read_hit_coords: read_hit_coords[sam_record.id] = [] read_hit_coords[sam_record.id].append(genome_intervals.Interval(hit_start - 1, hit_end - 1)) external_progs.bwa_index_clean(bwa_index) os.unlink(bwa_sam) seq_reader = fastn.file_reader(options.reads_in) f_fa = utils.open_file_write(options.outprefix + '.fq') f_log = utils.open_file_write(options.outprefix + '.log') for seq in seq_reader: if seq.id not in read_hit_coords: print(seq, file=f_fa) print(seq.id, 'no hit', sep='\t', file=f_log) else: hits = read_hit_coords[seq.id] genome_intervals.merge_overlapping_in_list(hits) i = 0 while i < len(hits) - 1: if hits[i+1].start - hits[i].end <= options.join_distance: hits[i] = hits[i].union_fill_gap(hits[i+1]) hits.pop(i+1)
if qry_index is not None and ref_index is not None: clusters[qry_index].add(hit.ref_name) if qry_index != ref_index: clusters[qry_index].update(clusters[ref_index]) clusters.pop(ref_index) elif qry_index is not None: clusters[qry_index].add(hit.ref_name) elif ref_index is not None: clusters[ref_index].add(hit.qry_name) else: clusters.append(set([hit.qry_name, hit.ref_name])) # print clusters ordered by size of cluster clusters.sort(key=len, reverse=True) f = utils.open_file_write(options.outfile) for i in range(len(clusters)): for x in clusters[i]: unused_seqs.discard(x) print(i + 1, x, strands[x], sep='\t', file=f) counter = -1 for x in sorted(unused_seqs): print(counter, x, sep='\t', file=f) counter -= 1 utils.close(f) assembled_seqs = []
utils.syscall(external_progs.bowtie2_align + ' -f -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile) utils.syscall('samtools view -T ' + options.scaffolds_fa + ' -bS ' + samfile + ' > ' + bamfile) os.unlink(samfile) utils.syscall('samtools sort ' + bamfile + ' ' + sorted_bamfile[0:-4]) #os.unlink(bamfile) # Load the hits into memory previous_sam = None previous_tag = None sam_reader = sam.file_reader(sorted_bamfile) flag_counts = {k: 0 for k in [0, 1, 2, 4, 5, 8, 12, 16]} tags_from_bam = set() tag_distances = [] f_log = utils.open_file_write(options.outprefix + '.log') f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz') skipped_tags = 0 for current_sam in sam_reader: if current_sam.is_mapped(): tags_from_bam.add(current_sam.id) if current_sam.tags['AS'][1] != 0: print('Nonzero alignemnt score', current_sam, file=f_log) if 'XS' in current_sam.tags and current_sam.tags['XS'][ 1] >= current_sam.tags['AS'][1]: print('Non-unique best hit', current_sam, file=f_log) else: print('Unmapped', current_sam, file=f_log) if previous_sam is None:
print(scaff) scaff_dir = options.dir_in + '/' + scaff results[scaff]['flag_counts'] = get_scaff_results(scaff_dir) bsub_outfile = scaff_dir + '/' + scaff.split('.')[0].lower() + '.o' bsub_out = utils.syscall_get_stdout('bsub-out2stats.py -s ' + bsub_outfile) assert len(bsub_out) == 1 (attempt_no, exit_code, wall_hrs, cpu_secs, cpu_hrs, mem, swap, filename) = bsub_out[0].split('\t') assert exit_code == '0' results[scaff]['CPU'] = int(round(float(cpu_secs), 0)) results[scaff]['mem'] = mem # make a tsv file of all the stats f = utils.open_file_write(options.outprefix + '.stats.tsv') print('Scaffolder', 'Good joins', '\t'.join([str(x) for x in possible_flags if x not in [0, 16]]), 'Bad joins', 'Total joins', '% correct joins', 'Lost tags', 'Skipped tags', 'CPU', 'Mem', 'Extra CPU', 'Extra Mem', sep='\t', file=f)