def _merge_sorts(self): iterators = [] for i in range(1, self.batch_number): extra = "_batch" + str(i) new_path = brenninc_utils.create_new_file(self.fastq_file, extra, outputdir=self.outputdir, gzipped=False) iterable = wrap_sequence(new_path) iterators.append(iterable) big = heapq.merge(*iterators) extra = "_sorted" new_path = brenninc_utils.create_new_file(self.fastq_file, extra, outputdir=self.outputdir, gzipped=False) print "writing to", new_path with open(new_path, 'w') as sorted_file: for wrapper in big: wrapper.sequence.write_to_fastq_file(sorted_file) print "done" for i in range(1, self.batch_number): extra = "_batch" + str(i) new_path = brenninc_utils.create_new_file(self.fastq_file, extra, outputdir=self.outputdir, gzipped=False) os.remove(new_path)
def count_reads_using_features(sam_filename, features, counts, samtype, order, stranded, overlap_mode, quiet, minaqual, samout, directory): if samtype is None: samtype = detect_sam_type(sam_filename) with open_sam(sam_filename, samtype) as (pe_mode, read_seq): with get_write_to_samout(samout, pe_mode) as write_to_samout: if stranded in ["yes", "no", "both"]: if samout == "auto": samout = brenninc_utils.create_new_file( sam_filename, "_forward_annotated", outputdir=directory, extension="sam", gzipped=False) forward_counter = iv_counter_factory(features, counts, write_to_samout, overlap_mode) else: forward_counter = None if stranded in ["reverse", "both"]: if samout == "auto": samout = brenninc_utils.create_new_file( sam_filename, "_reverse_annotated", outputdir=directory, extension="sam", gzipped=False) reverse_counter = iv_counter_factory(features, counts, write_to_samout, overlap_mode) else: reverse_counter = None if pe_mode: pass #count_reads_paired(read_seq, forward_counter, reverse_counter, order, # quiet, minaqual, write_to_samout ) else: count_reads_single(read_seq, forward_counter, order, quiet, minaqual) if forward_counter is not None: output = brenninc_utils.create_new_file(sam_filename, "_forward_count", outputdir=directory, extension="txt", gzipped=False) print "forward written to", output with open(output, "w") as output_file: forward_counter.results(output_file) if reverse_counter is not None: output = brenninc_utils.create_new_file(sam_filename, "_reverse_count", outputdir=directory, extension="txt", gzipped=False) print "reverse written to", output with open(output, "w") as output_file: reverse_counter.results(output_file)
def count_reads_using_features(sam_filename, features, counts, samtype, order, stranded, overlap_mode, quiet, minaqual, samout, directory ): if samtype is None: samtype = detect_sam_type(sam_filename) with open_sam(sam_filename, samtype) as (pe_mode, read_seq): with get_write_to_samout(samout, pe_mode) as write_to_samout: if stranded in ["yes", "no", "both"]: if samout == "auto": samout = brenninc_utils.create_new_file(sam_filename, "_forward_annotated", outputdir=directory, extension="sam", gzipped=False) forward_counter = iv_counter_factory(features, counts, write_to_samout, overlap_mode) else: forward_counter = None if stranded in ["reverse", "both"]: if samout == "auto": samout = brenninc_utils.create_new_file(sam_filename, "_reverse_annotated", outputdir=directory, extension="sam", gzipped=False) reverse_counter = iv_counter_factory(features, counts, write_to_samout, overlap_mode) else: reverse_counter = None if pe_mode: pass #count_reads_paired(read_seq, forward_counter, reverse_counter, order, # quiet, minaqual, write_to_samout ) else: count_reads_single(read_seq, forward_counter, order, quiet, minaqual) if forward_counter is not None: output = brenninc_utils.create_new_file(sam_filename, "_forward_count", outputdir=directory, extension="txt", gzipped=False) print "forward written to", output with open(output,"w") as output_file: forward_counter.results(output_file) if reverse_counter is not None: output = brenninc_utils.create_new_file(sam_filename, "_reverse_count", outputdir=directory, extension="txt", gzipped=False) print "reverse written to", output with open(output,"w") as output_file: reverse_counter.results(output_file)
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, directory): forward = stranded in ["yes", "both"] reverse = stranded in ["reverse", "both"] is_stranded = stranded != "NO" if samout != "" and stranded == "both": raise Exception("Output SAM alignment records not supported " "for stranded 'both'") if samtype == "bam": extensions = [".bam"] elif samtype == "sam": extensions = [".sam", ".sam.gz"] else: extensions = [".bam", ".sam", ".sam.gz"] files = brenninc_utils.find_files(sam_filename, extensions=extensions, recursive=True) features, counts = get_features(gff_filename, is_stranded, feature_type, id_attribute, quiet) for a_file in files: if samout == "auto": samout_file = brenninc_utils.create_new_file(sam_filename, "_annotated", outputdir=directory, extension="sam", gzipped=False) else: samout_file = samout print "counting", a_file count_reads(a_file, features, counts, samtype, order, forward, reverse, overlap_mode, quiet, minaqual, samout_file, directory)
def head(path, sequences=100, outputdir=None, qual_scale=_default_qual_scale):
    """Copy the first ``sequences`` records of a fastq file to a new file.

    The output path is built by brenninc_utils.create_new_file from ``path``
    with the suffix "_head<sequences>"; the input is read with
    HTSeq.FastqReader using ``qual_scale``.
    """
    head_path = brenninc_utils.create_new_file(
        path, "_head" + str(sequences), outputdir=outputdir, gzipped=False)
    reader = HTSeq.FastqReader(path, qual_scale)
    # islice stops after the requested number of records (or earlier when
    # the input is shorter).
    leading_records = itertools.islice(reader, sequences)
    with open(head_path, 'w') as head_file:
        for record in leading_records:
            record.write_to_fastq_file(head_file)
def write_output(counter, original_filename, direction, directory): if counter is None: return output = brenninc_utils.create_new_file(original_filename, "_" + direction + "_count", outputdir=directory, extension="txt", gzipped=False) print direction, "written to", output with open(output, "w") as output_file: counter.results(output_file)
def compare(original_fastq, trimmed_fastq, unpaired_fastq,
            outputdir=os.getcwd(), qual_scale=_default_qual_scale):
    """Compare an original fastq file with its trimmed and unpaired outputs.

    Three output paths are derived from ``original_fastq`` ("_shorter",
    "_dropped" and "_short_count").  A Comparer is run with the first two
    opened for writing and is then asked to print its shorter-read summary
    to the third path.

    NOTE: the ``outputdir`` default is os.getcwd() evaluated once, when this
    function is defined, not on each call.
    """
    def derived_path(suffix):
        # All outputs sit next to each other in outputdir, uncompressed.
        return brenninc_utils.create_new_file(
            original_fastq, suffix, outputdir=outputdir, gzipped=False)

    shorter_path = derived_path("_shorter")
    dropped_path = derived_path("_dropped")
    short_count_path = derived_path("_short_count")

    with open(shorter_path, "w") as shorter_file, \
            open(dropped_path, "w") as dropped_file:
        comparer = Comparer(original_fastq, trimmed_fastq, unpaired_fastq,
                            shorter_file, dropped_file,
                            outputdir=outputdir, qual_scale=qual_scale)
        comparer.do_compare()
        comparer.print_shorter(short_count_path)
def _sort_and_save(self, sequences): print "sorting", len(sequences) sequences.sort(key=lambda sequence: sequence.name) if self.batch_number == 0: extra = "_sorted" else: extra = "_batch" + str(self.batch_number) new_path = brenninc_utils.create_new_file(self.fastq_file, extra, outputdir=self.outputdir, gzipped=False) print "writing to", new_path, with open(new_path, 'w') as sorted_file: for sequence in sequences: sequence.write_to_fastq_file(sorted_file) print "done"
def count_reads(sam_filename, features, counts, samtype, order, forward, reverse, overlap_mode, quiet, minaqual, samout, directory): def write_to_samout(r, assignment): if samoutfile is None: return if not pe_mode: r = (r, ) for read in r: if read is not None: samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n") if samout != "": samoutfile = open(samout, "w") else: samoutfile = None if samtype is None: samtype = detect_sam_type(sam_filename) if samtype == "sam": SAM_or_BAM_Reader = HTSeq.SAM_Reader elif samtype == "bam": SAM_or_BAM_Reader = HTSeq.BAM_Reader else: raise ValueError("Unknown input format %s specified." % samtype) try: if sam_filename != "-": read_seq_file = SAM_or_BAM_Reader(sam_filename) read_seq = read_seq_file first_read = iter(read_seq).next() else: read_seq_file = SAM_or_BAM_Reader(sys.stdin) read_seq_iter = iter(read_seq_file) first_read = read_seq_iter.next() read_seq = itertools.chain([first_read], read_seq_iter) pe_mode = first_read.paired_end except: sys.stderr.write("Error occured when reading beginning " "of SAM/BAM file.\n") raise try: if pe_mode: if order == "name": read_seq = HTSeq.pair_SAM_alignments(read_seq) elif order == "pos": read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq) else: raise ValueError("Illegal order specified.") if forward: empty_forward = 0 ambiguous_forward = 0 counts_forward = copy.copy(counts) if reverse: empty_reverse = 0 ambiguous_reverse = 0 counts_reverse = copy.copy(counts) notaligned = 0 lowqual = 0 nonunique = 0 i = 0 for r in read_seq: if i > 0 and i % 100000 == 0 and not quiet: sys.stderr.write("%d SAM alignment record%s processed.\n" % (i, "s" if not pe_mode else " pairs")) i += 1 if not pe_mode: if not r.aligned: notaligned += 1 write_to_samout(r, "__not_aligned") continue try: if r.optional_field("NH") > 1: nonunique += 1 write_to_samout(r, "__alignment_not_unique") continue except KeyError: pass if r.aQual < minaqual: lowqual += 1 write_to_samout(r, 
"__too_low_aQual") continue if forward: iv_seq_for = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0) if reverse: iv_seq_rev = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0) else: if r[0] is not None and r[0].aligned: if forward: iv_seq_for = (co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0) if reverse: iv_seq_rev = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0) else: iv_seq_rev = tuple() iv_seq_for = tuple() if r[1] is not None and r[1].aligned: if forward: iv_seq_for = (itertools.chain( iv_seq_for, (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M" and co.size > 0))) if reverse: iv_seq_rev = itertools.chain( iv_seq_rev, (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0)) else: if (r[0] is None) or not (r[0].aligned): write_to_samout(r, "__not_aligned") notaligned += 1 continue try: if ((r[0] is not None and r[0].optional_field("NH") > 1) or (r[1] is not None and r[1].optional_field("NH") > 1)): nonunique += 1 write_to_samout(r, "__alignment_not_unique") continue except KeyError: pass if ((r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual)): lowqual += 1 write_to_samout(r, "__too_low_aQual") continue try: if overlap_mode == "union": if forward: fs_for = set() for iv in iv_seq_for: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): fs_for = fs_for.union(fs2) if reverse: fs_rev = set() for iv in iv_seq_rev: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): fs_rev = fs_rev.union(fs2) elif (overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty"): if forward: fs_for = None for iv in iv_seq_for: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): if len(fs2) > 0 or \ overlap_mode == "intersection-strict": if fs_for is None: fs_for = fs2.copy() else: fs_for 
= fs_for.intersection(fs2) if reverse: fs_reverse = None for iv in iv_seq_rev: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): if len(fs2) > 0 or \ overlap_mode == "intersection-strict": if fs_rev is None: fs_rev = fs2.copy() else: fs_rev = fs_rev.intersection(fs2) else: sys.exit("Illegal overlap mode.") if forward: if fs_for is None or len(fs_for) == 0: write_to_samout(r, "__no_feature") empty_forward += 1 elif len(fs_for) > 1: write_to_samout( r, "__ambiguous[" + '+'.join(fs_for) + "]") ambiguous_forward += 1 else: write_to_samout(r, list(fs_for)[0]) counts_forward[list(fs_for)[0]] += 1 if reverse: if fs_reverse is None or len(fs_rev) == 0: write_to_samout(r, "__no_feature") empty_reverse += 1 elif len(fs_reverse) > 1: write_to_samout( r, "__ambiguous[" + '+'.join(fs_rev) + "]") ambiguous_reverse += 1 else: write_to_samout(r, list(fs_rev)[0]) counts_reverse[list(fs_rev)[0]] += 1 except UnknownChrom: write_to_samout(r, "__no_feature") empty_forward += 1 empty_reverse += 1 except: sys.stderr.write("Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string()) raise if not quiet: sys.stderr.write( "%d SAM %s processed.\n" % (i, "alignments " if not pe_mode else "alignment pairs")) if samoutfile is not None: samoutfile.close() if forward: output = brenninc_utils.create_new_file(sam_filename, "_forward_count", outputdir=directory, extension="txt", gzipped=False) used_features_count = 0 used_features_sum = 0 print "Forward written to", output with open(output, "w") as output_file: for fn in sorted(counts_forward.keys()): output_file.write("%s\t%d\n" % (fn, counts_forward[fn])) used_features_count += 1 used_features_sum += counts_forward[fn] output_file.write("__no_feature\t%d\n" % empty_forward) output_file.write("__ambiguous\t%d\n" % ambiguous_forward) output_file.write("__too_low_aQual\t%d\n" % lowqual) output_file.write("__not_aligned\t%d\n" % notaligned) 
output_file.write("__alignment_not_unique\t%d\n" % nonunique) print "Forward features with alignment\t%d" % used_features_count print "Forward alignments asigned to feature\t%d" % used_features_sum print "__forward_no_feature\t%d" % empty_forward print "__forward_ambiguous\t%d" % ambiguous_forward if reverse: output = brenninc_utils.create_new_file(sam_filename, "_reverse_count", outputdir=directory, extension="txt", gzipped=False) used_features_count = 0 used_features_sum = 0 print "Reverse written to", output with open(output, "w") as output_file: for fn in sorted(counts_reverse.keys()): output.write("%s\t%d\n" % (fn, counts_reverse[fn])) used_features_count += 1 used_features_sum += counts_reverse[fn] output_file.write("__no_feature\t%d\n" % empty_reverse) output_file.write("__ambiguous\t%d\n" % ambiguous_reverse) output_file.write("__too_low_aQual\t%d\n" % lowqual) output_file.write("__not_aligned\t%d\n" % notaligned) output_file.write("__alignment_not_unique\t%d\n" % nonunique) print "Reverse features with alignment\t%d" % used_features_count print "Reverse alignments asigned to feature\t%d" % used_features_sum print "__reverse_no_feature\t%d" % empty_reverse print "__reverse_ambiguous\t%d" % ambiguous_reverse print "__too_low_aQual\t%d" % lowqual print "__not_aligned\t%d" % notaligned print "__alignment_not_unique\t%d" % nonunique
def count_reads(sam_filename, features, counts, samtype, order, forward, reverse, overlap_mode, quiet, minaqual, samout, directory): def write_to_samout(r, assignment): if samoutfile is None: return if not pe_mode: r = (r,) for read in r: if read is not None: samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n") if samout != "": samoutfile = open(samout, "w") else: samoutfile = None if samtype is None: samtype = detect_sam_type(sam_filename) if samtype == "sam": SAM_or_BAM_Reader = HTSeq.SAM_Reader elif samtype == "bam": SAM_or_BAM_Reader = HTSeq.BAM_Reader else: raise ValueError("Unknown input format %s specified." % samtype) try: if sam_filename != "-": read_seq_file = SAM_or_BAM_Reader(sam_filename) read_seq = read_seq_file first_read = iter(read_seq).next() else: read_seq_file = SAM_or_BAM_Reader(sys.stdin) read_seq_iter = iter(read_seq_file) first_read = read_seq_iter.next() read_seq = itertools.chain([first_read], read_seq_iter) pe_mode = first_read.paired_end except: sys.stderr.write("Error occured when reading beginning " "of SAM/BAM file.\n") raise try: if pe_mode: if order == "name": read_seq = HTSeq.pair_SAM_alignments(read_seq) elif order == "pos": read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq) else: raise ValueError("Illegal order specified.") if forward: empty_forward = 0 ambiguous_forward = 0 counts_forward = copy.copy(counts) if reverse: empty_reverse = 0 ambiguous_reverse = 0 counts_reverse = copy.copy(counts) notaligned = 0 lowqual = 0 nonunique = 0 i = 0 for r in read_seq: if i > 0 and i % 100000 == 0 and not quiet: sys.stderr.write("%d SAM alignment record%s processed.\n" % (i, "s" if not pe_mode else " pairs")) i += 1 if not pe_mode: if not r.aligned: notaligned += 1 write_to_samout(r, "__not_aligned") continue try: if r.optional_field("NH") > 1: nonunique += 1 write_to_samout(r, "__alignment_not_unique") continue except KeyError: pass if r.aQual < minaqual: lowqual += 1 write_to_samout(r, 
"__too_low_aQual") continue if forward: iv_seq_for = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0) if reverse: iv_seq_rev = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0) else: if r[0] is not None and r[0].aligned: if forward: iv_seq_for = (co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0) if reverse: iv_seq_rev = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0) else: iv_seq_rev = tuple() iv_seq_for = tuple() if r[1] is not None and r[1].aligned: if forward: iv_seq_for = (itertools.chain(iv_seq_for, (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M" and co.size > 0))) if reverse: iv_seq_rev = itertools.chain(iv_seq_rev, (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0)) else: if (r[0] is None) or not (r[0].aligned): write_to_samout(r, "__not_aligned") notaligned += 1 continue try: if ((r[0] is not None and r[0].optional_field("NH") > 1) or (r[1] is not None and r[1].optional_field("NH") > 1)): nonunique += 1 write_to_samout(r, "__alignment_not_unique") continue except KeyError: pass if ((r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual)): lowqual += 1 write_to_samout(r, "__too_low_aQual") continue try: if overlap_mode == "union": if forward: fs_for = set() for iv in iv_seq_for: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): fs_for = fs_for.union(fs2) if reverse: fs_rev = set() for iv in iv_seq_rev: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): fs_rev = fs_rev.union(fs2) elif (overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty"): if forward: fs_for = None for iv in iv_seq_for: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): if len(fs2) > 0 or \ overlap_mode == "intersection-strict": if fs_for is None: fs_for = fs2.copy() else: fs_for = 
fs_for.intersection(fs2) if reverse: fs_reverse = None for iv in iv_seq_rev: if iv.chrom not in features.chrom_vectors: raise UnknownChrom for iv2, fs2 in features[iv].steps(): if len(fs2) > 0 or \ overlap_mode == "intersection-strict": if fs_rev is None: fs_rev = fs2.copy() else: fs_rev = fs_rev.intersection(fs2) else: sys.exit("Illegal overlap mode.") if forward: if fs_for is None or len(fs_for) == 0: write_to_samout(r, "__no_feature") empty_forward += 1 elif len(fs_for) > 1: write_to_samout(r, "__ambiguous[" + '+'.join(fs_for) + "]") ambiguous_forward += 1 else: write_to_samout(r, list(fs_for)[0]) counts_forward[list(fs_for)[0]] += 1 if reverse: if fs_reverse is None or len(fs_rev) == 0: write_to_samout(r, "__no_feature") empty_reverse += 1 elif len(fs_reverse) > 1: write_to_samout(r, "__ambiguous[" + '+'.join(fs_rev) + "]") ambiguous_reverse += 1 else: write_to_samout(r, list(fs_rev)[0]) counts_reverse[list(fs_rev)[0]] += 1 except UnknownChrom: write_to_samout(r, "__no_feature") empty_forward += 1 empty_reverse += 1 except: sys.stderr.write("Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string()) raise if not quiet: sys.stderr.write("%d SAM %s processed.\n" % (i, "alignments " if not pe_mode else "alignment pairs")) if samoutfile is not None: samoutfile.close() if forward: output = brenninc_utils.create_new_file(sam_filename, "_forward_count", outputdir=directory, extension="txt", gzipped=False) used_features_count = 0 used_features_sum = 0 print "Forward written to", output with open(output, "w") as output_file: for fn in sorted(counts_forward.keys()): output_file.write("%s\t%d\n" % (fn, counts_forward[fn])) used_features_count += 1 used_features_sum += counts_forward[fn] output_file.write("__no_feature\t%d\n" % empty_forward) output_file.write("__ambiguous\t%d\n" % ambiguous_forward) output_file.write("__too_low_aQual\t%d\n" % lowqual) output_file.write("__not_aligned\t%d\n" % notaligned) 
output_file.write("__alignment_not_unique\t%d\n" % nonunique) print "Forward features with alignment\t%d" % used_features_count print "Forward alignments asigned to feature\t%d" % used_features_sum print "__forward_no_feature\t%d" % empty_forward print "__forward_ambiguous\t%d" % ambiguous_forward if reverse: output = brenninc_utils.create_new_file(sam_filename, "_reverse_count", outputdir=directory, extension="txt", gzipped=False) used_features_count = 0 used_features_sum = 0 print "Reverse written to", output with open(output, "w") as output_file: for fn in sorted(counts_reverse.keys()): output.write("%s\t%d\n" % (fn, counts_reverse[fn])) used_features_count += 1 used_features_sum += counts_reverse[fn] output_file.write("__no_feature\t%d\n" % empty_reverse) output_file.write("__ambiguous\t%d\n" % ambiguous_reverse) output_file.write("__too_low_aQual\t%d\n" % lowqual) output_file.write("__not_aligned\t%d\n" % notaligned) output_file.write("__alignment_not_unique\t%d\n" % nonunique) print "Reverse features with alignment\t%d" % used_features_count print "Reverse alignments asigned to feature\t%d" % used_features_sum print "__reverse_no_feature\t%d" % empty_reverse print "__reverse_ambiguous\t%d" % ambiguous_reverse print "__too_low_aQual\t%d" % lowqual print "__not_aligned\t%d" % notaligned print "__alignment_not_unique\t%d" % nonunique