def bam_count(args):
    """Print per-gene read-pair counts for the SAM file ``args.fi``.

    Mates are paired with ``HTSeq.pair_SAM_alignments_with_buffer``.  For each
    accepted pair the gene ids found under both mates in ``exons`` are merged;
    the pair is then counted for that gene (exactly one id), as
    ``_no_feature`` (no id) or as ``_ambiguous`` (several ids).  Pairs in
    which either mate is missing or unaligned are counted as ``_unmapped``.
    One ``name<TAB>count`` line per key is printed to stdout.

    Args:
        args: object exposing ``fi``, the path handed to ``HTSeq.SAM_Reader``.
    """
    # NOTE(review): ``exons`` is never assigned here (the line that would build
    # it from ``args.fg`` is disabled below), so it is presumably a
    # module-level GenomicArrayOfSets -- confirm.
    bam = HTSeq.SAM_Reader(args.fi)
    #exons = htseq_read_gtf(args.fg)
    cnts = collections.Counter()
    for bundle in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # NOTE(review): this assumes each item is a list of candidate pairs;
        # if the pairing helper yields plain (first, second) tuples instead,
        # this test rejects every item -- verify against the HTSeq docs.
        if len(bundle) != 1:
            continue
        aln1, aln2 = bundle[0]
        # Both mates are looked up below, so both must be present and aligned.
        # (The previous test, ``not a.aligned and b.aligned``, let a pair with
        # an unaligned second mate fall through to the lookup.)
        if aln1 is None or aln2 is None or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for _, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            cnts[next(iter(gids))] += 1
        elif not gids:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
def bam_count(args):
    """Print per-gene read-pair counts for the SAM file ``args.fi``.

    Mates are paired with ``HTSeq.pair_SAM_alignments_with_buffer``.  For each
    accepted pair the gene ids found under both mates in ``exons`` are merged;
    the pair is then counted for that gene (exactly one id), as
    ``_no_feature`` (no id) or as ``_ambiguous`` (several ids).  Pairs in
    which either mate is missing or unaligned are counted as ``_unmapped``.
    One ``name<TAB>count`` line per key is printed to stdout.

    Args:
        args: object exposing ``fi``, the path handed to ``HTSeq.SAM_Reader``.
    """
    # NOTE(review): ``exons`` is never assigned here (the line that would build
    # it from ``args.fg`` is disabled below), so it is presumably a
    # module-level GenomicArrayOfSets -- confirm.
    bam = HTSeq.SAM_Reader(args.fi)
    #exons = htseq_read_gtf(args.fg)
    cnts = collections.Counter()
    for bundle in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # NOTE(review): this assumes each item is a list of candidate pairs;
        # if the pairing helper yields plain (first, second) tuples instead,
        # this test rejects every item -- verify against the HTSeq docs.
        if len(bundle) != 1:
            continue
        aln1, aln2 = bundle[0]
        # Both mates are looked up below, so both must be present and aligned.
        # (The previous test, ``not a.aligned and b.aligned``, let a pair with
        # an unaligned second mate fall through to the lookup.)
        if aln1 is None or aln2 is None or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for _, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            cnts[next(iter(gids))] += 1
        elif not gids:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
def Get_label_information(label, annot, bam_reader):
    """Collect feature, coverage and read-count data for one annotation label.

    Args:
        label: key into ``annot``.
        annot: mapping from label to a non-empty list of 9-tuples
            ``(feature, rank, chrom, start, end, strand, length,
            exon_rank_left, exon_rank_right)``.
        bam_reader: reader exposing ``fetch()`` and ``fetch(region=...)``.

    Returns:
        ``(gas, ga, gene_count)`` where ``gas`` is an unstranded
        ``GenomicArrayOfSets`` holding ``(feature, rank)`` per annotated
        interval, ``ga`` is an unstranded integer ``GenomicArray`` incremented
        once for every counted aligned block, and ``gene_count`` maps
        ``(feature, rank)`` to the number of reads (or pairs) assigned to it.

    Reads are fetched from the annotated span widened by 500 on each side.
    Unaligned and multi-mapped (``NH`` > 1) reads are skipped; in paired-end
    mode pairs with a mate below ``minaqual`` are skipped as well.  A read
    touching any ``'intron'`` feature is counted for those introns only;
    otherwise it is counted for every feature it touches.

    NOTE(review): ``minaqual``, ``cigar_char`` and ``invert_strand`` are not
    defined in this function and are presumably module-level names -- confirm.
    """
    # Process-wide side effect: silences every warning from here on.
    warnings.simplefilter("ignore")
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    records = annot[label]
    for (feature, rank, chrom, start, end, strand, length, exon_rank_left,
         exon_rank_right) in records:
        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        gas[iv] += (feature, rank)
        gene_count[(feature, rank)] = 0

    boundary_left = min(rec[3] for rec in records)
    boundary_right = max(rec[4] for rec in records)
    region_fetch = records[0][2] + ":" + str(
        int(boundary_left) - 500) + "-" + str(int(boundary_right) + 500)
    read_seq = bam_reader.fetch(region=region_fetch)

    # Peek at the first record of the whole file to tell SE from PE data; an
    # empty file is treated as single-end (there is nothing to iterate).
    first_read = next(iter(bam_reader.fetch()), None)
    pe_mode = first_read is not None and first_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    def multi_mapped(aln):
        # A record without an NH tag is treated as uniquely mapped.
        try:
            return aln.optional_field('NH') > 1
        except KeyError:
            return False

    def aligned_ivs(aln, flip):
        # Same CIGAR filter for SE and PE reads (SE used to accept "M" only).
        if flip:
            return (invert_strand(cigop.ref_iv) for cigop in aln.cigar
                    if cigop.type in cigar_char and cigop.size > 0)
        return (cigop.ref_iv for cigop in aln.cigar
                if cigop.type in cigar_char and cigop.size > 0)

    for a in read_seq:
        if not pe_mode:
            if not a.aligned:
                continue
            if multi_mapped(a):
                continue
            iv_seq = aligned_ivs(a, False)
        else:
            if ((a[0] and a[0].aQual < minaqual) or
                    (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and multi_mapped(a[0])) or
                    (a[1] and multi_mapped(a[1]))):
                continue
            if a[0] is not None and a[0].aligned:
                iv_seq = aligned_ivs(a[0], False)
            else:
                iv_seq = tuple()
            if a[1] is not None and a[1].aligned:
                # The second mate is strand-inverted before lookup.
                iv_seq = itertools.chain(iv_seq, aligned_ivs(a[1], True))

        feature_aligned = set()
        for iv in iv_seq:
            for iv2, val2 in gas[iv].steps():
                feature_aligned |= val2
            ga[iv] += 1  # for calculating coverage
        if len(feature_aligned) == 0:
            continue
        # Intron hits take precedence over exon hits.
        introns = [item for item in feature_aligned if item[0] == 'intron']
        for f in (introns or feature_aligned):
            gene_count[f] += 1
    return gas, ga, gene_count
def _calc_insert_size(regions_path: str, bam_paths: List[str], out_path: str, mc: MessageCenter):
    """Write the insert sizes of read pairs lying inside each listed region.

    Every row of ``regions_path`` is tab-separated with chromosome, start and
    end in its first three columns.  For each row, all BAM files are queried
    for that interval and one line is written to ``out_path``:
    ``chrom:start-end(length)<TAB>size;size;...``.

    A pair contributes only when both mates are present, neither is flagged
    as non-primary, duplicate or QC-failed, and both lie fully inside the
    region.

    Raises:
        PEUtilPathError: ``regions_path`` or one of ``bam_paths`` is missing.
        PEUtilParseError: a region row cannot be parsed.
    """
    mc.log_debug('regions_path: {}'.format(regions_path))
    mc.log_debug('bam_paths: {}'.format(', '.join(bam_paths)))
    mc.log_debug('out_path: {}'.format(out_path))
    mc.handle_progress('Collecting insert sizes...')

    if not os.path.exists(regions_path):
        raise PEUtilPathError(regions_path, 'File not exists.')

    readers = []
    for path in bam_paths:
        if not os.path.exists(path):
            raise PEUtilPathError(path, 'File not exists.')
        readers.append(HTSeq.BAM_Reader(path))

    def mate_usable(mate, lo, hi):
        # Invariants first, then flag filter, then containment in [lo, hi].
        assert mate.aligned
        assert mate.iv.start < mate.iv.end
        if (mate.not_primary_alignment or mate.pcr_or_optical_duplicate
                or mate.failed_platform_qc):
            return False
        return not ((mate.iv.start < lo) or (hi < mate.iv.end))

    with open(regions_path) as regions, open(out_path, 'w') as sink:
        for index, line in enumerate(regions):
            if index != 0 and index % 1000 == 0:
                mc.handle_progress('{} regions processed...'.format(index))
            fields = line.strip('\n').split('\t')
            try:
                chrom, start, end = fields[:3]
                start, end = int(start), int(end)
            except ValueError:
                raise PEUtilParseError(regions_path, 'Incorrect file format.')

            sizes = []
            for reader in readers:
                hits = reader.fetch(chrom, start, end)
                for first, second in HTSeq.pair_SAM_alignments_with_buffer(hits):
                    if first is None or second is None:
                        continue
                    # all() stops at the first rejected mate, like a break.
                    if not all(mate_usable(m, start, end) for m in (first, second)):
                        continue
                    sizes.append(str(np.abs(first.inferred_insert_size)))

            sink.write('{0}:{1}-{2}({3})\t{4}\n'.format(
                chrom, start, end, end - start, ';'.join(sizes)))
def Get_Skipstart_dict(region_fetch, all_bamfiles, strand):
    """Count skipped-region (CIGAR ``N``) boundaries seen in a region.

    Args:
        region_fetch: region string passed to ``BAM_Reader.fetch(region=...)``.
        all_bamfiles: iterable of BAM file paths.
        strand: ``"+"`` records the start of each ``N`` operation; any other
            value records its end.

    Returns:
        dict mapping reference position to the number of times it was seen
        across all files.

    Unaligned and multi-mapped (``NH`` > 1) reads are ignored; in paired-end
    files, pairs with a mate below ``minaqual`` are ignored too.  Files
    without any record are skipped.

    NOTE(review): ``minaqual`` is not defined in this function and is
    presumably a module-level setting -- confirm.
    """

    def multi_mapped(aln):
        # A record without an NH tag is treated as uniquely mapped.
        try:
            return aln.optional_field('NH') > 1
        except KeyError:
            return False

    def skip_boundaries(aln):
        if strand == "+":
            return [int(cigop.ref_iv.start) for cigop in aln.cigar
                    if cigop.type == "N" and cigop.size > 0]
        return [int(cigop.ref_iv.end) for cigop in aln.cigar
                if cigop.type == "N" and cigop.size > 0]

    skip_counter = collections.Counter()
    for bamfile in all_bamfiles:
        bam_reader = HTSeq.BAM_Reader(bamfile)
        read_seq = bam_reader.fetch(region=region_fetch)
        # Peek at the first record of the whole file to tell SE from PE data.
        first_read = next(iter(bam_reader.fetch()), None)
        if first_read is None:
            continue
        pe_mode = first_read.paired_end
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    continue
                if multi_mapped(a):
                    continue
                skip_counter.update(skip_boundaries(a))
                continue
            if ((a[0] and a[0].aQual < minaqual) or
                    (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and multi_mapped(a[0])) or
                    (a[1] and multi_mapped(a[1]))):
                continue
            for mate in (a[0], a[1]):
                if mate is not None and mate.aligned:
                    skip_counter.update(skip_boundaries(mate))
    return dict(skip_counter)
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Pair mates and hand every usable pair to ``counter``.

    Args:
        read_seq: iterable of SAM alignment records.
        counter: object providing ``not_aligned``, ``non_unique``,
            ``too_low_quality``, ``forward_count`` and ``reverse_count``.
        order: ``"name"`` or ``"pos"``; selects the HTSeq pairing helper.
        quiet: suppress progress messages on stderr when true.
        minaqual: pairs with a mate whose ``aQual`` is below this are
            reported through ``counter.too_low_quality``.

    Raises:
        ValueError: ``order`` is neither ``"name"`` nor ``"pos"``.

    For the forward orientation the first mate's ``M`` blocks are used as-is
    and the second mate's are strand-inverted; the reverse orientation is the
    mirror image.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            msg = "%d SAM alignment record pairs processed.\n" % (i)
            sys.stderr.write(msg)
        i += 1
        if r[0] is not None and r[0].aligned:
            forward_iv_seq = (co.ref_iv for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
            reverse_iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
        else:
            forward_iv_seq = tuple()
            reverse_iv_seq = tuple()
        if r[1] is not None and r[1].aligned:
            rest = (invert_strand(co.ref_iv) for co in r[1].cigar
                    if co.type == "M" and co.size > 0)
            forward_iv_seq = itertools.chain(forward_iv_seq, rest)
            rest = (co.ref_iv for co in r[1].cigar
                    if co.type == "M" and co.size > 0)
            reverse_iv_seq = itertools.chain(reverse_iv_seq, rest)
        else:
            # Second mate unusable: the pair is "not aligned" only if the
            # first mate is unusable too.
            if (r[0] is None) or not (r[0].aligned):
                counter.not_aligned(r)
                continue
        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.non_unique(r)
                continue
        except KeyError:
            # No NH tag: treat the pair as uniquely mapped.
            pass
        if (r[0] and r[0].aQual < minaqual) or \
                (r[1] and r[1].aQual < minaqual):
            # Was ``forward_counter.too_low_quality`` -- an undefined name.
            counter.too_low_quality(r)
            continue
        counter.forward_count(forward_iv_seq, r)
        counter.reverse_count(reverse_iv_seq, r)
    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (i))
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Feed paired alignments to ``counter`` in both strand orientations.

    ``order`` picks the pairing helper (``"name"`` or ``"pos"``); anything
    else raises ``ValueError``.  Each pair ends up in exactly one of
    ``counter.not_aligned``, ``counter.non_unique``,
    ``counter.too_low_quality`` or the ``forward_count`` / ``reverse_count``
    duo.  Progress goes to stderr unless ``quiet`` is true.
    """
    if order == "name":
        pairs = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        pairs = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    def match_blocks(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        if flip:
            return (invert_strand(op.ref_iv) for op in aln.cigar
                    if op.type == "M" and op.size > 0)
        return (op.ref_iv for op in aln.cigar
                if op.type == "M" and op.size > 0)

    processed = 0
    for r in pairs:
        if processed > 0 and processed % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (processed))
        processed += 1

        first_usable = r[0] is not None and r[0].aligned
        if first_usable:
            fwd_ivs = match_blocks(r[0], False)
            rev_ivs = match_blocks(r[0], True)
        else:
            fwd_ivs = tuple()
            rev_ivs = tuple()

        if r[1] is not None and r[1].aligned:
            # The second mate contributes with the opposite orientation.
            fwd_ivs = itertools.chain(fwd_ivs, match_blocks(r[1], True))
            rev_ivs = itertools.chain(rev_ivs, match_blocks(r[1], False))
        elif (r[0] is None) or not (r[0].aligned):
            counter.not_aligned(r)
            continue

        try:
            first_multi = r[0] is not None and r[0].optional_field("NH") > 1
            if first_multi or (r[1] is not None
                               and r[1].optional_field("NH") > 1):
                counter.non_unique(r)
                continue
        except KeyError:
            pass

        low_first = r[0] and r[0].aQual < minaqual
        if low_first or (r[1] and r[1].aQual < minaqual):
            counter.too_low_quality(r)
            continue

        counter.forward_count(fwd_ivs, r)
        counter.reverse_count(rev_ivs, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (processed))
def count_reads_paired(read_seq, counter, order, stranded, quiet, minaqual, write_to_samout):
    """Pair mates and count every usable pair through ``counter``.

    Args:
        read_seq: iterable of SAM alignment records.
        counter: object with numeric ``notaligned``, ``nonunique`` and
            ``lowqual`` attributes and a ``count(iv_seq, pair)`` method.
        order: ``"name"`` or ``"pos"``; selects the HTSeq pairing helper.
        stranded: ``"reverse"`` inverts the first mate's strand instead of
            the second mate's.
        quiet: suppress progress messages on stderr when true.
        minaqual: minimum ``aQual`` required of every present mate.
        write_to_samout: callable ``(pair, assignment)`` invoked for each
            rejected pair.

    Raises:
        ValueError: ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    flip_first = stranded == "reverse"

    def aligned_ivs(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            # This function only handles pairs, so the wording is fixed (it
            # used to consult an undefined ``pe_mode``).
            sys.stderr.write("%d SAM alignment record pairs processed.\n" % i)
        i += 1
        if r[0] is not None and r[0].aligned:
            iv_seq = aligned_ivs(r[0], flip_first)
        else:
            iv_seq = tuple()
        if r[1] is not None and r[1].aligned:
            iv_seq = itertools.chain(iv_seq, aligned_ivs(r[1], not flip_first))
        else:
            if (r[0] is None) or not (r[0].aligned):
                write_to_samout(r, "__not_aligned")
                counter.notaligned += 1
                continue
        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # No NH tag: treat the pair as uniquely mapped.
            pass
        if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
            # Tallied on the counter like the other categories (a bare local
            # ``lowqual`` was incremented before and never initialised).
            counter.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue
        counter.count(iv_seq, r)
    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count reads per feature in forward and/or reverse orientation.

    Args:
        sam_filename: input path, or ``"-"`` to read from stdin.
        features: genomic array of feature-id sets; must expose
            ``chrom_vectors`` and interval indexing with ``steps()``.
        counts: mapping feature id -> count; it is copied per orientation and
            not modified.
        samtype: ``"sam"``, ``"bam"`` or ``None`` (``detect_sam_type`` is
            asked).
        order: ``"name"`` or ``"pos"``; pairing helper for paired-end data.
        forward: count with the strand as read (second mate inverted).
        reverse: count with the strand inverted (second mate as read).
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; any other value ends the program via
            ``sys.exit`` at the first read that is assigned.
        quiet: suppress progress messages on stderr when true.
        minaqual: minimum ``aQual`` for a read (or each mate) to be counted.
        samout: path of an annotated SAM output file, or ``""`` for none.
        directory: output directory handed to
            ``brenninc_utils.create_new_file``.

    Raises:
        ValueError: unknown ``samtype`` or illegal ``order``.

    Per enabled orientation a ``*_forward_count`` / ``*_reverse_count`` text
    file is written and a summary is printed to stdout.
    """

    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def aligned_ivs(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    def overlapping_features(iv_seq):
        # Feature set hit by iv_seq under overlap_mode; None when no step
        # qualified in an intersection mode.
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
            return fs
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            return fs
        sys.exit("Illegal overlap mode.")

    def tally(r, fs, direction_counts, totals):
        # Classify one read for one orientation and record the outcome.
        if fs is None or len(fs) == 0:
            write_to_samout(r, "__no_feature")
            totals["empty"] += 1
        elif len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(sorted(fs)) + "]")
            totals["ambiguous"] += 1
        else:
            feature_id = list(fs)[0]
            write_to_samout(r, feature_id)
            direction_counts[feature_id] += 1

    def write_report(title, suffix, direction_counts, totals):
        # Write the count file for one orientation and print its summary.
        output = brenninc_utils.create_new_file(sam_filename, suffix,
                                                outputdir=directory,
                                                extension="txt",
                                                gzipped=False)
        used_features_count = 0
        used_features_sum = 0
        print(title + " written to " + str(output))
        with open(output, "w") as output_file:
            for fn in sorted(direction_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, direction_counts[fn]))
                used_features_count += 1
                used_features_sum += direction_counts[fn]
            output_file.write("__no_feature\t%d\n" % totals["empty"])
            output_file.write("__ambiguous\t%d\n" % totals["ambiguous"])
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        print("%s features with alignment\t%d" % (title, used_features_count))
        print("%s alignments asigned to feature\t%d"
              % (title, used_features_sum))
        print("__%s_no_feature\t%d" % (title.lower(), totals["empty"]))
        print("__%s_ambiguous\t%d" % (title.lower(), totals["ambiguous"]))

    # Totals exist for both orientations even when one is disabled, so the
    # unknown-chromosome handler below can always update them.
    forward_totals = {"empty": 0, "ambiguous": 0}
    reverse_totals = {"empty": 0, "ambiguous": 0}
    counts_forward = copy.copy(counts) if forward else None
    counts_reverse = copy.copy(counts) if reverse else None
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)
        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so the peeked record is chained
                # back in front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except Exception:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n"
                        % (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq_for = aligned_ivs(r, False)
                    iv_seq_rev = aligned_ivs(r, True)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq_for = aligned_ivs(r[0], False)
                        iv_seq_rev = aligned_ivs(r[0], True)
                    else:
                        iv_seq_for = tuple()
                        iv_seq_rev = tuple()
                    if r[1] is not None and r[1].aligned:
                        iv_seq_for = itertools.chain(
                            iv_seq_for, aligned_ivs(r[1], True))
                        iv_seq_rev = itertools.chain(
                            iv_seq_rev, aligned_ivs(r[1], False))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    # Resolve both orientations before reporting either, so an
                    # unknown chromosome yields a single "__no_feature" line.
                    fs_for = overlapping_features(iv_seq_for) if forward else None
                    fs_rev = overlapping_features(iv_seq_rev) if reverse else None
                    if forward:
                        tally(r, fs_for, counts_forward, forward_totals)
                    if reverse:
                        tally(r, fs_rev, counts_reverse, reverse_totals)
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    forward_totals["empty"] += 1
                    reverse_totals["empty"] += 1
        except Exception:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n"
                % read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n"
                % (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        # Close the annotated SAM output on failures as well.
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        write_report("Forward", "_forward_count", counts_forward,
                     forward_totals)
    if reverse:
        write_report("Reverse", "_reverse_count", counts_reverse,
                     reverse_totals)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads_paired(read_seq, forward_counter, reverse_counter, order, quiet, minaqual, write_to_samout):
    """Pair mates and count every usable pair on the supplied counters.

    Args:
        read_seq: iterable of SAM alignment records.
        forward_counter: counter for the forward orientation, or ``None``.
        reverse_counter: counter for the reverse orientation, or ``None``.
            Counters carry numeric ``notaligned``, ``nonunique`` and
            ``lowqual`` attributes and a ``count(iv_seq, pair)`` method.
        order: ``"name"`` or ``"pos"``; selects the HTSeq pairing helper.
        quiet: suppress progress messages on stderr when true.
        minaqual: minimum ``aQual`` required of every present mate.
        write_to_samout: callable ``(pair, assignment)`` invoked for each
            rejected pair.

    Raises:
        ValueError: ``order`` is neither ``"name"`` nor ``"pos"``.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    active = [c for c in (forward_counter, reverse_counter) if c is not None]

    def aligned_ivs(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations;
        # nothing is evaluated unless a counter consumes it.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record pairs processed.\n" % (i))
        i += 1
        if r[0] is not None and r[0].aligned:
            forward_iv_seq = aligned_ivs(r[0], False)
            reverse_iv_seq = aligned_ivs(r[0], True)
        else:
            forward_iv_seq = tuple()
            reverse_iv_seq = tuple()
        if r[1] is not None and r[1].aligned:
            forward_iv_seq = itertools.chain(forward_iv_seq,
                                             aligned_ivs(r[1], True))
            reverse_iv_seq = itertools.chain(reverse_iv_seq,
                                             aligned_ivs(r[1], False))
        else:
            if (r[0] is None) or not (r[0].aligned):
                write_to_samout(r, "__not_aligned")
                # Was ``forward_Counter.notaligned`` -- a misspelt name.
                for c in active:
                    c.notaligned += 1
                continue
        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                for c in active:
                    c.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # No NH tag: treat the pair as uniquely mapped.
            pass
        if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
            for c in active:
                c.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue
        if forward_counter is not None:
            forward_counter.count(forward_iv_seq, r)
        if reverse_counter is not None:
            reverse_counter.count(reverse_iv_seq, r)
    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (i))
def count_reads(features, counts, pe_mode, read_seq, order, stranded, overlap_mode, quiet, minaqual, write_to_samout):
    """Assign reads to features and print the resulting count table.

    Args:
        features: genomic array of feature-id sets; must expose
            ``chrom_vectors`` and interval indexing with ``steps()``.
        counts: mapping feature id -> count, updated in place.
        pe_mode: true when ``read_seq`` holds paired-end records.
        read_seq: iterable of SAM alignment records.
        order: ``"name"`` or ``"pos"``; pairing helper used in paired-end
            mode.
        stranded: ``"reverse"`` inverts the strand of the read (first mate);
            otherwise the second mate is the one inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; any other value ends the program via
            ``sys.exit`` at the first read that is assigned.
        quiet: suppress progress messages on stderr when true.
        minaqual: minimum ``aQual`` for a read (or each mate) to be counted.
        write_to_samout: callable ``(read_or_pair, assignment)`` invoked once
            per processed read.

    Raises:
        ValueError: illegal ``order`` in paired-end mode.

    Output on stdout: one ``feature<TAB>count`` line per key of ``counts`` in
    sorted order, followed by the five ``__*`` summary lines.
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    flip_first = stranded == "reverse"

    def aligned_ivs(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n"
                             % (i, "s" if not pe_mode else " pairs"))
        i += 1
        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: treat the read as uniquely mapped.
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = aligned_ivs(r, flip_first)
        else:
            if r[0] is not None and r[0].aligned:
                iv_seq = aligned_ivs(r[0], flip_first)
            else:
                iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
                iv_seq = itertools.chain(iv_seq,
                                         aligned_ivs(r[1], not flip_first))
            else:
                if (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
            try:
                if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                        (r[1] is not None and r[1].optional_field("NH") > 1):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        if len(fs2) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)
            else:
                sys.exit("Illegal overlap mode.")
            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
                empty += 1
            elif len(fs) > 1:
                # Sorted so the tag does not depend on set iteration order.
                write_to_samout(r, "__ambiguous[" + '+'.join(sorted(fs)) + "]")
                ambiguous += 1
            else:
                write_to_samout(r, list(fs)[0])
                counts[list(fs)[0]] += 1
        except UnknownChrom:
            write_to_samout(r, "__no_feature")
            empty += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n"
                         % (i, "alignments " if not pe_mode else "alignment pairs"))

    # Single-argument print calls behave the same on Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual,
                                       stranded, order, max_buffer_size):
    """Count reads per annotated exon/intron and report coverage per gene.

    Args:
        refGene_txt: tab-separated annotation with the columns gene label,
            feature, rank, position (``chrom:start-end:strand``) and length.
        bam_filename: BAM file to read alignments from.
        minaqual: minimum ``aQual`` for a read (or each mate) to be counted.
        stranded: ``'yes'`` or ``'reverse'`` enables strand-aware arrays;
            ``'reverse'`` additionally inverts the strand of the read (first
            mate) instead of the second mate.
        order: ``'name'`` or ``'pos'``; pairing helper for paired-end data.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer``.

    Raises:
        ValueError: illegal ``order`` with paired-end data.

    Output: a header plus one row per annotated feature on stdout (gene,
    feature, rank, position, length, read count, count per 1000 length units,
    percentage of positions with coverage), and the global counters on
    stderr.  A read touching any ``'intron'`` feature is counted for those
    introns only; otherwise it is counted for every feature it touches.
    """
    # initialise counters
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    # Read BAM file
    bam_reader = HTSeq.BAM_Reader(bam_filename)

    # CIGAR match characters (alignment match, sequence match and sequence
    # mismatch)
    cigar_char = ('M', '=', 'X')

    # Strand handling, following htseq-count
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def multi_mapped(aln):
        # A record without an NH tag is treated as uniquely mapped.
        try:
            return aln.optional_field('NH') > 1
        except KeyError:
            return False

    def aligned_ivs(aln, flip):
        # Same CIGAR filter in every mode (unstranded single-end reads used to
        # accept "M" only).
        if flip:
            return (invert_strand(cigop.ref_iv) for cigop in aln.cigar
                    if cigop.type in cigar_char and cigop.size > 0)
        return (cigop.ref_iv for cigop in aln.cigar
                if cigop.type in cigar_char and cigop.size > 0)

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    annot = collections.OrderedDict()
    with open(refGene_txt) as annot_file:
        for line in annot_file:
            gene_label, feature, rank, position, length = line.strip().split('\t')
            chrom, iv_str, strand = position.strip().split(':')
            start, end = map(int, iv_str.strip().split('-'))
            annot.setdefault(gene_label, []).append(
                (feature, int(rank), chrom, start, end, strand, int(length)))

    pe_mode = None  # detected once, on the first gene
    for gene_name in annot:
        records = annot[gene_name]
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto", stranded=stranded_boolean, typecode="i")

        # Annotation
        for feature, rank, chrom, start, end, strand, length in records:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Iterating bam_reader directly is problematic (attributed by the
        # author to a pysam bug), so records are always obtained via fetch().
        boundary_left = min(rec[3] for rec in records)
        boundary_right = max(rec[4] for rec in records)
        region_fetch = records[0][2] + ':' + str(
            int(boundary_left) - 500) + '-' + str(int(boundary_right) + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode from the first record of the file; an
        # empty file is treated as single-end
        if pe_mode is None:
            first_read = next(iter(bam_reader.fetch()), None)
            pe_mode = first_read is not None and first_read.paired_end
        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if multi_mapped(a):
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                iv_seq = aligned_ivs(a, reverse_boolean)
            else:
                if ((a[0] and a[0].aQual < minaqual) or
                        (a[1] and a[1].aQual < minaqual)):
                    counts['_lowaqual'] += 1
                    continue
                if ((a[0] and multi_mapped(a[0])) or
                        (a[1] and multi_mapped(a[1]))):
                    continue
                if a[0] is not None and a[0].aligned:
                    iv_seq = aligned_ivs(a[0], reverse_boolean)
                else:
                    iv_seq = tuple()
                if a[1] is not None and a[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq, aligned_ivs(a[1], not reverse_boolean))

            feature_aligned = set()
            for iv in iv_seq:
                for iv2, val2 in gas[iv].steps():
                    feature_aligned |= val2
                ga[iv] += 1  # for calculating coverage
            if len(feature_aligned) == 0:
                counts['_empty'] += 1
                continue
            # when mapping to intron, discard exons; otherwise count all
            introns = [item for item in feature_aligned if item[0] == 'intron']
            for f in (introns or feature_aligned):
                gene_count[f] += 1

        res = []
        for feature, rank, chrom, start, end, strand, length in records:
            feature_count = gene_count[(feature, rank)]
            # float() keeps true division on Python 2 as well
            feature_count_norm = feature_count / float(length) * 1000
            # Coverage calculation: share of positions with depth > 0
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            covered = sum(1 for depth in cvg_region if depth > 0)
            cvg = covered / float(len(cvg_region)) * 100
            res.append([
                feature, rank, chrom, start, end, strand, length,
                feature_count, feature_count_norm, cvg
            ])

        # Output
        for (feature, rank, chrom, start, end, strand, length, feature_count,
             feature_count_norm, cvg) in res:
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts.keys():
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign reads to features and print the resulting count table.

    Args:
        features: genomic array of feature-id sets; must expose
            ``chrom_vectors`` and interval indexing with ``steps()``.
        counts: mapping feature id -> count, updated in place.
        pe_mode: true when ``read_seq`` holds paired-end records.
        read_seq: iterable of SAM alignment records.
        order: ``"name"`` or ``"pos"``; pairing helper used in paired-end
            mode.
        stranded: ``"reverse"`` inverts the strand of the read (first mate);
            otherwise the second mate is the one inverted.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; any other value ends the program via
            ``sys.exit`` at the first read that is assigned.
        quiet: suppress progress messages on stderr when true.
        minaqual: minimum ``aQual`` for a read (or each mate) to be counted.
        write_to_samout: callable ``(read_or_pair, assignment)`` invoked once
            per processed read.

    Raises:
        ValueError: illegal ``order`` in paired-end mode.

    Output on stdout: one ``feature<TAB>count`` line per key of ``counts`` in
    sorted order, followed by the five ``__*`` summary lines.
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    flip_first = stranded == "reverse"

    def aligned_ivs(aln, flip):
        # Lazy sequence of the reference intervals of the "M" operations.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n" %
                             (i, "s" if not pe_mode else " pairs"))
        i += 1
        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH tag: treat the read as uniquely mapped.
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = aligned_ivs(r, flip_first)
        else:
            if r[0] is not None and r[0].aligned:
                iv_seq = aligned_ivs(r[0], flip_first)
            else:
                iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
                iv_seq = itertools.chain(iv_seq,
                                         aligned_ivs(r[1], not flip_first))
            else:
                if (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
            try:
                if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                        (r[1] is not None and r[1].optional_field("NH") > 1):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                    r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        if len(fs2) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)
            else:
                sys.exit("Illegal overlap mode.")
            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
                empty += 1
            elif len(fs) > 1:
                # Sorted so the tag does not depend on set iteration order.
                write_to_samout(r, "__ambiguous[" + '+'.join(sorted(fs)) + "]")
                ambiguous += 1
            else:
                write_to_samout(r, list(fs)[0])
                counts[list(fs)[0]] += 1
        except UnknownChrom:
            write_to_samout(r, "__no_feature")
            empty += 1

    if not quiet:
        sys.stderr.write(
            "%d SAM %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))

    # Single-argument print calls behave the same on Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Collect per-gene coverage at sampled positions and plot it per sample.

    The GFF file is read once: features of ``feature_type`` are stored in a
    genomic array and their exon intervals are grouped per ``id_attribute``
    value. For every gene, points at 0%, 10%, ..., 90% of the merged exon
    length are computed. Each input alignment file is then scanned; read
    pairs assigned to exactly one gene increment that gene's
    ``total_aligned_reads`` and, via ``check``, the coverage of a point they
    overlap. Per-sample tables and a final plot are written to hard-coded
    paths under ``/home/kirill/bi/transcript/``.

    ``additional_attributes`` is not used by the visible code.

    Raises:
        ValueError: on mismatched ``samouts`` length, a feature missing the
            id attribute or strand, an unknown ``samtype`` or an illegal
            ``order``.

    NOTE(review): the body uses ``iteritems``/``xrange``, so it presumably
    targets Python 2; ``check`` and the point computation appear to rely on
    ``/`` being integer division there - confirm before porting.
    """

    def exists(obj, chain):
        """Follow the keys in ``chain`` through nested mappings.

        Returns the value reached, or None (implicitly) as soon as a key is
        missing. ``chain`` is consumed with ``pop(0)``.
        """
        _key = chain.pop(0)
        if _key in obj:
            return exists(obj[_key], chain) if chain else obj[_key]

    def check_overlapped_exons_and_calc_sum(gene):
        """Merge overlapping exons of ``gene`` in place and sum their lengths.

        Reads ``gene["exons"]`` (expected to be sorted by the caller) and
        overwrites ``gene["exons"]`` and ``gene["total_sum_of_exons"]``.
        """
        rightmost_value = gene["exons"][0][1]
        start = gene["exons"][0][0]
        new_exons = []
        total = rightmost_value - start
        for interval in gene["exons"]:
            if (interval[0] <= rightmost_value and
                    interval[1] >= rightmost_value):
                total += (interval[1] - rightmost_value)
                rightmost_value = interval[1]
            elif (interval[0] > rightmost_value):
                total += (interval[1] - interval[0])
                new_exons.append([start, rightmost_value
                                  ])  # add previous extended interval to result
                start = interval[0]
                rightmost_value = interval[1]
        new_exons.append([start, rightmost_value])
        gene["exons"] = new_exons
        gene["total_sum_of_exons"] = total

    def check_and_count_points_coverage(gene_id, first_read, second_read):
        """Feed the span(s) covered by a read pair to ``check``.

        Does nothing when either mate is None or not a proper pair.
        """
        # determine which of the points is crossed
        # subtract the gene start coordinate from every coordinate!
        if (first_read is None or second_read is None):
            return
        gene_begin = genes_exons[gene_id]["gene_begin"]
        fstart = first_read.iv.start - gene_begin
        fend = first_read.iv.end - gene_begin
        sstart = second_read.iv.start - gene_begin
        send = second_read.iv.end - gene_begin
        if (first_read.proper_pair == False or
                second_read.proper_pair == False):
            return
        if (fend < sstart and fstart < fend and sstart < send):
            # disjoint mates, first one upstream: check both spans
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)
        elif (send < fstart and fstart < fend and sstart < send):
            # disjoint mates, second one upstream: check both spans
            check(gene_id, fstart, fend)
            check(gene_id, sstart, send)
        elif (fstart < fend and sstart < send and sstart >= fstart and
              send >= fend and sstart <= fend):
            # overlapping mates: check the combined span
            check(gene_id, fstart, send)
        elif (fstart < fend and sstart < send and sstart <= fstart and
              send >= fstart and send <= fend):
            check(gene_id, sstart, fend)
        elif (fstart < sstart and send < fend):
            # second mate contained in the first
            check(gene_id, fstart, fend)
        elif (sstart < fstart and fend < send):
            # first mate contained in the second
            check(gene_id, sstart, send)

    def check(gene_id, start, end):
        """Search the gene's points for one strictly inside (start, end).

        Increments the ``coverage`` of the first such point found. Any
        exception raised during the search is swallowed and reported as
        "Out of boundaries" on stderr.
        """
        total = 100
        # NOTE(review): presumably integer division (Python 2) - confirm.
        half = total / 2
        left_interval = right_interval = half
        try:
            i = 0
            while (left_interval >= 10):
                if (i > 10):
                    raise ValueError('Out of boundaries\n')
                if (exists(genes_coverage_in_points,
                           [gene_id, half]) == None):
                    # if the point does not exist, look for the nearest one
                    # on the left
                    # half = math.ceil(half)
                    half = int(math.floor(half / 10) * 10)
                    point = genes_coverage_in_points[gene_id][half]["point"]
                    right_interval += 5
                    left_interval -= 5
                else:
                    # if the point exists,
                    point = genes_coverage_in_points[gene_id][half]["point"]
                if (point < start):
                    # the point is to the left of the read; the read is on
                    # the right side
                    half = half + (right_interval / 2)
                    left_interval = right_interval = right_interval / 2
                elif (point > end):
                    # the point is to the right of the read; the read is on
                    # the left side
                    half = half - (left_interval / 2)
                    left_interval = right_interval = left_interval / 2
                elif (point > start and point < end):
                    # overlaps
                    genes_coverage_in_points[gene_id][half]["coverage"] += 1
                    return
                i += 1
        except:
            sys.stderr.write("Out of boundaries\n")

    def check2(gene_id, start, end):
        """Linear alternative to ``check``; not called in the visible code."""
        #gene_begin = genes_exons[gene_id]["gene_begin"]
        for i in range(0, 100, 10):
            point = genes_coverage_in_points[gene_id][i]["point"]
            if (start < point and point < end):
                genes_coverage_in_points[gene_id][i]["coverage"] += 1
                return

    def clear_all_cov_points():
        """Reset every point's ``coverage`` counter to zero."""
        for gene_id, gene in genes_coverage_in_points.iteritems():
            for k, val in gene.iteritems():
                val["coverage"] = 0

    def plot_gene_coverage():
        """Debug plot helper; its only call site below is commented out.

        NOTE(review): ``cvg``, ``test_first_exon_start`` and
        ``test_last_exon_end`` are not assigned anywhere in this function's
        visible code - confirm they exist before enabling.
        """
        sys.stderr.write("ENSG00000000003.10 genes on: " + str(test_n[0]) +
                         "\n")
        x = []
        y = []
        i = 0
        for k, val in enumerate(
                list(cvg[HTSeq.GenomicInterval("chrX", test_first_exon_start,
                                               test_last_exon_end)])):
            x.append(i)
            y.append(val)
            i += 1
        plt.plot(x, y)
        plt.show()

    """ iv = HTSeq.GenomicInterval("chr3", 100, 200, "+") cvg[iv] += 1 iv = HTSeq.GenomicInterval("chr3", 150, 250, "-") cvg[iv] += 1 """

    if samouts != "":
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    #genes_coverage_in_points = {}
    # gene_id -> {percent step -> {"point": ..., "coverage": ...}}
    genes_coverage_in_points = defaultdict(dict)
    #genes_exons = {}
    # gene_id -> exon intervals and per-gene totals
    genes_exons = defaultdict(dict)
    #cvg = HTSeq.GenomicArray("auto", stranded != "no")
    test_n = [0]
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                #counts[f.attr[id_attribute]] = 0
                # exons are not sorted by coordinate!
                # gene - exon boundary
                # all intervals and the sum of all intervals are kept here
                gene_id = feature_id  #f.attr[id_attribute]
                if (exists(genes_exons, [gene_id]) == None):
                    # coordinate of the first exon
                    genes_exons[gene_id] = {
                        "total_sum_of_exons": 0,
                        "total_aligned_reads": 0,
                        "gene_begin": 0,
                        "exons": list([[f.iv.start, f.iv.end]])
                    }
                else:
                    genes_exons[gene_id]["exons"].append(
                        [f.iv.start, f.iv.end])
                # 10 points per gene for which coverage will be counted
                # (introns are subtracted)
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(genes_exons) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    # walk over all genes and, within each, sort by the first exon coordinate
    # after sorting each gene, set the outermost gene start coordinate
    # (first exon)
    # overlapping exons must be merged and their boundaries extended
    # after merging we obtain the sum of exons, total_sum_of_exons, i.e. we
    # get the regions not covered on either strand
    for gene_id, gene in genes_exons.iteritems():
        gene["exons"].sort()  #by first member
        gene["gene_begin"] = gene["exons"][0][0]
        # merge all overlapping exons and at the same time compute the sum of
        # lengths without the resulting gaps
        check_overlapped_exons_and_calc_sum(gene)
        total = gene["total_sum_of_exons"]  # length of all exons
        for ten_interval in xrange(0, 100, 10):
            point = (total * ten_interval
                     ) / 100  # point in absolute terms: % of the exon length
            prev_exon_end = 0
            for exon_key, exon in enumerate(gene["exons"]):
                #prev_exon_length + exon.start +
                point += (exon[0] - prev_exon_end)  # intron length
                if (point < exon[1]):  # exon end point
                    # write the point to the final array
                    genes_coverage_in_points[gene_id][ten_interval] = {
                        "point": point - gene["gene_begin"],
                        "coverage": 0
                    }
                    break  # move on to the next 10% point
                else:
                    # record the length of the exon that did not fit
                    #prev_exon_length += exon.end - exon.start
                    prev_exon_end = exon[1]

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    sample = 0
    colors = ["red", "blue", "green", "yellow"]
    handlers = []
    sys.stderr.write(strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
    for isam, (sam_filename) in enumerate(sam_filenames):
        total_of_reads_in_sample = 0
        if samouts != '':
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                # put the peeked record back in front of the stream
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq, max_buffer_size=max_buffer_size)
                else:
                    raise ValueError("Illegal order specified.")
            notaligned = 0
            lowqual = 0
            i = 0
            for r in read_seq:
                #TODO 'NoneType' object has no attribute 'iv' raised in plot_coverage.py:169]
                total_of_reads_in_sample += 1
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n"
                                     % (i, "s" if not pe_mode else " pairs"))
                    sys.stderr.write(
                        strftime("%Y-%m-%d %H:%M:%S", gmtime()) + "\n")
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        #notaligned += 1
                        #write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore') and
                            r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore') and
                            r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    # NOTE(review): this branch matches only "M" while every
                    # other branch uses `com` - confirm whether intended.
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq, (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            #write_to_samout(r, "__not_aligned", samoutfile)
                            #notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                            (r[1] is not None and
                             r[1].optional_field("NH") > 1)):
                            #nonunique += 1
                            #write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        #write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                continue  #raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                #counts[list(fs)[0]] += 1
                                #read mapped only for one exon, (all cigar parts of both reads in pair mapped on one gene, but may be for several exons)
                                #we can take this read into account of analysis
                                #they must come in sorted order by coordinate!
                                #this is one unit of analysis. save it in memory and go throught it
                                gene_name = list(fs)[0]  # - gene name
                                genes_exons[gene_name][
                                    "total_aligned_reads"] += 1
                                #if (total_of_reads_in_sample==100000):
                                #    break
                                # NOTE(review): r[0]/r[1] index a pair; in the
                                # single-end branch above r is used as one
                                # alignment - confirm single-end is supported.
                                check_and_count_points_coverage(
                                    gene_name, r[0], r[1])
                            """ elif multimapped_mode == 'all': for fsi in list(fs): #counts[fsi] += 1 """
                        else:
                            sys.exit("Illegal multimap mode.")
                except UnknownChrom:
                    #write_to_samout(r, "__no_feature", samoutfile)
                    #empty += 1
                    raise
        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))

        if samoutfile is not None:
            samoutfile.close()

        # save the data to tables so they can be processed however needed
        # later!
        outfile = open(
            '/home/kirill/bi/transcript/' + str(sample) + '_dict.txt', 'w')
        outfile.write("total_of_reads_in_sample" + '\t' +
                      str(total_of_reads_in_sample) + '\n')
        # two lines per gene: totals, then the coverage of each point
        for gene_id, gene in genes_coverage_in_points.iteritems():
            outfile.write(
                str(gene_id) + '\t' +
                str(genes_exons[gene_id]["total_aligned_reads"]) + '\t' +
                str(genes_exons[gene_id]["total_sum_of_exons"]) + '\n')
            outfile.write(str(gene_id) + '\t')
            [
                outfile.write(str(val["coverage"]) + '\t')
                for k, val in gene.iteritems()
            ]
            outfile.write('\n')
        outfile.close()
        #############test################
        #plot_gene_coverage()
        ################################
        # 1. get the % of the reads mapped to the gene at a specific point
        #    (the sum of all % over the 10 points = 100) - the number of reads
        #    mapped to the gene will be stored in an array (this is the former
        #    count array)
        # 2. for each point, divide the obtained percentage by the length of
        #    the specific gene (total_sum of exons)
        # 3. for each point, divide the value by the total number of reads in
        #    the sample
        # 4. deviance - min - max of all values? the point on the plot is the
        #    mean between them
        CalcCoverage.do_coverage(genes_coverage_in_points, genes_exons,
                                 total_of_reads_in_sample, colors, sample,
                                 handlers)
        sample += 1
        # reset the coverage points
        clear_all_cov_points()

    plt.legend(handlers, ['Sample ' + str(v) for v in range(0, sample, 1)])
    plt.title('Positions relative coverege')
    plt.xlabel('5` -> 3` positions, %')
    plt.ylabel('relative coverage')
    plt.grid(True)
    plt.savefig('/home/kirill/bi/transcript/covarage.png')
    plt.show()
    plt.close()
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type,
                            id_attribute, quiet, minaqual, samout):
    """Count alignments per annotated feature and print the table to stdout.

    Args:
        sam_filename: alignment file path, or ``"-"`` to read from stdin.
        gff_filename: annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the reader class.
        order: ``"name"`` or ``"pos"``; pairing strategy for paired-end data.
        stranded: ``"no"`` builds an unstranded feature array; ``"reverse"``
            inverts the strand of every read interval.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        feature_type: only GFF records of this type are used.
        id_attribute: GFF attribute whose value is the feature id.
        quiet: suppress progress messages on stderr.
        minaqual: alignments with ``aQual`` below this value are skipped.
        samout: path for the annotated SAM output, or ``""`` for none.

    Raises:
        ValueError: for a feature lacking ``id_attribute`` or strand
            information (in stranded mode), an unknown ``samtype`` or an
            illegal ``order``.

    Nothing is returned; per-feature counts and five special counters are
    printed.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to each mate's original line.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(aln, flip):
        # Reference intervals of the non-empty "M" CIGAR operations,
        # strand-inverted when `flip` is set.
        if flip:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    samoutfile = open(samout, "w") if samout != "" else None

    # Everything that can fail runs inside try/finally so the output file
    # is closed even when an error aborts the run.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'."
                            % (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Report where the GFF failed, then let the error propagate.
            sys.stderr.write(
                "Error occured when processing GFF file (%s):\n" %
                gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write(
                "Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError(
                "Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                # next() builtin instead of the Python-2-only .next()
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                # stdin cannot be re-read: put the peeked record back
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        reverse = stranded == "reverse"

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # second mate is taken on the opposite strand
                        iv_seq = itertools.chain(
                            iv_seq, match_intervals(r[1], not reverse))
                    elif (r[0] is None) or not r[0].aligned:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if (len(fs2) > 0 or
                                        overlap_mode == "intersection-strict"):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                except UnknownChrom:
                    # Chromosome absent from the annotation: no feature.
                    write_to_samout(r, "__no_feature")
                    empty += 1
        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n" %
                (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # Single-argument print(...) behaves identically on Python 2 and 3.
    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        cb_tag,
        ub_tag,
        ):
    """Count features per cell barcode, collapsing UMIs by majority vote.

    The input is always opened with ``HTSeq.BAM_Reader`` (stdin when
    ``sam_filename`` is ``"-"``). Every alignment or pair is assigned to a
    feature or to a special ``__*`` category, and the assignment is tallied
    under the cell barcode (``cb_tag``) and UMI (``ub_tag``) read from the
    optional fields. Each UMI then contributes one count to its most
    frequent category; a UMI whose two top categories tie contributes
    nothing.

    Returns:
        dict with ``'cell_barcodes'`` (sorted list of barcodes) and
        ``'counts'`` (barcode -> Counter of category -> number of UMIs).

    Raises:
        ValueError: if ``order`` is neither ``"name"`` nor ``"pos"`` for
            paired-end input.
    """

    def write_to_samout(r, assignment, samoutfile, template=None):
        # Tag every mate with XF=<assignment> and emit it as SAM text or
        # as a pysam segment, depending on the requested output format.
        if samoutfile is None:
            return
        mates = r if pe_mode else (r,)
        for mate in mates:
            if mate is None:
                continue
            mate.optional_fields.append(('XF', assignment))
            if samout_format in ('SAM', 'sam'):
                samoutfile.write(mate.get_sam_line() + "\n")
            else:
                samoutfile.write(mate.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        mates = r if pe_mode else (r,)
        # slot 0: cell barcode, slot 1: UMI
        found = [None, None]
        n_found = 0
        for mate in mates:
            if mate is None:
                continue
            for tag, value in mate.optional_fields:
                if tag == cb_tag:
                    slot = 0
                elif tag == ub_tag:
                    slot = 1
                else:
                    continue
                found[slot] = value
                n_found += 1
                if n_found == 2:
                    return found
        return found

    def matched_intervals(aln, flip):
        # Reference intervals of the non-empty match-type CIGAR operations,
        # strand-inverted when `flip` is set.
        if flip:
            return (invert_strand(op.ref_iv) for op in aln.cigar
                    if op.type in com and op.size > 0)
        return (op.ref_iv for op in aln.cigar
                if op.type in com and op.size > 0)

    def overlapping_features(intervals):
        # Combine the feature sets under `intervals` per `overlap_mode`.
        if overlap_mode == "union":
            hits = set()
            for iv in intervals:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, step_set in features[iv].steps():
                    hits = hits.union(step_set)
            return hits
        if overlap_mode in ("intersection-strict", "intersection-nonempty"):
            hits = None
            for iv in intervals:
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _, step_set in features[iv].steps():
                    if ((len(step_set) > 0) or
                            (overlap_mode == "intersection-strict")):
                        if hits is None:
                            hits = step_set.copy()
                        else:
                            hits = hits.intersection(step_set)
            return hits
        sys.exit("Illegal overlap mode.")

    try:
        read_seq_file = HTSeq.BAM_Reader(
            sys.stdin if sam_filename == "-" else sam_filename)

        # Output handle; a template is only needed for BAM output
        template = None
        samoutfile = None
        if samout_filename is not None:
            if samout_format in ('bam', 'BAM'):
                template = read_seq_file.get_template()
                samoutfile = pysam.AlignmentFile(
                    samout_filename, 'wb',
                    template=template,
                    )
            else:
                samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Peek at the first record to learn the layout; an empty input
        # ends up here with no record at all
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        # FIXME: catchall can hide subtle bugs
        except:
            first_read = None
            pe_mode = False
        if first_read is None:
            read_seq = []
        else:
            read_seq = itertools.chain([first_read], read_seq_iter)
    except:
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR operations treated as matches: alignment match, sequence match
    # and sequence mismatch
    com = ('M', '=', 'X')
    reverse = stranded == "reverse"
    skip_secondary = secondary_alignment_mode == 'ignore'
    skip_supplementary = supplementary_alignment_mode == 'ignore'

    try:
        if pe_mode:
            primary_only = skip_supplementary and skip_secondary
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                    read_seq,
                    primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # cell barcode -> UMI -> Counter(category)
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()
            i += 1

            cell, umi = identify_barcodes(r)

            if pe_mode:
                mate1 = r[0]
                mate2 = r[1]
                if mate1 is not None and mate1.aligned:
                    iv_seq = matched_intervals(mate1, reverse)
                else:
                    iv_seq = tuple()
                if mate2 is not None and mate2.aligned:
                    # second mate is taken on the opposite strand
                    iv_seq = itertools.chain(
                        iv_seq, matched_intervals(mate2, not reverse))
                elif (mate1 is None) or not (mate1.aligned):
                    write_to_samout(
                        r, "__not_aligned", samoutfile, template)
                    counts[cell][umi]['__not_aligned'] += 1
                    continue
                if skip_secondary:
                    if (mate1 is not None) and mate1.not_primary_alignment:
                        continue
                    elif (mate2 is not None) and mate2.not_primary_alignment:
                        continue
                if skip_supplementary:
                    if (mate1 is not None) and mate1.supplementary:
                        continue
                    elif (mate2 is not None) and mate2.supplementary:
                        continue
                try:
                    if ((mate1 is not None and
                         mate1.optional_field("NH") > 1) or
                            (mate2 is not None and
                             mate2.optional_field("NH") > 1)):
                        write_to_samout(
                            r, "__alignment_not_unique",
                            samoutfile, template)
                        counts[cell][umi]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((mate1 and mate1.aQual < minaqual) or
                        (mate2 and mate2.aQual < minaqual)):
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile, template)
                    counts[cell][umi]['__too_low_aQual'] += 1
                    continue
            else:
                if not r.aligned:
                    counts[cell][umi]['__not_aligned'] += 1
                    write_to_samout(
                        r, "__not_aligned", samoutfile, template)
                    continue
                if skip_secondary and r.not_primary_alignment:
                    continue
                if skip_supplementary and r.supplementary:
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cell][umi]['__alignment_not_unique'] += 1
                        write_to_samout(
                            r, "__alignment_not_unique",
                            samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    counts[cell][umi]['__too_low_aQual'] += 1
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile, template)
                    continue
                iv_seq = matched_intervals(r, reverse)

            try:
                fs = overlapping_features(iv_seq)
                n_hits = 0 if fs is None else len(fs)

                if n_hits == 0:
                    write_to_samout(
                        r, "__no_feature", samoutfile, template)
                    counts[cell][umi]['__no_feature'] += 1
                elif n_hits > 1:
                    write_to_samout(
                        r, "__ambiguous[" + '+'.join(fs) + "]",
                        samoutfile, template)
                    counts[cell][umi]['__ambiguous'] += 1
                else:
                    write_to_samout(
                        r, list(fs)[0], samoutfile, template)

                if n_hits > 0:
                    if multimapped_mode == 'none':
                        if n_hits == 1:
                            counts[cell][umi][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[cell][umi][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")
            except UnknownChrom:
                # Chromosome absent from the annotation: no feature.
                write_to_samout(
                    r, "__no_feature", samoutfile, template)
                counts[cell][umi]['__no_feature'] += 1

    except:
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    # Collapse UMIs: each UMI votes once for its most frequent category
    cbs = sorted(counts)
    counts_noumi = {}
    for cb in cbs:
        per_cell = Counter()
        for umi_tally in counts.pop(cb).values():
            leaders = umi_tally.most_common(2)
            # A tie between the two top categories casts no vote
            if (len(leaders) == 2) and (leaders[0][1] == leaders[1][1]):
                continue
            per_cell[leaders[0][0]] += 1
        counts_noumi[cb] = per_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
        }
def main():
    """Command-line entry point: estimate feature abundances from alignments.

    Parses the command line, stores GFF3 features of the requested type in
    an unstranded ``HTSeq.GenomicArrayOfSets``, assigns every read (or read
    pair) of the SAM/BAM input to the feature(s) it overlaps according to
    the selected overlap mode, and writes one tab-separated abundance table
    per requested unit.  Run statistics are printed to standard error.

    The process exits with status 1 when no output file can be created, when
    the GFF3 file cannot be processed or contains no feature of the requested
    type, or when the alignment file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'alignment_file', metavar='in.aln',
        help="input alignment file in SAM or BAM format. Use '-' to indicate "
             "that input should be taken from standard input (stdin)")
    parser.add_argument(
        'feature_file', metavar='in.gff3',
        help="input feature annotation file in GFF3 format. Use '-' to indicate "
             "that input should be taken from standard input (stdin)")
    parser.add_argument(
        '-m', '--mapping', metavar='in.json', dest='map_files',
        action=ParseSeparator, sep=',',
        help="input one or more relational databases, in JSON format, "
             "containing features mapped to feature categories, such as genes "
             "to gene families or exons to genes. Abundance estimates for the "
             "given feature category will be reported in place of features. "
             "Multiple input files can be provided by separating them with a "
             "comma and no spaces")
    parser.add_argument(
        '-c', '--category', metavar='FIELD', dest='category',
        help="field in the relational database representing how features "
             "are categorized. WARNING: if the value type of the selected field "
             "is a list, then the category abundance totals can be greater than "
             "the feature abundance totals")

    gff_group = parser.add_argument_group('GFF3 arguments')
    gff_group.add_argument(
        '-t', '--type', metavar='TYPE', dest='ftype', default='CDS',
        help="feature type (3rd column in GFF file) to estimate abundance for "
             "[default: CDS]. All features of other type will be ignored")
    gff_group.add_argument(
        '-a', '--attr', metavar='ATTRIBUTE', default="Name",
        help="GFF attribute to use as the ID for the calculated abundances "
             "[default: 'Name']. This value will also be used as the search "
             "ID in the relational database, if provided")

    aln_group = parser.add_argument_group('SAM/BAM arguments')
    aln_group.add_argument(
        '-f', '--format', metavar='FORMAT', dest='aformat',
        choices=['bam', 'sam'], default='bam',
        help="input alignment file format [default: bam]. Options are 'sam' "
             "or 'bam'")
    aln_group.add_argument(
        '-q', '--qual', metavar='THRESH', dest='minqual', type=int, default=2,
        help="skip all reads with alignment quality lower than the threshold "
             "[default: 2]")
    aln_group.add_argument(
        '-s', '--sorting', metavar='ORDER', dest='order',
        choices=["position", "name"], default='position',
        help="alignment file sorting scheme. Options are 'position' and "
             "'name' [default: position]. Alignments must be pre-sorted "
             "either by position/coordinates or by read name. This option "
             "will be ignored for single-end reads")
    aln_group.add_argument(
        '-b', '--buffer', metavar='BYTES', dest='buffer_size', type=int,
        default=3145728,
        help="buffer size for paired reads in the alignment file if sorted by "
             "position [default: 3145728 (3GB)]. This value should be "
             "increased if memory issues are encountered")

    count_group = parser.add_argument_group('quantification arguments')
    count_group.add_argument(
        '-e', '--mode', metavar='MODE',
        choices=["union", "intersection-strict", "intersection-nonempty"],
        default="union",
        help="mode for handling different alignment scenarios. Options are "
             "'union', 'intersection-strict', and 'intersection-nonempty' "
             "[default: union]. The modes will count alignments differently "
             "depending on whether a read/pair overlaps more than one feature "
             "or only partially aligns to a single feature. The most "
             "inclusive mode is 'union' when given with the nonunique flag, "
             "and the least inclusive is 'intersection-strict'")
    # The default must already be a list: argparse never runs a custom action
    # on a default value, so a plain 'counts' string would later be iterated
    # character by character and every "unit" would be rejected.
    count_group.add_argument(
        '-u', '--units', metavar='UNITS', dest='norm',
        action=ParseSeparator, sep=',', default=['counts'],
        help="comma-separated list of units to output abundance estimates in "
             "[default: counts]. Options are 'counts', 'fpk' (fragments per "
             "kilobase of feature), 'fpkm' (fragements per kilobase of "
             "feature per million mapped fragments), 'tpm' "
             "(transcripts/fragments per million), 'prop', and 'custom'. If "
             "other than 'counts', features will be normalized by recruitment "
             "length, which will be calculated from the start and end fields "
             "of the GFF3 file. This is the sole normalization method used "
             "when transforming counts to FPK, and is useful to correct for "
             "differences in feature lengths within a sample. In addition to "
             "feature length, FPKM and TPM attempt to account for differences "
             "between samples in sequencing effort. An advantage of TPM over "
             "FPKM is that TPM is a proportional measurement, making it "
             "easier to identify the extent that the relative 'importance' of "
             "a given feature changes between samples. A custom transformation "
             "can also be performed when used with the -k/--coeff argument, in "
             "which case the length normalized proportion of a feature will be "
             "multiplied by the provided scaling factor.")
    count_group.add_argument(
        '-k', '--coeff', metavar='MUL', dest='sfactor', type=float, default=1,
        help="multiplier to use when 'custom' is given to -u/--units "
             "[default: 1]")
    count_group.add_argument(
        '--cdna', dest='transcripts', action='store_true',
        help="sequences represent cDNA [default: False]. Whether sequences are "
             "from gDNA or cDNA will determine how the length of a feature is "
             "calculated for normalization. If cDNA, effective length will "
             "serve as feature length")
    count_group.add_argument(
        '--nonunique', action='store_true',
        help="allow reads to align with more than one feature")

    output_group = parser.add_argument_group('output control arguments')
    output_group.add_argument(
        '-o', '--outpref', type=str, metavar='PREFIX', dest='outpref',
        default='sample',
        help="prefix for the output tabular files containing feature abundance "
             "estimates [default: sample]. File names will be appended with "
             "the units, file format, and compression algorithm, if relevant "
             "[e.g. sample.counts.csv.gz]")
    output_group.add_argument(
        '--filter', dest='cat_only', action='store_true',
        help="only output abundances for features with an associated feature "
             "category [default: output all]")
    compress_group = output_group.add_mutually_exclusive_group()
    compress_group.add_argument(
        '--gzip', dest='gzipped', action='store_true',
        help="compress output using the gzip algorithm")
    compress_group.add_argument(
        '--bzip2', dest='bzipped', action='store_true',
        help="compress output using the bzip2 algorithm")
    compress_group.add_argument(
        '--lzma', dest='lzma', action='store_true',
        help="compress output using the lzma algorithm")
    parser.add_argument(
        '--version', action='version', version='%(prog)s ' + __version__)
    args = parser.parse_args()

    # Argument sanity checks
    if (args.category and not args.map_files) or \
            (args.map_files and not args.category):
        parser.error("error: -m/--mapping and -c/--category must be supplied "
                     "together")

    if args.alignment_file == '-' and args.feature_file == '-':
        parser.error("error: standard input (stdin) can only be redirected to "
                     "a single positional argument")

    # Output run information
    all_args = sys.argv[1:]
    print("{} {!s}".format('count_features', __version__), file=sys.stderr)
    print(textwrap.fill("Command line parameters: {}"
                        .format(' '.join(all_args)), 79), file=sys.stderr)
    print("", file=sys.stderr)

    # Track program run-time
    start_time = time()

    # Assign variables based on user inputs
    if args.gzipped:
        compression = '.gz'
    elif args.bzipped:
        compression = '.bz2'
    elif args.lzma:
        compression = '.xz'
    else:
        compression = ''

    allowed_units = ["counts", "tpm", "custom", "prop", "fpk", "fpkm"]
    out_handles = {}
    for unit in args.norm:
        if unit not in allowed_units:
            print("warning: unknown metric of abundance '{}' provided to "
                  "-u/--unit. Please see the help message for a list of the "
                  "allowed units".format(unit), file=sys.stderr)
            continue

        outfile = "{}.{}.csv{}".format(args.outpref, unit, compression)
        try:
            out_h = open_io(outfile, mode='wb')
        except AttributeError:
            print("error: unable to write to '{}'".format(outfile),
                  file=sys.stderr)
            sys.exit(1)

        out_handles[unit] = out_h

    if not out_handles:
        print("error: no output files can be created. Please re-run with one "
              "or more of the accepted units of abundance", file=sys.stderr)
        sys.exit(1)

    overlap_mode = args.mode
    minaqual = args.minqual
    feature_type = args.ftype
    id_field = args.attr
    category_field = args.category
    are_transcripts = args.transcripts
    category_only = args.cat_only
    multi_aln = args.nonunique

    # CIGAR operations that count as aligned reference bases
    match_types = ('M', '=', 'X')

    if args.aformat == "sam":
        align_reader = HTSeq.SAM_Reader
    else:  # must be BAM then
        align_reader = HTSeq.BAM_Reader

    if args.map_files:
        mapping = load_dbs(args.map_files, fields=[category_field], csv=False)
    else:
        mapping = None

    # Store features in genomic arrays
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}

    # Iterate over GFF3 file, storing features to estimate coverage for
    no_attr = 0
    f_totals = 0
    ftype_totals = 0
    try:
        if args.feature_file == '-':
            gff = HTSeq.GFF_Reader(sys.stdin)
        else:
            gff = HTSeq.GFF_Reader(args.feature_file)

        for f in gff:
            f_totals += 1

            try:
                feature_id = f.attr[id_field]
            except KeyError:
                # Placeholder ID; entries with this prefix are never written
                no_attr += 1
                feature_id = "unkwn_{:08}".format(no_attr)

            # Skip features of wrong type
            if feature_type:
                if f.type == feature_type:
                    ftype_totals += 1
                else:
                    continue

            # Store feature length for normalization
            # NOTE(review): HTSeq's GFF reader is documented to yield 0-based,
            # end-exclusive intervals, in which case `end - start` is already
            # the length and the `+ 1` over-counts by one base. Left unchanged
            # pending confirmation.
            feature_length = abs(f.iv.end - f.iv.start + 1)

            features[f.iv] += feature_id  # for mapping alignments
            counts[feature_id] = {'count': 0, 'length': feature_length}
    except Exception:
        # `Exception` rather than a bare except so that Ctrl-C is not
        # reported as a malformed GFF3 file
        print("error: problem occured when processing GFF3 file at line {}".
              format(gff.get_line_number_string()), file=sys.stderr)
        sys.exit(1)

    # Verify GFF3 file contains features of the specified type
    if ftype_totals == 0:
        print("error: no features of type '{}' found.\n".format(feature_type),
              file=sys.stderr)
        sys.exit(1)

    if no_attr > 0:
        print("warning: found {!s} features without a '{}' attribute.\n"
              .format(no_attr, id_field), file=sys.stderr)

    # Check alignment file formatting
    try:
        if args.alignment_file == '-':
            # stdin cannot be re-read, so push the peeked read back in front
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq_file = align_reader(args.alignment_file)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
    except Exception:
        print("error: unable to read the alignment file. Please verify that "
              "the formatting is correct.", file=sys.stderr)
        sys.exit(1)

    pe_mode = first_read.paired_end  # reads are paired-end or single-end
    if pe_mode:
        if args.order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        else:  # order is by position
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                read_seq, max_buffer_size=args.buffer_size)

    # Iterate over alignment file
    empty = 0  # reads aligned somewhere in the assembly, but not to a feature
    duplicate = 0  # reads are duplicates of other reads
    ambiguous = 0  # reads overlapping more than one feature
    notaligned = 0  # unaligned reads
    lowqual = 0  # reads not passing minimum threshold for alignment quality
    nonunique = 0  # reads having multiple alignments with similar score
    r_totals = 0  # total reads
    aln_totals = 0  # correctly mapped to a feature
    fld = []  # fragment length / insert-size distribution
    for r in read_seq:
        r_totals += 1

        if not pe_mode:  # single-end read mapping
            # Check if read aligned
            if not r.aligned:
                notaligned += 1
                continue

            # Check if the read aligned uniquely
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    # Report the read's name; the message talks about a read,
                    # not about the reference sequence it landed on
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                # No NH tag present: treat the alignment as unique
                pass

            # Check if the alignment passed the quality requirement
            if r.aQual < minaqual:
                lowqual += 1
                continue

            # Check whether the read was marked as a duplicate
            if r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Store read coordinates
            iv_seq = (co.ref_iv for co in r.cigar
                      if co.type in match_types and co.size > 0)

        else:  # paired-end read mapping
            # Store pair coordinates
            try:
                first_r, second_r = r
            except ValueError:
                notaligned += 1
                continue

            # Pairs with a missing mate are counted as not aligned
            if first_r is None or second_r is None:
                notaligned += 1
                continue

            if first_r.aligned:
                iv_seq = (co.ref_iv for co in first_r.cigar
                          if co.type in match_types and co.size > 0)
            else:
                iv_seq = tuple()

            if second_r.aligned:
                iv_seq = itertools.chain(
                    iv_seq,
                    (co.ref_iv for co in second_r.cigar
                     if co.type in match_types and co.size > 0))
            elif not first_r.aligned:
                # Neither mate aligned
                notaligned += 1
                continue

            # Check whether either read aligned more than once
            try:
                if (first_r.optional_field("NH") > 1) or \
                        (second_r.optional_field("NH") > 1):
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(first_r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                # No NH tag present: treat the pair as uniquely aligned
                pass

            # Check if both reads passed the quality requirement
            if first_r.aQual < minaqual or second_r.aQual < minaqual:
                lowqual += 1
                continue

            # Check if the read pair was marked as a duplicate
            if first_r.pcr_or_optical_duplicate or \
                    second_r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Append fragment length/insert-size to distribution
            try:
                fld.append(first_r.inferred_insert_size)
            except AttributeError:
                pass

        # Handle case where reads might overlap more than one feature
        try:
            if overlap_mode == "union":
                fs = set()  # store feature names when reads align
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            else:  # intersection
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        if len(fs2) > 0 or \
                                overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)

            # If a read correctly mapped to a feature, increment its abundance
            if not fs:
                empty += 1
                continue
            elif len(fs) > 1:
                ambiguous += 1
                if not multi_aln:
                    continue
            else:
                aln_totals += 1

            for fsi in fs:
                counts[fsi]['count'] += 1

        except UnknownChrom:
            empty += 1

    unaln_totals = empty + ambiguous + lowqual + notaligned + nonunique + \
        duplicate
    nmapped = aln_totals + empty + ambiguous + nonunique + lowqual

    unkwn_feat = 0
    no_map = 0
    for unit in out_handles:
        # Set scaling function
        if unit == 'fpk':
            norm_method = scale_abundance_fpk
            scaling_factor = None
        elif unit == 'fpkm':
            norm_method = scale_abundance_fpkm
            # Scaling factor is all mapped reads
            scaling_factor = nmapped
            print("info: the total number of mapped reads used in calculation "
                  "of FPKM is {!s}.\n".format(nmapped), file=sys.stderr)
        elif unit in ('tpm', 'custom', 'prop'):
            # All three units are built on the sum of per-base count rates
            rate_sum = sum(counts[j]['count'] / counts[j]['length']
                           for j in counts)
            print("info: the sum of all counts per bp rates used in "
                  "estimating fragment proportions is {:.2f}.\n"
                  .format(rate_sum), file=sys.stderr)
            if unit == 'tpm':
                norm_method = scale_abundance_tpm
                # Scaling factor is sum of all reads per base rates
                scaling_factor = rate_sum
            elif unit == 'custom':
                norm_method = scale_abundance_prop
                scaling_factor = args.sfactor / rate_sum
            else:
                norm_method = scale_abundance_prop
                scaling_factor = 1 / rate_sum
        else:  # default is counts
            norm_method = scale_abundance_none
            scaling_factor = None

        if are_transcripts and not pe_mode:
            print("warning: unable to calculate effective length from single-end "
                  "reads. Will use sequence length instead.\n", file=sys.stderr)
            calc_length = return_first_arg
        elif are_transcripts and pe_mode:
            calc_length = compute_effective_length
        else:
            calc_length = return_first_arg

        out_h = out_handles[unit]

        # Abundance normalization
        abundances = {}
        unkwn_feat = 0
        no_map = 0
        for feature in counts:
            fcount = counts[feature]['count']
            flen = calc_length(counts[feature]['length'], fld)
            feature_abundance = norm_method(fcount, flen, scaling_factor)

            if not category_field:
                abundances[feature] = abundances.get(feature, 0) + \
                    feature_abundance
                continue

            # Map to higher order features
            try:
                # Ensure that feature has corresponding entry in database
                feature_map = mapping[feature]
            except KeyError:
                no_map += 1
                if not category_only:
                    # Keep all features, even the uncategorized ones
                    abundances[feature] = abundances.get(feature, 0) + \
                        feature_abundance
                continue

            try:
                # Ensure that entry has relevant category field
                category = feature_map[category_field]
            except KeyError:
                unkwn_feat += 1
                if not category_only:
                    abundances[feature] = abundances.get(feature, 0) + \
                        feature_abundance
                continue

            # Handle case where feature has more than one category, such
            # as if a protein sequence is assigned to more than one gene
            # family
            categories = category if isinstance(category, list) \
                else [category]
            for cat in categories:
                # Read and write under the same normalized key; looking up
                # the unstripped name would discard earlier contributions
                cat_key = cat.lstrip()
                abundances[cat_key] = abundances.get(cat_key, 0) + \
                    feature_abundance

        # "UNMAPPED" can be interpreted as a single unknown gene of length one
        # kilobase recruiting all reads that failed to map to input features
        #abundances['UNMAPPED'] = unaln_totals

        # Output abundances sorted by key name
        for fn in sorted(abundances):
            if not fn.startswith("unkwn_"):
                write_io(out_h, "{}\t{!s}\n".format(fn, abundances[fn]))

        out_h.close()

    # These tallies depend only on the features and the database, not on the
    # unit, so they are reported once rather than once per output table
    if unkwn_feat > 0:
        print("warning: found '{!s}' features without the '{}' field in the "
              "relational database.\n".format(unkwn_feat, category_field),
              file=sys.stderr)

    if no_map > 0:
        print("warning: found {!s} features without an entry in the "
              "relational database.\n".format(no_map), file=sys.stderr)

    # Output statistics
    print("Features processed:", file=sys.stderr)
    print(" - feature totals:\t{!s}".format(f_totals), file=sys.stderr)
    if feature_type:
        print(" - of relevant type:\t{!s}".format(ftype_totals),
              file=sys.stderr)
    print(" - unique features:\t{!s}".format(len(counts)), file=sys.stderr)
    print("Reads processed:", file=sys.stderr)
    print(" - read totals:\t{!s}".format(r_totals), file=sys.stderr)
    print(" - successfully mapped:\t{!s}".format(aln_totals),
          file=sys.stderr)
    if multi_aln:
        print(" - ambiguous alignment:\t{!s}".format(ambiguous),
              file=sys.stderr)
    print(" - unsuccessfully mapped:\t{!s}".format(unaln_totals),
          file=sys.stderr)
    print(" - no feature\t{!s}".format(empty), file=sys.stderr)
    if not multi_aln:
        print(" - ambiguous alignment\t{!s}".format(ambiguous),
              file=sys.stderr)
    print(" - too low alignment quality\t{!s}".format(lowqual),
          file=sys.stderr)
    print(" - not aligned\t{!s}".format(notaligned), file=sys.stderr)
    print(" - duplicate\t{!s}".format(duplicate), file=sys.stderr)
    print(" - alignment not unique\t{!s}".format(nonunique),
          file=sys.stderr)
    print("", file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to count {!s} fragments for {!s} features"
          .format(total_time, r_totals, f_totals), file=sys.stderr)
    print("", file=sys.stderr)
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count reads per GFF feature for one or more SAM/BAM files.

    Features of type ``feature_type`` are loaded from ``gff_filename`` and
    keyed by their ``id_attribute``.  Every alignment file is then scanned
    and each read (or read pair) is assigned to a feature, or to one of the
    special categories ``__no_feature``, ``__ambiguous``,
    ``__too_low_aQual``, ``__not_aligned`` and ``__alignment_not_unique``.
    The resulting table (one row per feature, one count column per input
    file) is printed to standard output; progress goes to standard error.

    Args:
        sam_filenames: list of alignment file paths; a single ``'-'`` reads
            from standard input.
        gff_filename: path of the feature annotation file.
        samtype: ``"sam"`` or ``"bam"``.
        order: ``"name"`` or ``"pos"``; selects how mates are paired.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature array;
            ``"reverse"`` flips the strand interpretation of the reads; any
            other value is treated as forward-stranded.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'`` counts only unambiguous assignments;
            ``'all'`` counts every overlapped feature.
        secondary_alignment_mode: ``'ignore'`` skips non-primary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type to count.
        id_attribute: GFF attribute used as the feature identifier.
        additional_attributes: extra GFF attributes echoed as columns.
        quiet: suppress progress messages when true.
        minaqual: minimum alignment quality; lower-quality reads are skipped.
        samouts: list of annotated SAM output paths (one per input), or an
            empty list for none.

    Raises:
        ValueError: on an unknown ``samtype`` or ``order``, a mismatch
            between the number of inputs and ``samouts``, a feature lacking
            ``id_attribute``, or a strandless feature in stranded mode.
    """

    def write_to_samout(r, assignment, samoutfile):
        # Append the assignment as an XF tag and echo the record(s); relies
        # on `pe_mode` from the enclosing scope to know whether `r` is a pair
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # try/finally guarantees the annotated output is flushed and closed
        # even when reading or processing the input raises
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files. `Exception` rather than a bare
                # except, so an interrupt is not mistaken for an empty file
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Push the peeked record back in front of the stream
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    primary_only = (
                        (supplementary_alignment_mode == 'ignore') and
                        (secondary_alignment_mode == 'ignore'))
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq,
                            primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq,
                            max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: treat the alignment as unique
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        # The second mate is handled with the opposite
                        # strand orientation from the first mate
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(
                                    r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and \
                                    r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and \
                                    r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: treat the pair as unique
                            pass
                        if ((r[0] is not None and r[0].aQual < minaqual) or
                                (r[1] is not None and
                                 r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode ==
                                             "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            # Sort so the tag is reproducible between runs;
                            # set iteration order is not
                            write_to_samout(
                                r,
                                "__ambiguous[" + '+'.join(sorted(fs)) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in fs:
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise

            if not quiet:
                sys.stderr.write(
                    "%d %s %s processed.\n" %
                    (i, samname,
                     "alignments " if not pe_mode else "alignment pairs"))
                sys.stderr.flush()
        finally:
            if samoutfile is not None:
                samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict for the
        # next input file
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Blank cells keep the special rows aligned with the attribute columns
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad +
                    [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad +
                    [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad +
                    [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad +
                    [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
def main():
    """Command-line entry point: assign read pairs to genomic region classes.

    Loads one BED file per region name (``<beddir>/<region>.bed``) into an
    ``HTSeq.GenomicArrayOfSets``, then walks the paired alignments of the
    input BAM file.  Each properly aligned pair is credited to the first
    region in the priority list that either mate overlaps.  Pairs with a
    missing mate, an unaligned mate, or mates on different chromosomes are
    tallied separately.  The per-region counts plus the summary rows
    ``singleton``, ``unmapped``, ``diff_chrom``, ``unassigned`` and
    ``total`` are written tab-separated to the output file.
    """
    parser = argparse.ArgumentParser(
        description='Assign reads to different genomic regions')
    parser.add_argument(
        '--input', '-i', type=str, required=True,
        help="Input bam file, should in hg38 coordinate, can either sorted by query name or coordiante")
    parser.add_argument(
        '--strandness', '-s', type=str, default="no",
        choices=["forward", "reverse", "no"])
    parser.add_argument(
        '--output', '-o', type=str, required=True,
        help="Output reads assignment")
    parser.add_argument(
        '--beddir', '-bd', type=str, default="genome/bed",
        help="Dir that contains bed files")
    parser.add_argument(
        '--priority', '-p', type=str,
        default="lncRNA,mRNA,snoRNA,snRNA,srpRNA,tRNA,tucpRNA,Y_RNA,pseudogene,exon,intron,antisense,promoter,enhancer,repeats")
    args = parser.parse_args()

    bam = HTSeq.BAM_Reader(args.input)
    # Order matters: earlier regions win when a pair overlaps several
    regions = args.priority.strip().split(",")

    print("Load genomic regions ...")
    ga = HTSeq.GenomicArrayOfSets("auto", stranded=(args.strandness != "no"))
    for region in regions:
        bed = args.beddir + "/" + region + ".bed"
        with open(bed, "r") as f:
            for line in f:
                line = line.strip()
                if line.startswith("#") or len(line) == 0:
                    continue
                fields = line.split("\t")
                # Columns 1-3 are chrom/start/end, column 6 is the strand
                chrom, start, end, strand = (
                    fields[0], int(fields[1]), int(fields[2]), fields[5])
                iv = HTSeq.GenomicInterval(chrom, start, end, strand=strand)
                ga[iv] += region
        print("{} loaded".format(region))
    print("Done .")

    stats = defaultdict(int)
    n_total_fragments = 0
    for read1, read2 in tqdm(
            HTSeq.pair_SAM_alignments_with_buffer(
                bam, max_buffer_size=5000000)):
        n_total_fragments += 1
        # ignore singletons
        if (read1 is None) or (read2 is None):
            stats['singleton'] += 1
            continue
        # ignore unmapped reads
        if not (read1.aligned and read2.aligned):
            stats['unmapped'] += 1
            continue
        if read1.iv.chrom != read2.iv.chrom:
            stats['diff_chrom'] += 1
            continue

        # Give both mates the same strand: the first mate's for 'forward',
        # the second mate's for 'reverse', none when unstranded
        if args.strandness == 'forward':
            read2.iv.strand = read1.iv.strand
        elif args.strandness == 'reverse':
            read1.iv.strand = read2.iv.strand
        else:
            read1.iv.strand = "."
            read2.iv.strand = "."

        # Union of every region class overlapped by either mate
        featureSet = set()
        for iv0, step_set in ga[read1.iv].steps():
            featureSet = featureSet.union(step_set)
        for iv0, step_set in ga[read2.iv].steps():
            featureSet = featureSet.union(step_set)

        # Credit only the highest-priority overlapped region
        for region in regions:
            if region in featureSet:
                stats[region] += 1
                break

    # Everything tallied so far (assigned regions plus the skipped
    # categories); plain integer sum, no need for a pandas Series
    n_assigned = sum(stats.values())
    stats["unassigned"] = n_total_fragments - n_assigned
    stats["total"] = n_total_fragments

    with open(args.output, "w") as f:
        for region in regions:
            print(region, stats[region], sep="\t", file=f)
        for each in [
                'singleton', 'unmapped', 'diff_chrom', 'unassigned', 'total'
        ]:
            print(each, stats[each], sep="\t", file=f)
def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count alignments per feature for the forward and/or reverse reading.

    The alignment file is read once. Every alignment (or mate pair) that
    passes the filters is looked up in ``features`` once per enabled strand
    reading, and one count file per enabled reading is written through
    ``brenninc_utils.create_new_file``. A short summary goes to stdout.

    Parameters
    ----------
    sam_filename : str
        Path of the alignment file, or ``"-"`` to read from stdin.
    features :
        Interval-indexed feature sets; must offer ``chrom_vectors`` and
        ``features[iv].steps()`` (presumably an ``HTSeq.GenomicArrayOfSets``
        -- confirm against the caller).
    counts : dict
        Template mapping feature id -> count. It is shallow-copied per
        reading, so the caller's mapping is not modified.
    samtype : str or None
        ``"sam"`` or ``"bam"``; ``None`` defers to ``detect_sam_type``.
    order : str
        ``"name"`` or ``"pos"``; only consulted for paired-end input.
    forward, reverse : bool
        Which strand readings to count.
    overlap_mode : str
        ``"union"``, ``"intersection-strict"`` or ``"intersection-nonempty"``.
    quiet : bool
        Suppress progress messages on stderr.
    minaqual : int
        Alignments with ``aQual`` below this value are skipped.
    samout : str
        Path of an annotated SAM output, or ``""`` for none.
    directory :
        Passed as ``outputdir`` to ``brenninc_utils.create_new_file``.

    Raises
    ------
    ValueError
        For an unknown ``samtype`` or an illegal ``order``.
    """
    # Enabled strand readings, forward first.
    directions = [name for name, enabled in (("forward", forward),
                                             ("reverse", reverse))
                  if enabled]

    def write_to_samout(r, assignment):
        """Append an XF:Z tag line for ``r`` (single read or mate pair)."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(alignment, flip):
        """Yield the reference interval of every non-empty 'M' operation."""
        for co in alignment.cigar:
            if co.type == "M" and co.size > 0:
                yield invert_strand(co.ref_iv) if flip else co.ref_iv

    def read_intervals(r, direction):
        """Return the intervals of ``r`` as seen by one strand reading.

        Forward keeps mate 1 and inverts mate 2; reverse does the opposite.
        Single-end reads are treated like mate 1.
        """
        flip_first = direction == "reverse"
        if not pe_mode:
            return match_intervals(r, flip_first)
        parts = []
        if r[0] is not None and r[0].aligned:
            parts.append(match_intervals(r[0], flip_first))
        if r[1] is not None and r[1].aligned:
            parts.append(match_intervals(r[1], not flip_first))
        return itertools.chain(*parts)

    def overlapping_features(iv_seq):
        """Combine the feature sets under ``iv_seq`` per ``overlap_mode``.

        Returns a set, or ``None`` when an intersection mode saw no usable
        step. Raises ``UnknownChrom`` for a chromosome missing in
        ``features``.
        """
        union = overlap_mode == "union"
        fs = set() if union else None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for iv2, fs2 in features[iv].steps():
                if union:
                    fs = fs.union(fs2)
                elif len(fs2) > 0 or overlap_mode == "intersection-strict":
                    if fs is None:
                        fs = fs2.copy()
                    else:
                        fs = fs.intersection(fs2)
        return fs

    def write_report(direction):
        """Write the count file and stdout summary of one strand reading."""
        title = direction.capitalize()
        direction_counts = counts_by_direction[direction]
        output = brenninc_utils.create_new_file(
            sam_filename, "_%s_count" % direction, outputdir=directory,
            extension="txt", gzipped=False)
        # NOTE(review): this counts every feature key, including features
        # with zero reads, although it is reported as "features with
        # alignment" -- confirm the intended meaning.
        used_features_count = 0
        used_features_sum = 0
        sys.stdout.write("%s written to %s\n" % (title, output))
        with open(output, "w") as output_file:
            for fn in sorted(direction_counts.keys()):
                output_file.write("%s\t%d\n" % (fn, direction_counts[fn]))
                used_features_count += 1
                used_features_sum += direction_counts[fn]
            output_file.write("__no_feature\t%d\n" % empty[direction])
            output_file.write("__ambiguous\t%d\n" % ambiguous[direction])
            output_file.write("__too_low_aQual\t%d\n" % lowqual)
            output_file.write("__not_aligned\t%d\n" % notaligned)
            output_file.write("__alignment_not_unique\t%d\n" % nonunique)
        sys.stdout.write("%s features with alignment\t%d\n"
                         % (title, used_features_count))
        sys.stdout.write("%s alignments asigned to feature\t%d\n"
                         % (title, used_features_sum))
        sys.stdout.write("__%s_no_feature\t%d\n"
                         % (direction, empty[direction]))
        sys.stdout.write("__%s_ambiguous\t%d\n"
                         % (direction, ambiguous[direction]))

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # The finally clause guarantees the annotated SAM output is closed even
    # when reading or counting fails.
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)
        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to learn whether the input is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the peeked record back.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            empty = dict.fromkeys(directions, 0)
            ambiguous = dict.fromkeys(directions, 0)
            counts_by_direction = dict(
                (name, copy.copy(counts)) for name in directions)
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0

            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n"
                        % (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                else:
                    first_ok = r[0] is not None and r[0].aligned
                    second_ok = r[1] is not None and r[1].aligned
                    # A pair is unaligned only when neither mate aligned.
                    if not first_ok and not second_ok:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode not in ("union", "intersection-strict",
                                            "intersection-nonempty"):
                        sys.exit("Illegal overlap mode.")

                    # Resolve every reading before tallying, so an unknown
                    # chromosome leaves all tallies untouched.
                    found = {}
                    for direction in directions:
                        found[direction] = overlapping_features(
                            read_intervals(r, direction))

                    for direction in directions:
                        fs = found[direction]
                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature")
                            empty[direction] += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]")
                            ambiguous[direction] += 1
                        else:
                            feature = list(fs)[0]
                            write_to_samout(r, feature)
                            counts_by_direction[direction][feature] += 1
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    # Only the enabled readings have a tally to update.
                    for direction in directions:
                        empty[direction] += 1
        except:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n"
                % read_seq_file.get_line_number_string())
            raise
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d SAM %s processed.\n"
            % (i, "alignments " if not pe_mode else "alignment pairs"))

    for direction in directions:
        write_report(direction)

    sys.stdout.write("__too_low_aQual\t%d\n" % lowqual)
    sys.stdout.write("__not_aligned\t%d\n" % notaligned)
    sys.stdout.write("__alignment_not_unique\t%d\n" % nonunique)
def _set_read_seq(
    self,
    supplementary_alignment_mode,
    secondary_alignment_mode,
    order,
    max_buffer_size,
):
    """
    Prepare the BAM/SAM file iterator.

    Note, only run this after _set_BAM_reader as you need
    self.read_seq_file to be set. This will create a parser and prepare an
    iterator for it. Depending on whether we have paired-end reads or not,
    different iterator will be returned.

    Sets ``self.pe_mode`` (taken from the first record, ``False`` when no
    record could be read) and ``self.read_seq`` (an iterable over single
    alignments, or over mate pairs in paired-end mode).

    Parameters
    ----------
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    order : str
        Can only be either 'pos' or 'name'. Sorting order of
        <alignment_file>. Only consulted in paired-end mode.
    max_buffer_size : int
        When <alignment_file> is paired end sorted by position, allow only
        so many reads to stay in memory until the mates are found (raising
        this number will use more memory). Has no effect for single end or
        paired end sorted by name.

    Raises
    ------
    ValueError
        In paired-end mode, when ``order`` is neither 'name' nor 'pos'.
    """
    read_seq_iter = iter(self.read_seq_file)

    # Catch empty BAM files. The reader may signal "no records" with
    # something other than StopIteration, so every ordinary exception is
    # still treated as "empty"; KeyboardInterrupt and SystemExit are no
    # longer swallowed.
    # FIXME: this can still hide real read errors.
    try:
        first_read = next(read_seq_iter)
        self.pe_mode = first_read.paired_end
    except Exception:
        first_read = None
        self.pe_mode = False

    if first_read is not None:
        # Push the peeked record back in front of the remaining ones.
        self.read_seq = itertools.chain([first_read], read_seq_iter)
    else:
        self.read_seq = []

    if self.pe_mode:
        # Mates are paired on primary alignments only when both kinds of
        # non-primary alignment are ignored.
        primary_only = (supplementary_alignment_mode == "ignore") and (
            secondary_alignment_mode == "ignore"
        )
        if order == "name":
            self.read_seq = HTSeq.pair_SAM_alignments(
                self.read_seq, primary_only=primary_only
            )
        elif order == "pos":
            self.read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                self.read_seq,
                max_buffer_size=max_buffer_size,
                primary_only=primary_only,
            )
        else:
            raise ValueError("Illegal order specified.")
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts, utr_tag):
    """Count reads per feature, giving UTR-tagged features priority.

    GFF records whose type equals ``feature_type`` or is listed in
    ``utr_tag`` are indexed as ``"<id>^<type>"``; records whose type is in
    ``utr_tag`` use the fixed type ``UTR``. Counts are kept per plain id.
    A read overlapping exactly two indexed features, exactly one of which is
    a UTR, is credited to the UTR instead of being called ambiguous. One
    table row per feature id (plus the ``__*`` summary rows) is printed to
    stdout, with one column per input file.

    Parameters
    ----------
    sam_filenames : list of str
        Alignment files; the single entry ``"-"`` means stdin.
    gff_filename : str
        Annotation file read with ``HTSeq.GFF_Reader``.
    samtype : str
        ``"sam"`` or ``"bam"``.
    order : str
        ``"name"`` or ``"pos"``; only consulted for paired-end input.
    max_buffer_size : int
        Forwarded to ``HTSeq.pair_SAM_alignments_with_buffer``.
    stranded : str
        ``"no"`` builds an unstranded index; ``"reverse"`` inverts the
        strand of mate 1 / single reads; any other value inverts mate 2.
    overlap_mode : str
        ``"union"``, ``"intersection-strict"`` or ``"intersection-nonempty"``.
    multimapped_mode : str
        ``"none"`` or ``"all"``.
    secondary_alignment_mode, supplementary_alignment_mode : str
        ``"ignore"`` skips the respective alignments.
    feature_type : str
        GFF feature type to count.
    id_attribute : str
        GFF attribute used as feature id.
    additional_attributes : list of str
        Extra GFF attributes echoed in the output table.
    quiet : bool
        Suppress progress messages on stderr.
    minaqual : int
        Alignments with ``aQual`` below this value are skipped.
    samouts : list of str
        One annotated SAM output path per input; any empty value (``[]``,
        ``""``, ``None``) disables the output.
    utr_tag : str
        Comma-separated GFF feature types to index as UTRs.

    Raises
    ------
    ValueError
        Unknown ``samtype``, mismatching number of inputs and outputs, a
        feature lacking ``id_attribute`` or strand, or an illegal ``order``.
    """

    def write_to_samout(r, assignment, samoutfile):
        """Tag ``r`` (read or mate pair) with XF and write its SAM line(s)."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    def pick_feature(fs):
        """Return the one tagged feature a read resolves to, else None.

        A single feature resolves to itself. Two features resolve to the
        UTR when exactly one of them is tagged ``UTR``. Anything else is
        ambiguous.
        """
        tagged = list(fs)
        if len(tagged) == 1:
            return tagged[0]
        if len(tagged) == 2:
            utrs = [t for t in tagged if t.split('^')[-1] == 'UTR']
            if len(utrs) == 1:
                return utrs[0]
        return None

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(
                    samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')
    tags = utr_tag.split(",")

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type or f.type in tags:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute"
                        % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                # Each indexed entry carries its feature type after a '^'
                # so UTRs can be told apart from other features later.
                if f.type in tags:
                    features[f.iv] += feature_id + '^' + 'UTR'
                else:
                    features[f.iv] += feature_id + '^' + f.type
                # Counts are keyed by the plain id, so the final table does
                # not show the type tag.
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes
                ]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        sys.stderr.write("Error occured when processing GFF file (%s):\n"
                         % gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of specified types found.\n")

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # The finally clause closes the annotated output even on failure.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files. Ordinary errors while peeking are
                # treated as "no records"; interrupts are not swallowed.
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    primary_only = (
                        (supplementary_alignment_mode == 'ignore') and
                        (secondary_alignment_mode == 'ignore'))
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq, primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq, max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n"
                            % (i, samname,
                               "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: treat the alignment as unique.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # Mate 2 gets the opposite strand treatment of
                            # mate 1.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            # Neither mate aligned.
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(
                                    r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and \
                                    r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and \
                                    r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode ==
                                             "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        else:
                            # Resolve once, so the samout tag and the count
                            # always name the same feature. This relies on
                            # the assumption that manually added 3' UTRs may
                            # overlap a downstream gene, in which case the
                            # UTR wins.
                            chosen = pick_feature(fs)
                            if chosen is None:
                                write_to_samout(
                                    r,
                                    "__ambiguous[" +
                                    '+'.join(sorted(fs)) + "]",
                                    samoutfile)
                                ambiguous += 1
                            else:
                                write_to_samout(r, chosen, samoutfile)

                            if multimapped_mode == 'none':
                                if chosen is not None:
                                    counts[chosen.split('^')[0]] += 1
                            elif multimapped_mode == 'all':
                                # counts is keyed by plain id: strip the
                                # type tag and credit each id once.
                                for gene_id in set(
                                        t.split('^')[0] for t in fs):
                                    counts[gene_id] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n"
                    % (samname, read_seq_file.get_line_number_string()))
                raise
        finally:
            if samoutfile is not None:
                samoutfile.close()

        if not quiet:
            sys.stderr.write(
                "%d %s %s processed.\n"
                % (i, samname,
                   "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        # Store this file's column, then reset the tallies for the next one.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells keep the summary rows aligned with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad +
                    [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad +
                    [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count how many reads of each alignment file fall on each feature.

    GFF records of type ``feature_type`` are indexed by their
    ``id_attribute``. Every alignment (or mate pair) passing the filters is
    assigned to the features it overlaps according to ``overlap_mode``. One
    table row per feature id (plus the ``__*`` summary rows) is printed to
    stdout, with one column per input file.

    Parameters
    ----------
    sam_filenames : list of str
        Alignment files; the single entry ``"-"`` means stdin.
    gff_filename : str
        Annotation file read with ``HTSeq.GFF_Reader``.
    samtype : str
        ``"sam"`` or ``"bam"``.
    order : str
        ``"name"`` or ``"pos"``; only consulted for paired-end input.
    max_buffer_size : int
        Forwarded to ``HTSeq.pair_SAM_alignments_with_buffer``.
    stranded : str
        ``"no"`` builds an unstranded index; ``"reverse"`` inverts the
        strand of mate 1 / single reads; any other value inverts mate 2.
    overlap_mode : str
        ``"union"``, ``"intersection-strict"`` or ``"intersection-nonempty"``.
    multimapped_mode : str
        ``"none"`` or ``"all"``.
    secondary_alignment_mode, supplementary_alignment_mode : str
        ``"ignore"`` skips the respective alignments.
    feature_type : str
        GFF feature type to count.
    id_attribute : str
        GFF attribute used as feature id.
    additional_attributes : list of str
        Extra GFF attributes echoed in the output table.
    quiet : bool
        Suppress progress messages on stderr.
    minaqual : int
        Alignments with ``aQual`` below this value are skipped.
    samouts : list of str
        One annotated SAM output path per input; any empty value (``""``,
        ``[]``, ``None``) disables the output.

    Raises
    ------
    ValueError
        Unknown ``samtype``, mismatching number of inputs and outputs, a
        feature lacking ``id_attribute`` or strand, or an illegal ``order``.
    """

    def write_to_samout(r, assignment, samoutfile):
        """Tag ``r`` (read or mate pair) with XF and write its SAM line(s)."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    # Any falsy value means "no annotated output"; an empty list used to be
    # mistaken for a request and failed further down.
    if samouts:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of SAM input and output files')
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute"
                        % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n"
            % gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        if samouts:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        # The finally clause closes the annotated output even on failure.
        try:
            # Peek at the first record to learn whether the input is
            # paired-end. An input without records counts as single-end
            # with zero reads instead of failing with StopIteration.
            try:
                if sam_filename != "-":
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                    read_seq = read_seq_file
                    first_read = next(iter(read_seq), None)
                else:
                    # stdin cannot be re-read: push the peeked record back.
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                    read_seq_iter = iter(read_seq_file)
                    first_read = next(read_seq_iter, None)
                    peeked = [] if first_read is None else [first_read]
                    read_seq = itertools.chain(peeked, read_seq_iter)
                pe_mode = (first_read is not None and
                           first_read.paired_end)
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of SAM/BAM file.\n")
                raise

            try:
                if pe_mode:
                    primary_only = (
                        (supplementary_alignment_mode == 'ignore') and
                        (secondary_alignment_mode == 'ignore'))
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                            read_seq, primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                            read_seq, max_buffer_size=max_buffer_size,
                            primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d SAM alignment record%s processed.\n"
                            % (i, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                                r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                                r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH tag: treat the alignment as unique.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if (co.type in com and co.size > 0))
                    else:
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # Mate 2 gets the opposite strand treatment of
                            # mate 1.
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar
                                     if co.type in com and co.size > 0))
                        else:
                            # Neither mate aligned.
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(
                                    r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and \
                                    r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and \
                                    r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                    (r[1] is not None and
                                     r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(
                                    r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                                (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                            (overlap_mode ==
                                             "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if fs is None or len(fs) == 0:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            write_to_samout(
                                r, "__ambiguous[" + '+'.join(fs) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, list(fs)[0], samoutfile)

                        if fs is not None and len(fs) > 0:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[list(fs)[0]] += 1
                            elif multimapped_mode == 'all':
                                for fsi in list(fs):
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing SAM input (%s):\n"
                    % read_seq_file.get_line_number_string())
                raise
        finally:
            if samoutfile is not None:
                samoutfile.close()

        if not quiet:
            sys.stderr.write(
                "%d SAM %s processed.\n"
                % (i, "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        # Store this file's column, then reset the tallies for the next one.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells keep the summary rows aligned with the attribute columns.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] +
                        [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad +
                    [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad +
                    [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad +
                    [str(c) for c in nonunique_all]))
def main():
    """Average read coverage in 100-position windows at exon boundaries.

    Command-line entry point. Coverage of the 'M' CIGAR blocks is
    accumulated separately for mate 1 and mate 2 of every properly paired
    fragment in the input BAM. For each ``exon`` record of the GTF a
    100-position window is taken around the exon's 5' end and around its
    3' end (windows of exons not on the "+" strand are reversed). Exons
    where at least one of the four windows has a mean coverage above
    ``--filter`` are summed, and the sums divided by the number of such
    exons are written as a tab-separated table to ``--coverage``. With
    ``--pdf`` the table is also handed to ``plotCoverage``.
    """
    cli = argparse.ArgumentParser(
        description='Assign reads to different genomic regions')
    cli.add_argument('--input', '-i', type=str, required=True,
                     help="Input bam file, should in hg38 coordinate")
    cli.add_argument('--strandness', '-s', type=str, default="no",
                     choices=["forward", "reverse", "no"])
    cli.add_argument(
        '--filter', '-f', type=int, default=3,
        help="Only consider exon with mean coverage higher than this value")
    cli.add_argument('--gtf', '-a', type=str,
                     default="genome/gtf/gencode.v27.annotation.gtf",
                     help="gtf annotation")
    cli.add_argument('--coverage', '-c', type=str, required=True,
                     help="Output coverage")
    cli.add_argument('--pdf', '-p', type=str, default=None,
                     help="Output coverage plot")
    opts = cli.parse_args()

    # One coverage track per mate; strand-aware only when a strandness
    # protocol was given.
    stranded = opts.strandness != "no"
    cov_mate1 = HTSeq.GenomicArray("auto", stranded=stranded)
    cov_mate2 = HTSeq.GenomicArray("auto", stranded=stranded)

    print("Load bam file ...")
    bam = HTSeq.BAM_Reader(opts.input)
    pairs = HTSeq.pair_SAM_alignments_with_buffer(
        bam, max_buffer_size=5000000)
    for mate1, mate2 in tqdm(pairs):
        # Keep only pairs where both mates exist, are aligned and sit on
        # the same chromosome.
        if mate1 is None or mate2 is None:
            continue
        if not mate1.aligned or not mate2.aligned:
            continue
        if mate1.iv.chrom != mate2.iv.chrom:
            continue
        mate1.iv.strand = "."
        mate2.iv.strand = "."
        for mate_label, mate, track in (("1", mate1, cov_mate1),
                                        ("2", mate2, cov_mate2)):
            for op in mate.cigar:
                if op.type == "M":
                    # checkStrandness is defined elsewhere; presumably it
                    # adjusts the interval's strand for this mate and
                    # protocol -- confirm against its definition.
                    track[checkStrandness(op.ref_iv, mate_label,
                                          opts.strandness)] += 1
    print("Done.")

    n_exons = 0
    print("Get coverage of exons in gtf annotation ...")
    sum_5p_mate1 = np.zeros(100)
    sum_5p_mate2 = np.zeros(100)
    sum_3p_mate1 = np.zeros(100)
    sum_3p_mate2 = np.zeros(100)
    with open(opts.gtf) as gtf:
        for raw in tqdm(gtf):
            record = raw.strip()
            if not record or record.startswith("#"):
                continue
            columns = record.split("\t")
            if columns[2] != "exon":
                continue
            chrom, strand = columns[0], columns[6]

            # On "+" the 5' end is the GTF start column; otherwise it is
            # the GTF end column.
            on_plus = strand == "+"
            if on_plus:
                pos_5p, pos_3p = int(columns[3]), int(columns[4])
            else:
                pos_5p, pos_3p = int(columns[4]), int(columns[3])
            window_5p = HTSeq.GenomicInterval(
                chrom, pos_5p - 1 - 50, pos_5p + 49, strand)
            window_3p = HTSeq.GenomicInterval(
                chrom, pos_3p - 1 - 50, pos_3p + 49, strand)
            if window_5p.start < 0 or window_3p.start < 0:
                continue

            # Reverse the profiles of exons that are not on "+".
            step = 1 if on_plus else -1
            cur_5p_mate1 = np.fromiter(cov_mate1[window_5p], dtype="i")[::step]
            cur_5p_mate2 = np.fromiter(cov_mate2[window_5p], dtype="i")[::step]
            cur_3p_mate1 = np.fromiter(cov_mate1[window_3p], dtype="i")[::step]
            cur_3p_mate2 = np.fromiter(cov_mate2[window_3p], dtype="i")[::step]

            covered = (cur_5p_mate1.mean() > opts.filter
                       or cur_5p_mate2.mean() > opts.filter
                       or cur_3p_mate1.mean() > opts.filter
                       or cur_3p_mate2.mean() > opts.filter)
            if covered:
                n_exons += 1
                sum_5p_mate1 += cur_5p_mate1
                sum_5p_mate2 += cur_5p_mate2
                sum_3p_mate1 += cur_3p_mate1
                sum_3p_mate2 += cur_3p_mate2
    print("Done .")

    table = pd.DataFrame({
        "read1-5p": sum_5p_mate1,
        "read1-3p": sum_3p_mate1,
        "read2-5p": sum_5p_mate2,
        "read2-3p": sum_3p_mate2
    })
    # Turn the sums into per-exon means.
    table = table / n_exons
    table.to_csv(opts.coverage, sep="\t")
    if opts.pdf is not None:
        plotCoverage(table, opts.pdf)
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, mapping_file, scale_method):
    """Count alignments per GFF feature and print the resulting abundances.

    Features of type ``feature_type`` are loaded from ``gff_filename`` and
    keyed by their ``id_attribute``; alignments are then read from
    ``sam_filename`` and assigned to features according to ``overlap_mode``.
    When ``mapping_file`` is given, per-feature counts are summed into the
    categories listed in that file before printing.

    Args:
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        gff_filename: Path of the GFF annotation.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"position"``; only consulted when the first
            read reports ``paired_end``.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        feature_type: GFF feature type (``f.type``) to count against.
        id_attribute: GFF attribute used as the feature ID; features
            lacking it are skipped.
        quiet: If true, suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this are counted as
            low quality and skipped.
        mapping_file: Optional tab-delimited file with exactly four columns
            per row (feature, category list, feature length, organism);
            the category list may be comma separated.
        scale_method: ``'none'`` uses raw counts; any other value passes
            the count and feature length to ``scale_abundance`` (defined
            elsewhere in this file).

    Returns:
        None. One ``name<TAB>value`` line per abundance is printed to
        stdout (sorted by name); summary statistics go to stderr.

    Raises:
        ValueError: Unknown ``samtype``, illegal ``order`` in paired-end
            mode, or a mapping-file row that does not have four columns.
    """
    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    # Try to open mapping file to fail early in case it is not there
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    # Feature has no usable ID: skip it (it is also not
                    # included in the processed-lines counter).
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:  # noqa: E722 - report the GFF position, then always re-raise
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
                         .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
                         .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError(
            "Unknown input format {} specified.".format(samtype))

    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            # Peek at the first record; the reader object itself is what
            # gets iterated afterwards.
            first_read = next(iter(read_seq))
        else:
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be re-read, so put the peeked record back.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:  # noqa: E722 - report, then always re-raise
        sys.stderr.write("Error occured when reading SAM/BAM file.\n")
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "{!s} SAM alignment record{} processed.\n"
                    .format(i, "s" if not pe_mode else " pairs"))
            i += 1

            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: treat the alignment as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                # Single-end intervals are strand-inverted before lookup.
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                # First mate is strand-inverted, second mate is used as is.
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq,
                        (co.ref_iv for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
                else:
                    # Neither mate aligned.
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if ((r[0] is not None
                         and r[0].optional_field("NH") > 1)
                            or (r[1] is not None
                                and r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual)
                        or (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict"
                      or overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # "nonempty" ignores steps without features;
                            # "strict" intersects with every step.
                            if (len(fs2) > 0
                                    or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reference sequence absent from the annotation.
                empty += 1
    except:  # noqa: E722 - report the input position, then always re-raise
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
                         .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "{!s} SAM {} processed.\n"
            .format(i, "alignments " if not pe_mode else "alignment pairs"))

    # map to higher order features if applicable
    if mapping_file:
        abundances = {}
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'"
                                     .format(mapping_file))
                    raise
                if feature not in counts:
                    continue
                if not feature_category:
                    # No category given: the feature is its own category.
                    feature_category = feature
                if scale_method == 'none':
                    abund = counts[feature]
                else:
                    abund = scale_abundance(counts[feature],
                                            int(feature_length))
                if ',' in feature_category:
                    for category in feature_category.split(','):
                        abundances[category] = (
                            abundances.get(category, 0) + abund)
                else:
                    abundances[feature_category] = (
                        abundances.get(feature_category, 0) + abund)

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                             "make sure the mapping file is formatted "
                             "correctly.\n")

        # NOTE(review): membership is tested against category names, so a
        # feature that was mapped to a differently named category is also
        # added to UNMAPPED here - confirm this is intended.
        for feature in counts:
            if feature not in abundances:
                abundances['UNMAPPED'] = (
                    abundances.get('UNMAPPED', 0) + counts[feature])
    else:
        abundances = counts

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty
                              + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances):
        print("{}\t{!s}".format(fn, abundances[fn]))

    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
def count_PE_reads(sam_files, labels, regions, file_type="sam",
                   use_chrom_name=False, order="name"):
    """Count fragments (paired-end read pairs) per region for each file.

    Args:
        sam_files: Alignment file paths, one per sample.
        labels: Sample labels; only checked to have the same length as
            ``sam_files``.
        regions: Object indexable by an alignment interval whose
            ``.steps()`` yields ``(interval, set_of_region_names)`` pairs
            (presumably an ``HTSeq.GenomicArrayOfSets`` - confirm with the
            caller). Not used when ``use_chrom_name`` is true.
        file_type: ``"sam"`` selects ``HTSeq.SAM_Reader``; any other value
            selects ``HTSeq.BAM_Reader``.
        use_chrom_name: If true, count pairs per reference sequence name
            instead of per overlapping region.
        order: ``"name"`` for name-sorted input; any other value pairs
            mates with the position-sorted buffered reader.

    Returns:
        A list with one ``collections.Counter`` per input file. Besides
        region (or chromosome) names it may hold the keys ``"_unmapped"``
        and ``"_no_feature"``.

    Raises:
        AssertionError: ``sam_files`` and ``labels`` differ in length.
    """
    assert len(sam_files) == len(labels)

    # Messages are formatted into a single string so that print(...) emits
    # the same text under both Python 2 and Python 3.
    if use_chrom_name:
        print("INFO: Running in mode for counting per chromosome name.")

    # One counter per input file, all starting empty.
    all_counts = [collections.Counter() for _ in range(len(sam_files))]

    for sam_file, file_counts in zip(sam_files, all_counts):
        print("INFO: Start to count reads in %s ..." % (sam_file,))

        if file_type == "sam":
            almnt_file = HTSeq.SAM_Reader(sam_file)
        else:
            almnt_file = HTSeq.BAM_Reader(sam_file)

        # Pair alignment records into (first mate, second mate) tuples.
        if order == "name":
            print("INFO: Assuming SAM/BAM file ordered by read name.")
            alignment_iterator = HTSeq.pair_SAM_alignments(almnt_file)
        else:
            print("INFO: Assuming SAM/BAM file ordered by position")
            alignment_iterator = HTSeq.pair_SAM_alignments_with_buffer(
                almnt_file, max_buffer_size=100 * 3000000)

        for first_almnt, second_almnt in alignment_iterator:
            # Both mates must be present and aligned.
            if (first_almnt is None or second_almnt is None
                    or not (first_almnt.aligned and second_almnt.aligned)):
                file_counts["_unmapped"] += 1
                continue

            if use_chrom_name:
                # Shortcut for references that are themselves the features
                # (e.g. transcript fragments): count by reference name.
                if first_almnt.iv.chrom == second_almnt.iv.chrom:
                    file_counts[first_almnt.iv.chrom] += 1
                else:
                    file_counts["_no_feature"] += 1
                continue

            # Collect the names of all regions overlapped by each mate.
            gene_ids_first = set()
            gene_ids_second = set()
            for _iv, val in regions[first_almnt.iv].steps():
                gene_ids_first |= val
            for _iv, val in regions[second_almnt.iv].steps():
                gene_ids_second |= val

            # Keep only the regions hit by both mates.
            gene_ids = gene_ids_first & gene_ids_second

            if not gene_ids:
                file_counts["_no_feature"] += 1
            else:
                # The pair is counted once for every common region, not
                # only when the region is unique.
                for gene_id in gene_ids:
                    file_counts[gene_id] += 1

    return all_counts
def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):
    """Tally splicing evidence from ``<infile>/<infile>.bam`` into count files.

    Read pairs are taken from ``HTSeq.pair_SAM_alignments_with_buffer``.
    Only pairs whose first mate is aligned with ``aQual > 5`` and whose
    start position (``iv.start`` on '+', ``iv.end`` otherwise) has no entry
    in ``targets`` are examined. For those, 'M' CIGAR operations collect
    overlapping target names ("premature" evidence) and 'N' operations
    flanked by 'M' operations longer than 3 collect target names as junction
    ("mature") evidence and are classified against ``fiveSS``/``threeSS``.

    Args:
        infile: Sample name; used both as directory and as file prefix.
        targets: Indexable by ``HTSeq.GenomicPosition`` (yielding a sized
            collection) and by interval (yielding an object with
            ``.steps()``); step values are sets of ``';'``-separated names
            whose first field is used as the gene and whose second field
            ranks the intron. Presumably an ``HTSeq.GenomicArrayOfSets`` -
            confirm with the caller.
        intron_set: Iterable of intron names; defines the rows written to
            the per-intron output files.
        fiveSS: Mapping from ``(chrom, position, strand)`` to an indexable
            value; index 0 is written under the "Gene" column and index 1
            must be convertible with ``int``.
        threeSS: Same layout as ``fiveSS``, looked up with the other end of
            the 'N' interval.
        Branches: Indexable by ``HTSeq.GenomicPosition`` of the second
            mate's start; yields a sized collection of intron names.
        Branchto3ss: Same layout as ``Branches``.

    Returns:
        None. Writes ``_splicing_counts.txt``,
        ``_concordant_splicing_counts.txt``, ``_lariat_int_counts.txt``,
        ``_branch_to3ss_counts.txt`` and ``_junction_counts.txt`` under
        ``<infile>/`` with the ``<infile>`` prefix.
    """
    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)

    def _top_ranked(names):
        # Name whose second ';'-separated field is largest.
        # NOTE(review): the fields are compared as strings, so "10" sorts
        # below "9"; kept as in the original - confirm whether a numeric
        # comparison was intended.
        return max(names, key=lambda name: name.split(';')[1])

    bam_path = '%s/%s.bam' % (infile, infile)
    for f, s in HTSeq.pair_SAM_alignments_with_buffer(
            HTSeq.BAM_Reader(bam_path)):
        if f is None or not f.aligned or not f.aQual > 5:
            continue

        # Strand-aware start position of the first mate.
        if f.iv.strand == '+':
            geneint = HTSeq.GenomicPosition(f.iv.chrom, f.iv.start,
                                            f.iv.strand)
        else:
            geneint = HTSeq.GenomicPosition(f.iv.chrom, f.iv.end,
                                            f.iv.strand)
        # Only reads starting outside every target are considered.
        if len(targets[geneint]) != 0:
            continue

        introns = set()
        junctions = set()
        for i, cigop in enumerate(f.cigar):
            if cigop.type == 'M':
                for _iv, val in targets[cigop.ref_iv].steps():
                    introns |= val
            elif cigop.type == 'N':
                # Require more than 3 matched bases on both sides of the gap.
                if not (f.cigar[i - 1].type == 'M'
                        and f.cigar[i - 1].size > 3
                        and f.cigar[i + 1].type == 'M'
                        and f.cigar[i + 1].size > 3):
                    continue
                for _iv, val in targets[cigop.ref_iv].steps():
                    junctions |= val

                chrom = cigop.ref_iv.chrom
                if cigop.ref_iv.strand == '+':
                    first = cigop.ref_iv.end
                    second = cigop.ref_iv.start + 1
                    junc_strand = "+"
                else:
                    first = cigop.ref_iv.start + 1
                    second = cigop.ref_iv.end
                    junc_strand = '-'

                five_key = (chrom, first, junc_strand)
                three_key = (chrom, second, junc_strand)
                if five_key in fiveSS and three_key in threeSS:
                    up = fiveSS[five_key]
                    down = threeSS[three_key]
                    if up[0] == down[0]:
                        if up[1] == down[1]:
                            junction_counts[(infile, up[0], int(up[1]),
                                             int(down[1]) + 1,
                                             "Constituitive")] += 1
                        else:
                            junction_counts[(infile, up[0], int(up[1]),
                                             int(down[1]) + 1,
                                             "Exon Skipping")] += 1
                # NOTE(review): the two branches below do not assign
                # `up`/`down`; they reuse whatever an earlier fully
                # annotated junction left behind (or raise NameError if
                # there was none). Behaviour kept as in the original
                # because the intended key is not derivable from here.
                elif five_key in fiveSS:
                    junction_counts[(infile, up[0], int(up[1]),
                                     int(down[1]) + 1,
                                     "Alternative 3'")] += 1
                elif three_key in threeSS:
                    junction_counts[(infile, up[0], int(up[1]),
                                     int(down[1]) + 1,
                                     "Alternative 5'")] += 1

        # Reduce each evidence set to its top-ranked name ('' when empty).
        intron = _top_ranked(introns) if len(introns) > 0 else ''
        junction = _top_ranked(junctions) if len(junctions) > 0 else ''

        # The same target as both intron and junction evidence cancels out.
        if junction == intron:
            intron = ''
            junction = ''
        # Otherwise keep only the one with the larger rank field (string
        # comparison, see _top_ranked).
        if junction and intron:
            if junction.split(';')[1] > intron.split(';')[1]:
                intron = ''
            else:
                junction = ''

        candidate_genes = set()
        for name in introns:
            candidate_genes.add(name.split(';')[0])
        for name in junctions:
            candidate_genes.add(name.split(';')[0])

        # Count only when all evidence points at a single gene.
        if len(candidate_genes) != 1:
            continue

        if junction:
            SI_counts[('mature', junction)] += 1
        if intron:
            SI_counts[('premature', intron)] += 1

        # The buffered pairing can deliver a pair without a second mate;
        # guard before touching its attributes.
        mate_ok = (s is not None and s.aligned and s.proper_pair
                   and s.aQual > 5)

        if f.proper_pair and mate_ok:
            if junction:
                SI_counts[('concordant_mature', junction)] += 1
            if intron:
                SI_counts[('concordant_premature', intron)] += 1

        # Counts starting position of read 2's that fall within specified
        # lariat intermediate and branch to 3'SS windows.
        # `intron` is a string, so test it for non-emptiness rather than
        # comparing it with 0 (a TypeError on Python 3).
        if intron and mate_ok:
            if s.iv.strand == '+':
                mate_pos = HTSeq.GenomicPosition(s.iv.chrom, s.iv.start,
                                                 s.iv.strand)
            else:
                mate_pos = HTSeq.GenomicPosition(s.iv.chrom, s.iv.end,
                                                 s.iv.strand)
            if (intron in Branches[mate_pos]
                    and len(Branches[mate_pos]) == 1):
                SI_counts[('lariat_int', intron)] += 1
            if (intron in Branchto3ss[mate_pos]
                    and len(Branchto3ss[mate_pos]) == 1):
                SI_counts[('branch_to3ss', intron)] += 1

    with open('%s/%s_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron,
                                        SI_counts[('mature', intron)],
                                        SI_counts[('premature', intron)]))

    with open('%s/%s_concordant_splicing_counts.txt' % (infile, infile),
              'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (
                intron,
                SI_counts[('concordant_mature', intron)],
                SI_counts[('concordant_premature', intron)]))

    with open('%s/%s_lariat_int_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron,
                                    SI_counts[('lariat_int', intron)]))

    with open('%s/%s_branch_to3ss_counts.txt' % (infile, infile),
              'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron,
                                    SI_counts[('branch_to3ss', intron)]))

    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write('%s\t%d\t%d\t%s\t%d\n' % (junc[1], junc[2], junc[3],
                                                junc[4],
                                                junction_counts[junc]))
def count_reads_single_file(
        isam,
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        ):
    """Count alignments per feature for one SAM/BAM input.

    Args:
        isam: Opaque identifier of this input; returned unchanged.
        sam_filename: Path of the alignment file, or ``"-"`` for stdin.
        features: Feature lookup exposing ``chrom_vectors`` and interval
            indexing with ``.steps()`` yielding sets of feature IDs.
        feature_attr: Iterable of feature IDs; each gets a zero count.
        order: ``"name"`` or ``"pos"``; consulted in paired-end mode only.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"reverse"`` inverts the strand of single-end reads and
            first mates; any other value inverts second mates instead.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``'none'``, ``'all'``, ``'fraction'`` or
            ``'random'``; decides how a read overlapping several features
            is counted, and with ``'none'`` reads whose ``NH`` tag exceeds
            1 are skipped.
        secondary_alignment_mode: ``'ignore'`` skips non-primary
            alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: Not used in this function.
        id_attribute: Not used in this function.
        additional_attributes: Not used in this function.
        quiet: If true, suppress progress messages on stderr.
        minaqual: Alignments with ``aQual`` below this are counted as low
            quality and skipped.
        samout_format: ``'SAM'``/``'sam'`` writes annotated SAM text; with
            ``'bam'``/``'BAM'`` the output is opened through pysam.
        samout_filename: Path for the annotated output, or ``None``.

    Returns:
        dict with keys ``'isam'``, ``'counts'``, ``'empty'``,
        ``'ambiguous'``, ``'lowqual'``, ``'notaligned'`` and
        ``'nonunique'``.

    Raises:
        ValueError: Illegal ``order`` in paired-end mode.
        SystemExit: Illegal ``overlap_mode`` or ``multimapped_mode``.
    """

    def write_to_samout(r, assignment, samoutfile, template=None):
        # Tag each read of `r` with XF:<assignment> and write it out.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r, )
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                samout_filename, 'wb',
                template=template,
            )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files. Only exhaustion of the iterator is treated
        # as "empty"; any other failure reaches the handler below instead
        # of silently producing zero counts.
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            first_read = None
            pe_mode = False
        if first_read is not None:
            # Put the peeked record back in front of the stream.
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except:  # noqa: E722 - report, then always re-raise
        sys.stderr.write(
            "Error occured when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')
    counts = {key: 0 for key in feature_attr}

    try:
        if pe_mode:
            primary_only = ((supplementary_alignment_mode == 'ignore') and
                            (secondary_alignment_mode == 'ignore'))
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                    read_seq,
                    primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment record%s processed.\n" %
                    (i, "s" if not pe_mode else " pairs"))
                sys.stderr.flush()

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(
                        r, "__not_aligned", samoutfile, template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                        r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                        r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        write_to_samout(
                            r, "__alignment_not_unique",
                            samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    # No NH tag: treat the alignment as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile, template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type in com and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if (co.type in com and co.size > 0))
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate gets the opposite strand treatment
                    # of the first mate.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar
                             if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar
                             if co.type in com and co.size > 0))
                else:
                    # Neither mate aligned.
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                            r, "__not_aligned", samoutfile, template)
                        notaligned += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and
                         r[0].optional_field("NH") > 1) or
                        (r[1] is not None and
                         r[1].optional_field("NH") > 1)):
                        nonunique += 1
                        write_to_samout(
                            r, "__alignment_not_unique",
                            samoutfile, template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                        (r[1] and r[1].aQual < minaqual)):
                    lowqual += 1
                    write_to_samout(
                        r, "__too_low_aQual", samoutfile, template)
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # "nonempty" ignores steps without features;
                            # "strict" intersects with every step.
                            if ((len(fs2) > 0) or
                                    (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                        r, "__no_feature", samoutfile, template)
                    empty += 1
                elif len(fs) > 1:
                    # Sorted so the tag does not depend on set ordering.
                    write_to_samout(
                        r, "__ambiguous[" + '+'.join(sorted(fs)) + "]",
                        samoutfile, template)
                    ambiguous += 1
                else:
                    write_to_samout(
                        r, list(fs)[0], samoutfile, template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in fs:
                            counts[fsi] += 1
                    elif multimapped_mode == 'fraction':
                        share = 1.0 / len(fs)
                        for fsi in fs:
                            counts[fsi] += share
                    elif multimapped_mode == 'random':
                        # random.choice needs a sequence; a set raises
                        # TypeError. Sorting also makes a seeded run
                        # reproducible.
                        fsi = random.choice(sorted(fs))
                        counts[fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")

            except UnknownChrom:
                # Reference sequence absent from the annotation.
                write_to_samout(
                    r, "__no_feature", samoutfile, template)
                empty += 1

    except:  # noqa: E722 - report the input position, then always re-raise
        sys.stderr.write(
            "Error occured when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise
    finally:
        # Close the annotated output on success and on failure alike.
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments " if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    return {
        'isam': isam,
        'counts': counts,
        'empty': empty,
        'ambiguous': ambiguous,
        'lowqual': lowqual,
        'notaligned': notaligned,
        'nonunique': nonunique,
    }
def main(): exe_parser = argparse.ArgumentParser() exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]') exe_parser.add_argument("-u", "--not_aligned", help="output reads that were not aligned, including those that were aligned multiple times(flat file).", type=str) exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str) exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].", type=str) exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true") exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str) exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str) exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).", type=int) exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).", type=int) exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).", type=int) exe_parser.add_argument("-c", "--max_clip", help="proportion of bases clipped from read for alignment. (default = 0.3).", type=float) exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).", type=str, choices=["y", "n", "reverse"], default="n") exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str) exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).", type=str) exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).", type=str) exe_parser.add_argument("-p", "--paired_end_mode", help="input is paired end sorted by name (n) or position (p) . 
(default = p).", type=str, choices=["p", "n"], default="p") exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str) args = exe_parser.parse_args() if args.paired_end_mode == 'p': paired_end = True pe_order = 'p' elif args.paired_end_mode == 'n': paired_end = True pe_order = 'n' if args.infile: try: if args.infile == '-': # get sam on a stream seqfile = HTSeq.SAM_Reader(sys.stdin) if args.paired_end_mode: # read_seq_iter = iter(seqfile) # first_read = read_seq_iter.next() # read_seq = itertools.chain([first_read], read_seq_iter) # reader = HTSeq.pair_SAM_alignments(read_seq) if pe_order == 'p': reader = HTSeq.pair_SAM_alignments_with_buffer(seqfile) elif pe_order == 'n': reader = HTSeq.pair_SAM_alignments(seqfile) # (read_seq) else: reader = seqfile elif args.infile != '-': seqfile = HTSeq.SAM_Reader(args.infile) if args.paired_end_mode: read_seq_iter = iter(seqfile) first_read = read_seq_iter.next() read_seq = itertools.chain([first_read], read_seq_iter) reader = HTSeq.pair_SAM_alignments(read_seq) if pe_order == 'p': reader = HTSeq.pair_SAM_alignments_with_buffer(reader) elif pe_order == 'n': reader = HTSeq.pair_SAM_alignments(reader) else: reader = seqfile # fread_seq_iter = iter(reader) # first_read = iter(read_seq).next() elif args.infile == '': print "no input file type given. exiting..." sys.exit(1) except: print "failed processing SAM/BAM file" raise elif not args.infile: print "no input file given. exiting..." sys.exit(1) if args.gff: gff_file = args.gff else: print "no gff file given. exiting..." 
sys.exit(1) if args.verbose: verbose = True else: verbose = False if args.min_read_length: min_read_len = args.min_read_length else: min_read_len = 60 # default read length if args.max_clip: max_clip_ = float(args.max_clip) else: max_clip_ = float(0.3) # default read length if args.min_id: min_id = float(args.min_id) else: min_id = float(80) if args.min_score: min_score = int(args.min_score) else: min_score = 0 if args.stranded == 'n': stranded = 'no' elif args.stranded == 'y': stranded = 'yes' elif args.stranded == 'reverse': stranded = 'reverse' if args.minaqual: minaqual = args.minaqual else: minaqual = 0 if args.idattr: id_attribute = args.idattr else: id_attribute = "GeneID" if args.type: feature_type = args.type else: feature_type = 'CDS' # ### # parse GFF file features, counts = gff_reader(gff_file, feature_type, id_attribute, verbose, stranded) # ### if args.samout: samoutfile = open(args.samout, "w") else: samoutfile = None if args.ambiguous_out: ambiguousfile = open(args.ambiguous_out, "w") else: ambiguousfile = None if args.fasta: fastafile = open(args.fasta, "w") else: fastafile = None if args.not_aligned: not_aligned_file = open(args.not_aligned, "w") else: not_aligned_file = None if args.out: outfile = open(args.out, "w") else: outfile = None # if outfile and samoutfile and ambiguousfile and fastafile and not_aligned_file == None: # print "None of the possible output file options specified. exiting..." # sys.exit(1) # ####### # decalre counter variables empty = 0 ambiguous = 0 notaligned = 0 lowqual = 0 nonunique = 0 # ####### read_counter = 0 for alignment in reader: # for alignment entry (line in fact) in sam file # iv_seq # print alignment if not paired_end: if read_counter % 1000000 == 0 and verbose: if verbose: print read_counter, 'non paired-end alignments processed' read_name = alignment.read.name # read = alignment.read # READ. 
Note that def invert_strand( iv ): read_seq = alignment.read.seq read_length = len(alignment.read.seq) if not alignment.aligned: # check if read is aligned to ref sequence if alignment is not None: notaligned += 1 if args.samout: write_to_samout(samoutfile, paired_end, alignment, "not_aligned") if args.not_aligned: not_aligned_file.write(read_name + '\t' + 'not_aligned' + '\n') # continue elif alignment.aligned: opt_fields = alignment.optional_fields # flag = alignment.flag cigar_string = parse_cigar(alignment.original_sam_line.split('\t')[ 5]) # just the cigar string without the fancy HTseq additions cigar_soft_clipped, cigar_m, cigar_insertions, cigar_deletions, cigar_insertions = parse_cigar_alignment(cigar_string) # get alignment data from cigar string score, md_matches, md_deletions, md_mismatches = parse_opt_fields( opt_fields) # get alignment data from md string percent_id = 100.0 * ( float(md_matches) / (float(read_length - cigar_soft_clipped + cigar_insertions + cigar_deletions))) if alignment[0] is not None: # check if read is aligned to ref sequence if alignment.optional_field("NH") > 1: # check if read is mapped more than once # By default these reads are discarded. CHANGE? if args.samout: write_to_samout(samoutfile, paired_end, alignment, "alignment_not_unique") nonunique += 1 if args.not_aligned: not_aligned_file.write(read_name + '\t' + 'alignment_not_unique' + '\n') # continue if alignment.aQual < minaqual: # check quality. 
default is 0 lowqual += 1 if args.samout: write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual") if args.not_aligned: not_aligned_file.write(read_name + '\t' + 'too_low_aQual' + '\n') # continue clipped = (float(cigar_soft_clipped) / float(read_length)) if read_length >= min_read_len: if (float(cigar_soft_clipped) / float(read_length)) <= max_clip_: if score >= args.min_score: if percent_id >= float(min_id): if stranded == "reverse": iv_seq = ( (invert_strand(cigar_operation.ref_iv) for cigar_operation in alignment[1].cigar if cigar_operation.type == "M" and cigar_operation.size > 0)) else: iv_seq = (cigar_operation.ref_iv for cigar_operation in alignment.cigar if cigar_operation.type == "M" and cigar_operation.size > 0) iv_seq_good = True # collects hits to chromosomes/features. """ cigarOperation in HTSeq: HTSeq.parse_cigar( "20M6I10M", 1000, "chr2", "+" ) #ref_iv == genomicInterval object of htSeq [< CigarOperation: 20 base(s) matched on ref iv chr2:[1000,1020)/+,query iv[0,20)>, < CigarOperation: 6 base(s) inserted on ref iv chr2:[1020,1020)/+,query iv[20,26)>,] """ # if args.fasta: # fastafile.write('>' + read_name + '\n' + read_seq + '\n') else: iv_seq_good = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, "percent_id_too_low=" + str(percent_id)) if args.not_aligned: not_aligned_file.write( read_name + '\t' + 'percent_id_too_low=' + str(percent_id) + '\n') else: iv_seq_good = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'alignment_score_too_low=' + str(score)) if args.not_aligned: not_aligned_file.write( read_name + '\t' + 'alignment_score_too_low=' + str(score) + '\n') else: iv_seq_good = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'too_many_bases_clipped_from_read=' + str(cigar_soft_clipped)) if args.not_aligned: not_aligned_file.write(read_name + '\t' + 'too_many_bases_clipped_from_read=' + str( cigar_soft_clipped) + '\n') elif paired_end: # print "read 
counter=", read_counter if read_counter % 100000 == 0 and verbose: if verbose: print read_counter, 'alignment pairs processed' if (alignment[0] is None) or not alignment[0].aligned: notaligned += 1 try: read_1_name = alignment[0].read.name except: read_1_name = 'None' if args.samout: write_to_samout(samoutfile, paired_end, alignment, "not_aligned") if args.not_aligned: not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n') elif (alignment[1] is None) or not alignment[1].aligned: notaligned += 1 try: read_2_name = alignment[1].read.name except: read_2_name = 'None' if args.samout: write_to_samout(samoutfile, paired_end, alignment, "not_aligned") if args.not_aligned: not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n') else: # else: read_1_name = alignment[0].read.name # read_1 = alignment[0].read #READ. read_1_length = len(alignment[0].read.seq) read_1_seq = alignment[0].read.seq read_2_name = alignment[1].read.name # read_2 = alignment[1].read #READ. # read_2_length = len(alignment[1].read.seq) read_2_seq = alignment[1].read.seq iv_seq = tuple() if (alignment[0] is not None) and alignment[0].aligned: # check if read is aligned to ref sequence opt_1_fields = alignment[0].optional_fields # flag_1 = alignment[0].flag cigar_1_string = parse_cigar(alignment[0].original_sam_line.split('\t')[ 5]) # just the cigar string without the fancy HTseq additions cigar_1_soft_clipped, cigar_1_m, cigar_1_insertions, cigar_1_deletions, cigar_1_insertions = parse_cigar_alignment( cigar_1_string) score_1, md_1_matches, md_1_deletions, md_1_mismatches = parse_opt_fields( opt_1_fields) # get alignment data from md string percent_1_id = (100.0 * ((float(md_1_matches) / ( float(read_1_length - cigar_1_soft_clipped + cigar_1_insertions + cigar_1_deletions))))) clipped_1 = (float(cigar_1_soft_clipped) / float(read_1_length)) if int(read_1_length) >= int(min_read_len): if (float(cigar_1_soft_clipped) / float(read_1_length)) <= float(max_clip_): # if int(score_1) >= 
int(args.min_score): if int(score_1) >= int(min_score): # if float(percent_1_id) >= float(args.min_id): if float(percent_1_id) >= float(min_id): if stranded == "reverse": iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for cigar_operation in alignment[0].cigar if cigar_operation.type == "M" and cigar_operation.size > 0)) else: iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in alignment[0].cigar if cigar_operation.type == "M" and cigar_operation.size > 0)) # if args.fasta: # fastafile.write('>' + read_1_name + '\n' + read_1_seq + '\n') iv_seq_good_1 = True else: iv_seq_good_1 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, "percent_id_too_low=" + str(percent_1_id)) if args.not_aligned: not_aligned_file.write( read_1_name + '\t' + 'percent_id_too_low=' + str(percent_1_id) + '\n') else: iv_seq_good_1 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'alignment_score_too_low=' + str(score_1)) if args.not_aligned: not_aligned_file.write( read_1_name + '\t' + 'alignment_score_too_low=' + str(score_1) + '\n') else: iv_seq_good = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'too_many_bases_clipped_from_read=' + str(cigar_1_soft_clipped)) if args.not_aligned: not_aligned_file.write(read_1_name + '\t' + 'too_many_bases_clipped_from_read=' + str( cigar_1_soft_clipped) + '\n') # else: # iv_seq = tuple() if (alignment[1] is not None) and alignment[1].aligned: # check if read is aligned to ref sequence opt_2_fields = alignment[1].optional_fields # flag_2 = alignment[1].flag # ', #'bit_length', 'conjugate', 'denominator', 'imag', 'numerator', 'real'] cigar_2_string = parse_cigar(alignment[1].original_sam_line.split('\t')[ 5]) # just the cigar string without the fancy HTseq additions cigar_2_soft_clipped, cigar_2_m, cigar_2_insertions, cigar_2_deletions, cigar_2_insertions = parse_cigar_alignment( cigar_2_string) score_2, md_2_matches, 
md_2_deletions, md_2_mismatches = parse_opt_fields( opt_2_fields) # get alignment data from md string read_2_name = alignment[1].read.name read_2_length = len(alignment[1].read.seq) # read_2 = alignment[1].read # READ. read_2_seq = alignment[1].read.seq percent_2_id = (100.0 * (float(md_2_matches) / ( float(read_2_length - cigar_2_soft_clipped + cigar_2_insertions + cigar_2_deletions)))) clipped_2 = (float(cigar_2_soft_clipped) / float(read_2_length)) if int(read_2_length) >= int(min_read_len): if (float(cigar_2_soft_clipped) / float(read_2_length)) <= float(max_clip_): if int(score_2) >= int(min_score): if float(percent_2_id) >= float(min_id): if stranded == "reverse": iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for cigar_operation in alignment[1].cigar if cigar_operation.type == "M" and cigar_operation.size > 0)) else: iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in alignment[1].cigar if cigar_operation.type == "M" and cigar_operation.size > 0)) iv_seq_good_2 = True try: if (alignment[0].optional_field("NH") > 1) or (alignment[1].optional_field( "NH") > 1): # or (alignment[1].optional_field("NH") > 1): #check if read is mapped more # than once # By default these reads are discarded. CHANGE? iv_seq_good_1 = False iv_seq_good_2 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, "alignment_not_unique") nonunique += 1 if args.not_aligned: not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n') not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n') continue except KeyError: pass if (alignment[0] and alignment[0].aQual < minaqual) or (alignment[1] and alignment[1].aQual < minaqual): # check quality. 
default is 0 iv_seq_good_2 = False lowqual += 1 if args.samout: write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual") if args.not_aligned: not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n') not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n') continue else: iv_seq_good_2 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, "percent_id_too_low=" + str(percent_2_id)) if args.not_aligned: not_aligned_file.write( read_2_name + '\t' + 'percent_id_too_low=' + str(percent_2_id) + '\n') else: iv_seq_good_2 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'alignment_score_too_low=' + str(score_2)) if args.not_aligned: not_aligned_file.write( read_2_name + '\t' + 'alignment_score_too_low=' + str(score_2) + '\n') else: iv_seq_good_2 = False if args.samout: write_to_samout(samoutfile, paired_end, alignment, 'too_many_bases_clipped_from_read=' + str(cigar_2_soft_clipped)) if args.not_aligned: not_aligned_file.write(read_2_name + '\t' + 'too_many_bases_clipped_from_read=' + str( cigar_2_soft_clipped) + '\n') read_counter += 1 """ overlap_mode == "union" will count a hit even if read is mapped across an intron or there is an insertion. """ try: feature_set = set() for iv in iv_seq: # print iv if iv.chrom not in features.chrom_vectors: # check if alignment feaure name in features from GFF file # The name of a sequence (i.e., chromosome, contig, or the like). # check the gff features dictionary raise UnknownChrom for iv2, fs2 in features[iv].steps(): # fs == feature steps. 
""" from HTseq manual: GenomicArray objects use by default so-called StepVectors that store the data internally in steps of constant value """ feature_set = feature_set.union(fs2) # print feature_set if feature_set is None or len(feature_set) == 0: if args.samout: write_to_samout(samoutfile, paired_end, alignment, "no_feature") if args.not_aligned: not_aligned_file.write('None' + '\t' + 'no_feature' + '\n') empty += 1 elif len(feature_set) > 1: if args.samout: write_to_samout(samoutfile, paired_end, alignment, "ambiguous[" + '+'.join(feature_set) + "]") if ambiguousfile: if paired_end: if iv_seq_good_1: ambiguousfile.write('>' + read_1_name + '_' + "ambiguous[" + '+'.join( feature_set) + "]" + '_clipped_' + str(clipped_1) + '_score_' + str(score_2) + '_percent_id_' + str(percent_1_id) + '\n' + read_1_seq + '\n') if iv_seq_good_2: ambiguousfile.write('>' + read_2_name + '_' + "ambiguous[" + '+'.join( feature_set) + "]" + '_clipped_' + str(clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n') else: if iv_seq_good: ambiguousfile.write('>' + alignment.read.name + '_' + "ambiguous[" + '+'.join( feature_set) + "]" + '_clipped_' + str(clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n') """ #if args.not_aligned: # if paired_end: # not_aligned_file.write(alignment[0].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n') # not_aligned_file.write(alignment[1].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n') # else: # not_aligned_file.write(alignment.read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n') """ ambiguous += 1 elif len(feature_set) == 1: if args.samout: write_to_samout(samoutfile, paired_end, alignment, list(feature_set)[0]) if args.fasta: if paired_end: if iv_seq_good_1: fastafile.write('>' + read_1_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str( clipped_1) + '_score_' + str(score_1) + '_percent_id_' + 
str(percent_1_id) + '\n' + read_1_seq + '\n') if iv_seq_good_2: fastafile.write('>' + read_2_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str( clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n') else: if iv_seq_good: fastafile.write('>' + read_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str( clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n') counts[list(feature_set)[0]] += 1 except: if args.samout: write_to_samout(samoutfile, paired_end, alignment, "__no_feature") empty += 1 # if not paired_end: # al = alignment # else: # al = alignment[0] if alignment[0] is not None else alignment[1] # if args.not_aligned: # not_aligned_file.write(al.read.name + '\t' + 'feature_not_in_gff_file' + '\n') # if not verbose: # print (("Warning: Skipping read '%s', because chromosome " + # "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % # (al.read.name, iv.chrom) ) print 'total', read_counter, 'alignments processed' if samoutfile is not None: samoutfile.close() if fastafile is not None: fastafile.close if not_aligned_file is not None: not_aligned_file.close() if outfile is not None: for feature in sorted(counts.keys()): outfile.write("%s\t%d\n" % (feature, counts[feature])) outfile.write("no_feature\t%d\n" % empty) outfile.write("ambiguous\t%d\n" % ambiguous) outfile.write("too_low_aQual\t%d\n" % lowqual) outfile.write("not_aligned\t%d\n" % notaligned) outfile.write("alignment_not_unique\t%d\n" % nonunique) if outfile is not None: outfile.close()
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout, include_non_annotated=False,
                            htseq_no_ambiguous=True):
    """
    Annotate the alignments of a SAM/BAM file with the features of a GFF file
    and write the annotated records (tag XF) to a new SAM/BAM file.

    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    :param sam_filename: path of the input SAM/BAM file
    :param gff_filename: path of the GFF annotation file
    :param samtype: format of the input and output files, "sam" or "bam"
    :param order: "name" or "pos"; only consulted for paired-end input
    :param stranded: "no" builds an unstranded feature array, "reverse"
                     inverts the strand of the alignments; any other value
                     is handled as plain stranded
    :param overlap_mode: "union", "intersection-strict" or
                         "intersection-nonempty"
    :param feature_type: GFF feature type (column 3) to use
    :param id_attribute: GFF attribute used as the feature identifier
    :param quiet: not used by this function; kept for signature
                  compatibility
    :param minaqual: alignments with a lower aQual are tagged
                     "__too_low_aQual"
    :param samout: path of the annotated SAM/BAM file to create
    :param include_non_annotated: when True, records assigned "__no_feature"
                                  are written to the output as well
    :param htseq_no_ambiguous: when True, records assigned "__ambiguous[...]"
                               are not written to the output
    :return: the number of records written to the output file
    :raises ValueError: unknown samtype, illegal order (paired-end input),
                        a feature without the id attribute, or a feature
                        without strand in stranded mode
    :raises RuntimeError: no features of the requested type, the input file
                          cannot be read, or illegal overlap mode

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    The full text of the GNU General Public License, version 3,
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Fail fast: reject an unsupported format before the output file is
    # created/truncated and before the (possibly large) GFF file is parsed.
    if samtype not in ("sam", "bam"):
        raise ValueError("Unknown input format %s specified." % samtype)

    # Set up the filters.
    # NOTE: the state shared with the nested writer is kept in function
    # attributes (as before) so the counter can be updated from the nested
    # function without 'nonlocal', which Python 2 does not have.
    count_reads_in_features.filter_htseq = \
        ["__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM output file, using the input file as header template
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
    finally:
        # The template is only needed while the output file is created
        saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(r, assignment):
        if assignment in count_reads_in_features.filter_htseq:
            return
        if count_reads_in_features.filter_htseq_no_ambiguous \
                and "__ambiguous" in assignment:
            return
        # pe_mode is resolved when the function is called (it is assigned
        # below, before the first call)
        reads = r if pe_mode else (r, )
        for read in reads:
            if read is None:
                continue
            sam_record = read.to_pysam_AlignedRead(
                count_reads_in_features.samoutfile)
            sam_record.set_tag("XF", assignment, "Z")
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1

    # Reference intervals of the non-empty match ("M") CIGAR operations,
    # optionally with the strand inverted
    def match_intervals(cigar, flip):
        for co in cigar:
            if co.type == "M" and co.size > 0:
                yield invert_strand(co.ref_iv) if flip else co.ref_iv

    reverse = stranded == "reverse"

    # From here on the output file is open: make sure it is closed
    # whatever happens (also when the annotation or the input are invalid).
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        found_features = False
        for f in HTSeq.GFF_Reader(gff_filename):
            if f.type != feature_type:
                continue
            try:
                feature_id = f.attr[id_attribute]
            except KeyError:
                raise ValueError("Feature %s does not contain a '%s' attribute"
                                 % (f.name, id_attribute))
            if stranded != "no" and f.iv.strand == ".":
                raise ValueError(
                    "Feature %s at %s does not have strand information but you are "
                    "running htseq-count in stranded mode. Use '--stranded=no'."
                    % (f.name, f.iv))
            features[f.iv] += feature_id
            found_features = True

        if not found_features:
            raise RuntimeError("No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        else:
            SAM_or_BAM_Reader = HTSeq.BAM_Reader

        # Peek at the first record to find out if the data is paired-end
        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
            first_read = next(iter(read_seq))
            pe_mode = first_read.paired_end
        except Exception:
            raise RuntimeError(
                "Error occurred when reading beginning of SAM/BAM file.")

        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")

        for r in read_seq:
            if not pe_mode:
                if not r.aligned:
                    write_to_samout(r, "__not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    # No NH tag: the alignment is handled as unique
                    pass
                if r.aQual < minaqual:
                    write_to_samout(r, "__too_low_aQual")
                    continue
                iv_seq = match_intervals(r.cigar, reverse)
            else:
                mate1, mate2 = r[0], r[1]
                mate1_aligned = mate1 is not None and mate1.aligned
                if mate1_aligned:
                    iv_seq = match_intervals(mate1.cigar, reverse)
                else:
                    iv_seq = tuple()
                if mate2 is not None and mate2.aligned:
                    # The second mate is on the opposite strand, so its
                    # inversion rule is the opposite of the first mate's
                    iv_seq = itertools.chain(
                        iv_seq, match_intervals(mate2.cigar, not reverse))
                elif not mate1_aligned:
                    write_to_samout(r, "__not_aligned")
                    continue
                try:
                    if (mate1 is not None and mate1.optional_field("NH") > 1) \
                            or (mate2 is not None and mate2.optional_field("NH") > 1):
                        write_to_samout(r, "__alignment_not_unique")
                        continue
                except KeyError:
                    # No NH tag: the alignment is handled as unique
                    pass
                if (mate1 and mate1.aQual < minaqual) \
                        or (mate2 and mate2.aQual < minaqual):
                    write_to_samout(r, "__too_low_aQual")
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            # Empty steps only count in strict mode
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    # Sorted so that the tag does not depend on set ordering
                    write_to_samout(r, "__ambiguous[" + '+'.join(sorted(fs)) + "]")
                else:
                    write_to_samout(r, next(iter(fs)))
            except UnknownChrom:
                write_to_samout(r, "__no_feature")
    finally:
        count_reads_in_features.samoutfile.close()

    return count_reads_in_features.annotated