def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Pair up SAM alignments and hand every usable pair to *counter*.

    read_seq -- iterable of single alignments; it is wrapped with
                HTSeq.pair_SAM_alignments (order "name") or
                HTSeq.pair_SAM_alignments_with_buffer (order "pos").
    counter  -- object providing not_aligned(pair), non_unique(pair),
                too_low_quality(pair), forward_count(ivs, pair) and
                reverse_count(ivs, pair).
    order    -- "name" or "pos"; anything else raises ValueError.
    quiet    -- when true, no progress messages are written to stderr.
    minaqual -- pairs with a mate whose aQual is below this are reported
                through counter.too_low_quality and not counted.
    """
    if order == "name":
        pairs = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        pairs = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    processed = 0
    for pair in pairs:
        # Progress report every 100000 pairs (never before the first one).
        if processed > 0 and processed % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % (processed))
        processed += 1

        first = pair[0]
        second = pair[1]
        first_usable = first is not None and first.aligned

        # Intervals of the first mate: as-is for the forward count,
        # strand-inverted for the reverse count.
        if first_usable:
            fwd_ivs = (op.ref_iv for op in first.cigar
                       if op.type == "M" and op.size > 0)
            rev_ivs = (invert_strand(op.ref_iv) for op in first.cigar
                       if op.type == "M" and op.size > 0)
        else:
            fwd_ivs = tuple()
            rev_ivs = tuple()

        # The second mate contributes with the opposite orientation.
        if second is not None and second.aligned:
            mate_inverted = (invert_strand(op.ref_iv) for op in second.cigar
                             if op.type == "M" and op.size > 0)
            fwd_ivs = itertools.chain(fwd_ivs, mate_inverted)
            mate_plain = (op.ref_iv for op in second.cigar
                          if op.type == "M" and op.size > 0)
            rev_ivs = itertools.chain(rev_ivs, mate_plain)
        elif not first_usable:
            # Neither mate is aligned.
            counter.not_aligned(pair)
            continue

        # A missing NH tag (KeyError) means "treat as unique".
        try:
            multi_mapped = (
                (first is not None and first.optional_field("NH") > 1) or
                (second is not None and second.optional_field("NH") > 1))
            if multi_mapped:
                counter.non_unique(pair)
                continue
        except KeyError:
            pass

        if ((first and first.aQual < minaqual) or
                (second and second.aQual < minaqual)):
            counter.too_low_quality(pair)
            continue

        counter.forward_count(fwd_ivs, pair)
        counter.reverse_count(rev_ivs, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % (processed))
def count_reads_paired(read_seq, counter, order, stranded, quiet, minaqual,
                       write_to_samout):
    """Pair up SAM alignments and count every usable pair via *counter*.

    read_seq        -- iterable of single alignments; wrapped with
                       HTSeq.pair_SAM_alignments (order "name") or
                       HTSeq.pair_SAM_alignments_with_buffer (order "pos").
    counter         -- object whose ``notaligned``, ``nonunique`` and
                       ``lowqual`` attributes are incremented for rejected
                       pairs and whose ``count(iv_seq, pair)`` method is
                       called for accepted ones.
    order           -- "name" or "pos"; anything else raises ValueError.
    stranded        -- only compared against "reverse": in that case the
                       first mate's intervals are strand-inverted and the
                       second mate's are used as-is; otherwise vice versa.
    quiet           -- when true, no progress messages go to stderr.
    minaqual        -- minimum aQual required of every present mate.
    write_to_samout -- callable ``(pair, assignment_string)`` invoked for
                       every rejected pair.

    Raises ValueError for an unknown *order*.
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    invert_first = stranded == "reverse"
    i = 0
    for r in read_seq:
        # This function only ever handles pairs, so the message is fixed
        # (the original consulted an undefined ``pe_mode`` here).
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % i)
        i += 1

        first, second = r[0], r[1]
        first_usable = first is not None and first.aligned

        if first_usable:
            if not invert_first:
                iv_seq = (co.ref_iv for co in first.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in first.cigar
                          if co.type == "M" and co.size > 0)
        else:
            iv_seq = tuple()

        if second is not None and second.aligned:
            # The second mate always gets the opposite treatment.
            if not invert_first:
                mate_ivs = (invert_strand(co.ref_iv) for co in second.cigar
                            if co.type == "M" and co.size > 0)
            else:
                mate_ivs = (co.ref_iv for co in second.cigar
                            if co.type == "M" and co.size > 0)
            iv_seq = itertools.chain(iv_seq, mate_ivs)
        elif not first_usable:
            write_to_samout(r, "__not_aligned")
            counter.notaligned += 1
            continue

        # A missing NH tag (KeyError) means "treat as unique".
        try:
            if ((first is not None and first.optional_field("NH") > 1) or
                    (second is not None and
                     second.optional_field("NH") > 1)):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            pass

        if ((first and first.aQual < minaqual) or
                (second and second.aQual < minaqual)):
            # Tallied on the counter like the other rejection categories;
            # the original incremented an unbound local ``lowqual``.
            # NOTE(review): assumes the counter exposes ``lowqual`` next to
            # ``notaligned``/``nonunique`` -- confirm against its class.
            counter.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue

        counter.count(iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
def parse_fastx_sam_parallel(fastx_infile, sam_infile):
    """ Parse fastx and resulting sam file in parallel - generator yielding (name, seq, alignment_list) tuples.

    The sam file may contain multiple alignments per read.  Program checks that the readnames match.

    Read names are compared on their first whitespace-separated token only.
    Raises DeepseqError if one file runs out before the other, or if the
    read names at the same position in the two files differ.
    """
    fastx_generator = basic_seq_utilities.name_seq_generator_from_fasta_fastq(fastx_infile)
    sam_generator = iter(HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(sam_infile)))
    if_finished_fastx, if_finished_sam = False, False
    while True:
        # next() builtin instead of the Python-2-only .next() method
        try:
            name, seq = next(fastx_generator)
        except StopIteration:
            if_finished_fastx = True
        try:
            alns = next(sam_generator)
        except StopIteration:
            if_finished_sam = True
        # if both finished, good, we're done
        if if_finished_fastx and if_finished_sam:
            # A plain return ends the generator; raising StopIteration here
            # is turned into RuntimeError by PEP 479 (Python 3.7+).
            return
        # if one file was finished but the other wasn't, error!
        elif if_finished_fastx or if_finished_sam:
            raise DeepseqError("Parsing seq/aln files in parallel - inconsistent finished states!  "
                               +"(If finished: %s %s, %s %s)"%(fastx_infile, if_finished_fastx, sam_infile, if_finished_sam))
        # if all the files still contained data, yield it
        else:
            name = name.split()[0]
            name2 = alns[0].read.name.split()[0]
            if not name2 == name:
                # arguments ordered as "<read name> in <file>" to match the message template
                raise DeepseqError("Non-matching readnames between files! %s in %s, %s in %s"%(name, fastx_infile,
                                                                                              name2, sam_infile))
            yield (name, seq, alns)
def HTseq_count(bam_file, gtf_file, out_dir, identifier, parallel = True ):
    """Count uniquely aligned read pairs per feature and print the table.

    bam_file   -- path handed to HTSeq.SAM_Reader.
                  NOTE(review): despite the parameter name the file is read
                  with SAM_Reader, not BAM_Reader -- confirm the input format.
    gtf_file   -- path handed to HTSeq.GFF_Reader.
    out_dir    -- not used by this function.
    identifier -- GFF attribute whose value labels each feature.
    parallel   -- not used by this function.

    Every feature in the GTF is added (no feature-type filter).  Pairs are
    tallied under the single feature they overlap, or under "_no_feature",
    "_ambiguous" or "_unmapped".  One "<key> <count>" line per tally is
    printed to stdout; nothing is returned.
    """
    gtf_file = HTSeq.GFF_Reader(gtf_file)
    features = HTSeq.GenomicArrayOfSets( "auto", stranded=True )

    print("extracting features from gtf file")
    for feature in gtf_file:
        features[feature.iv] += feature.attr[identifier]

    almnt_file = HTSeq.SAM_Reader(bam_file)
    counts = collections.Counter( )
    for bundle in HTSeq.pair_SAM_alignments( almnt_file, bundle=True ):
        if len(bundle) != 1:
            continue  # Skip multiple alignments
        first_almnt, second_almnt = bundle[0]  # extract pair
        # Both mates' intervals are looked up below, so the pair is only
        # usable when both mates exist and are aligned.  (The original
        # `not a.aligned and b.aligned` tested something else and wrote to
        # an undefined `count`.)
        if (first_almnt is None or second_almnt is None
                or not (first_almnt.aligned and second_almnt.aligned)):
            counts[ "_unmapped" ] += 1
            continue
        gene_ids = set()
        for almnt in (first_almnt, second_almnt):
            for iv, val in features[ almnt.iv ].steps():
                gene_ids |= val
        if len(gene_ids) == 1:
            gene_id = list(gene_ids)[0]
            counts[ gene_id ] += 1
        elif len(gene_ids) == 0:
            counts[ "_no_feature" ] += 1
        else:
            counts[ "_ambiguous" ] += 1

    for gene_id in counts:
        print("%s %s" % (gene_id, counts[ gene_id ]))
def bam_count(args):
    """Count read pairs per feature and print a tab-separated table.

    args.fi -- alignment file handed to HTSeq.SAM_Reader.
    args.fg -- annotation handed to htseq_read_gtf to build the feature
               array that is indexed by alignment intervals.
               NOTE(review): this restores the assignment that was commented
               out in the original (which left `exons` undefined); confirm
               htseq_read_gtf returns a GenomicArrayOfSets-like object.

    Pairs are tallied under the single feature they overlap, or under
    "_no_feature", "_ambiguous" or "_unmapped".  Prints "<key>\\t<count>"
    lines to stdout; returns nothing.
    """
    bam = HTSeq.SAM_Reader(args.fi)
    exons = htseq_read_gtf(args.fg)
    cnts = collections.Counter()
    # pair_SAM_alignments_with_buffer yields (mate1, mate2) tuples, not
    # bundles, so the original `len(bundle) != 1` test skipped every pair.
    for aln1, aln2 in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # Both intervals are needed below: require both mates, aligned.
        if (aln1 is None or aln2 is None
                or not (aln1.aligned and aln2.aligned)):
            cnts["_unmapped"] += 1
            continue
        # Skip multiply-aligned pairs (the intent of the original bundle
        # length check); a missing NH tag means "treat as unique".
        try:
            if (aln1.optional_field("NH") > 1
                    or aln2.optional_field("NH") > 1):
                continue
        except KeyError:
            pass
        gids = set()
        for aln in (aln1, aln2):
            for iv, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            gid = list(gids)[0]
            cnts[gid] += 1
        elif len(gids) == 0:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
def listFromCIGAR(cls, cigarstring,position_b0, refname, strand):
    """Build a list of `cls` fragments from the match blocks of a CIGAR.

    The CIGAR is parsed with HTSeq.parse_cigar (after reversing the order
    of its operations for MINUS-strand reads).  The first "M" operation
    creates a fragment via
    ``cls(query_from, query_to, ref_start, ref_end, chrom, strand)``.
    A later "M" that directly follows an "M"+"D" or "M"+"I" sequence
    extends the most recent fragment through its ``extend`` method with the
    same six arguments.  Any other additional "M" is only logged.

    Returns the list of fragments (empty when the CIGAR has no "M").
    """
    read_parts = []
    if strand == MINUS:  # need to reverse the CIGAR
        logger.debug("Reversing CIGAR for minus strand read fragment")
        # raw string: "\d" in a normal literal is an invalid escape sequence
        cigarstring = "".join(reversed(re.findall(r"\d+[MIDNSHP=X]", cigarstring)))
    op_type_list = []
    for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
        # list(...) so the values, not a map object, are logged on Python 3
        logger.debug(list(map(str, (op, op.query_from, op.query_to, op.ref_iv))))
        if op.type == "M":
            fragment_args = (op.query_from, op.query_to, op.ref_iv.start,
                             op.ref_iv.end, op.ref_iv.chrom, strand)
            if "M" in op_type_list:
                # Only an M-D-M or M-I-M run is merged into one fragment.
                if (len(op_type_list) >= 2
                        and op_type_list[-1] in ("D", "I")
                        and op_type_list[-2] == "M"):
                    label = "extending (%s):" % op_type_list[-1]
                    logger.debug(list(map(str, (label, op, op.query_from, op.query_to, op.ref_iv))))
                    read_parts[-1].extend(*fragment_args)
                else:
                    logger.debug("CIGAR WARNING: Number of matches > 1: {0}".format(cigarstring))
            else:
                logger.debug(list(map(str, ("appending:", op, op.query_from, op.query_to, op.ref_iv))))
                read_parts.append(cls(*fragment_args))
        # Record every operation type so the next "M" can inspect its predecessors.
        op_type_list.append(op.type)
    return read_parts
def set_up_IO(fileIN,fileOUT,gff,downstream,upstream):
    '''Open the alignment, annotation and output files used for counting.

    Returns a 3-tuple:
      * the alignments of fileIN, read with HTSeq.SAM_Reader and grouped by
        HTSeq.bundle_multiple_alignments,
      * an HTSeq.GFF_Reader over gff (created with end_included=True),
      * fileOUT opened for writing, with a header line already written:
        "name" followed by one tab-separated column per integer in
        range(-upstream, downstream).

    The output handle is left open; closing it is up to the caller.
    '''
    ## Alignment input, bundled per read
    bundled_alignments = HTSeq.bundle_multiple_alignments(
        HTSeq.SAM_Reader(fileIN))

    ## Annotation input
    gff_reader = HTSeq.GFF_Reader(gff, end_included=True)

    ## Output table with its header row
    out_handle = open(fileOUT, 'w')
    column_labels = [str(offset) for offset in range(-upstream, downstream)]
    header = 'name\t{coord}\n'.format(coord='\t'.join(column_labels))
    out_handle.write(header)

    return bundled_alignments, gff_reader, out_handle
def bam_parser_2(bam_file, min_len, max_clip, min_id, mode):
    """Collect the parsed alignments of *bam_file* into a DataFrame.

    bam_file -- iterable of alignments; in 'paired' mode it is first paired
                with HTSeq.pair_SAM_alignments.
    min_len, max_clip, min_id -- forwarded unchanged to parser_aln_list.
    mode     -- 'paired' or 'single'.  Any other value processes nothing
                and yields an empty frame.

    Each alignment is passed to parser_aln_list together with a running
    1-based query number (``aln_number``) and its position in the pair
    (``pair_pos``); results that are None are dropped.

    Returns a pandas DataFrame with one row per kept result and the columns
    ALN, QUERY, REF, SEQ, LEN, ID, SCORE, CLIP_PCT.
    """
    df_columns = ['ALN', 'QUERY', 'REF', 'SEQ', 'LEN', 'ID', 'SCORE',
                  'CLIP_PCT']
    rows = []
    query_counter = 0

    if mode == 'paired':
        for query_1, query_2 in HTSeq.pair_SAM_alignments(bam_file):
            query_counter += 1
            for pair_pos, query in ((1, query_1), (2, query_2)):
                parsed = parser_aln_list(query, aln_number=query_counter,
                                         pair_pos=pair_pos, min_len=min_len,
                                         max_clip=max_clip, min_id=min_id)
                # identity test: safe even if the result type overloads ==
                if parsed is not None:
                    rows.append(parsed)
    elif mode == 'single':
        for query_1 in bam_file:
            query_counter += 1
            parsed = parser_aln_list(query_1, aln_number=query_counter,
                                     pair_pos=1, min_len=min_len,
                                     max_clip=max_clip, min_id=min_id)
            if parsed is not None:
                rows.append(parsed)

    return pd.DataFrame(rows, columns=df_columns)
def ungapped_pe_counter(sam_reader, feature_array):
    """Count read-pair bundles per feature.

    sam_reader    -- alignment source handed to hts.pair_SAM_alignments with
                     bundle=True, which groups all multiply-mapped pairs of
                     a read together.
    feature_array -- passed unchanged to assess_bundle.

    To evaluate the multiply mapped bundles, each pair in a bundle must
    still ALWAYS and ONLY map to a single feature.  Thus, every aligned pair
    has come from the same feature (gene), and this bundle counts as
    evidence of one read for this gene.  If any of the read pairs maps to a
    different gene, or no gene, or multiple genes, then the bundle is
    considered ambiguous.  If all pairs in a bundle map as _no_feature,
    _unmapped or _ambiguous, then the bundle counts as one count towards
    this feature type (ie, it is passed on to the final counter to
    increment by 1).

    Returns a collections.Counter keyed by the statuses reported by
    assess_bundle, plus "_ambiguous".
    """
    counts = collections.Counter( )
    # bundle puts all multiply-mapped pairs together.
    pair_iterator = hts.pair_SAM_alignments( sam_reader, bundle=True )

    t0 = datetime.datetime.now()
    for ic, bundle in enumerate(pair_iterator):
        # report progress (to prove that it is still alive):
        if ic % 1000000 == 0:
            t1 = datetime.datetime.now()
            print("\r%d read bundles counted in %s\r" % (ic, t1-t0))
            sys.stdout.flush()

        if bundle == []:  # first bundle for some reason is always an empty list
            continue

        bcounts = assess_bundle(bundle, feature_array)

        if len(bcounts) > 1:
            # ie, is a multiply mapped feature with multiple gene mappings
            counts[ "_ambiguous" ] += 1
            continue
        elif len(bcounts) == 0:
            # uh oh! There is an error somewhere.
            print("#" * 40)
            print("Error! bundle was not assigned any status")
            print("Contents of bundle:")
            print(bundle)
            continue
        else:
            # the single key; dict.keys() is not indexable on Python 3
            counts[ next(iter(bcounts)) ] += 1

    return counts
def searchGeneName(self,annotationstring):
    """Return the gene name(s) found in a comma-collapsed annotation string.

    annotationstring -- '.' (no annotation) or one or more GFF attribute
                        strings joined by ',' (as collapsed by bedtools
                        groupby).

    Each part is parsed with HTSeq.parse_GFF_attribute_string and the first
    available of 'gene_name' (used by Ensembl GTF), 'gene', 'gene_id' and
    'transcript_id' is taken; 'N/A' if none is present.  If parsing or the
    lookup fails for any other reason, self.searchGeneName1(part) is used
    instead.

    Returns 'N/A' for '.', otherwise the distinct names joined by ','
    (in set order); 'N/A' is dropped when at least one other name exists.
    """
    if annotationstring == '.':
        return 'N/A'

    # Attribute keys in order of preference.
    name_keys = ('gene_name', 'gene', 'gene_id', 'transcript_id')

    collect = set()
    # Split the annotationstring by ',' which collapsed by bedtools groupby
    for annotation in annotationstring.split(','):
        try:
            attr = HTSeq.parse_GFF_attribute_string(annotation)
            gene = 'N/A'
            for key in name_keys:
                try:
                    gene = attr[key]
                except KeyError:
                    continue
                break
        except Exception:
            # Fall back to the alternative parser; unlike a bare `except:`
            # this lets KeyboardInterrupt / SystemExit propagate.
            gene = self.searchGeneName1(annotation)
        collect.add(gene)

    # Collapse all genes together
    if len(collect) > 1:
        collect.discard('N/A')
    return ','.join(collect)
def _matched_intervals(alignment, invert):
    """Return the reference intervals of the non-empty "M" CIGAR operations
    of *alignment* (lazily), strand-inverted when *invert* is true."""
    if invert:
        return (invert_strand(co.ref_iv) for co in alignment.cigar
                if co.type == "M" and co.size > 0)
    return (co.ref_iv for co in alignment.cigar
            if co.type == "M" and co.size > 0)


def _features_for_intervals(features, iv_seq, overlap_mode):
    """Return the feature set hit by *iv_seq* under *overlap_mode*.

    "union" returns the union over all steps.  The two intersection modes
    return the intersection over the qualifying steps, or None when no step
    qualified.  Raises UnknownChrom for an interval on a chromosome absent
    from ``features.chrom_vectors``.
    """
    if overlap_mode == "union":
        hits = set()
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for _, step_set in features[iv].steps():
                hits = hits.union(step_set)
        return hits

    hits = None
    for iv in iv_seq:
        if iv.chrom not in features.chrom_vectors:
            raise UnknownChrom
        for _, step_set in features[iv].steps():
            # "intersection-nonempty" ignores steps without any feature.
            if len(step_set) > 0 or overlap_mode == "intersection-strict":
                if hits is None:
                    hits = step_set.copy()
                else:
                    hits = hits.intersection(step_set)
    return hits


def _write_count_report(label, sam_filename, directory, feature_counts,
                        empty, ambiguous, lowqual, notaligned, nonunique):
    """Write one direction's count table to a new file and print a summary.

    *label* is "Forward" or "Reverse"; its lower-case form is used in the
    output-file suffix and in the printed summary keys.
    """
    direction = label.lower()
    output = brenninc_utils.create_new_file(sam_filename,
                                            "_%s_count" % direction,
                                            outputdir=directory,
                                            extension="txt",
                                            gzipped=False)
    used_features_count = 0
    used_features_sum = 0
    print("%s written to %s" % (label, output))
    with open(output, "w") as output_file:
        for fn in sorted(feature_counts.keys()):
            output_file.write("%s\t%d\n" % (fn, feature_counts[fn]))
            used_features_count += 1
            used_features_sum += feature_counts[fn]
        output_file.write("__no_feature\t%d\n" % empty)
        output_file.write("__ambiguous\t%d\n" % ambiguous)
        output_file.write("__too_low_aQual\t%d\n" % lowqual)
        output_file.write("__not_aligned\t%d\n" % notaligned)
        output_file.write("__alignment_not_unique\t%d\n" % nonunique)
    print("%s features with alignment\t%d" % (label, used_features_count))
    print("%s alignments asigned to feature\t%d" % (label, used_features_sum))
    print("__%s_no_feature\t%d" % (direction, empty))
    print("__%s_ambiguous\t%d" % (direction, ambiguous))


def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count reads (or read pairs) per feature for one or both strand senses.

    sam_filename -- alignment file path, or "-" to read from stdin.
    features     -- feature array: must expose ``chrom_vectors`` and be
                    indexable by an interval, returning an object with
                    ``steps()`` (presumably an HTSeq.GenomicArrayOfSets --
                    confirm against the caller).
    counts       -- mapping feature -> count; it is shallow-copied per
                    direction and never modified itself.
    samtype      -- "sam", "bam", or None to ask detect_sam_type.
    order        -- "name" or "pos"; only consulted for paired-end data.
    forward      -- count with the first mate's strand as-is.
    reverse      -- count with the first mate's strand inverted.
    overlap_mode -- "union", "intersection-strict" or
                    "intersection-nonempty".
    quiet        -- suppress progress messages on stderr.
    minaqual     -- minimum aQual for a read (every present mate).
    samout       -- path of an annotated SAM output file, or "" for none.
    directory    -- output directory passed to
                    brenninc_utils.create_new_file.

    Whether the data is paired-end is taken from the first read.  For every
    requested direction a count file is written and a summary printed.

    Raises ValueError for an unknown samtype or order; exits through
    sys.exit for an unknown overlap_mode once the first countable read is
    reached.  Errors while reading or processing are reported on stderr and
    re-raised.
    """

    def write_to_samout(r, assignment):
        # Annotate every read of r with its assignment in the samout file.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def record_assignment(r, fs, feature_counts):
        # Report r's assignment and return (empty_incr, ambiguous_incr).
        if fs is None or len(fs) == 0:
            write_to_samout(r, "__no_feature")
            return 1, 0
        if len(fs) > 1:
            write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
            return 0, 1
        feature_name = list(fs)[0]
        write_to_samout(r, feature_name)
        feature_counts[feature_name] += 1
        return 0, 0

    known_overlap_modes = ("union", "intersection-strict",
                           "intersection-nonempty")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Defined up front so every code path (including the UnknownChrom
    # handler) can rely on them regardless of forward/reverse.
    empty_forward = ambiguous_forward = 0
    empty_reverse = ambiguous_reverse = 0
    counts_forward = copy.copy(counts) if forward else None
    counts_reverse = copy.copy(counts) if reverse else None
    notaligned = 0
    lowqual = 0
    nonunique = 0

    try:  # guarantees samoutfile is closed even when an error propagates
        if samtype is None:
            samtype = detect_sam_type(sam_filename)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read: put the peeked read back in front.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:  # report, then always re-raise
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))
                i += 1

                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    # A missing NH tag (KeyError) means "treat as unique".
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    if forward:
                        iv_seq_for = _matched_intervals(r, False)
                    if reverse:
                        iv_seq_rev = _matched_intervals(r, True)
                else:
                    first, second = r[0], r[1]
                    first_aligned = first is not None and first.aligned
                    if first_aligned:
                        if forward:
                            iv_seq_for = _matched_intervals(first, False)
                        if reverse:
                            iv_seq_rev = _matched_intervals(first, True)
                    else:
                        iv_seq_for = tuple()
                        iv_seq_rev = tuple()
                    if second is not None and second.aligned:
                        # The second mate gets the opposite orientation.
                        if forward:
                            iv_seq_for = itertools.chain(
                                iv_seq_for, _matched_intervals(second, True))
                        if reverse:
                            iv_seq_rev = itertools.chain(
                                iv_seq_rev, _matched_intervals(second, False))
                    elif not first_aligned:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((first is not None and
                             first.optional_field("NH") > 1) or
                                (second is not None and
                                 second.optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((first and first.aQual < minaqual) or
                            (second and second.aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode not in known_overlap_modes:
                        sys.exit("Illegal overlap mode.")

                    # Resolve both directions before recording either, so an
                    # UnknownChrom leaves no partial tallies behind.
                    if forward:
                        fs_for = _features_for_intervals(
                            features, iv_seq_for, overlap_mode)
                    if reverse:
                        fs_rev = _features_for_intervals(
                            features, iv_seq_rev, overlap_mode)

                    if forward:
                        d_empty, d_amb = record_assignment(
                            r, fs_for, counts_forward)
                        empty_forward += d_empty
                        ambiguous_forward += d_amb
                    if reverse:
                        d_empty, d_amb = record_assignment(
                            r, fs_rev, counts_reverse)
                        empty_reverse += d_empty
                        ambiguous_reverse += d_amb

                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    if forward:
                        empty_forward += 1
                    if reverse:
                        empty_reverse += 1

        except:  # report where processing failed, then always re-raise
            sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode
                              else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if forward:
        _write_count_report("Forward", sam_filename, directory,
                            counts_forward, empty_forward, ambiguous_forward,
                            lowqual, notaligned, nonunique)
    if reverse:
        _write_count_report("Reverse", sam_filename, directory,
                            counts_reverse, empty_reverse, ambiguous_reverse,
                            lowqual, notaligned, nonunique)

    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def softclipping_realignment(mapq_cutoff, max_del_len, input, output,
                             ref_genome, gtf, splice_bin):
    """Rewrite CIGARs proposed by detect_sv_from_cigar and emit a sorted,
    indexed BAM.

    mapq_cutoff -- minimum mapping quality for a read to be considered;
                   also forwarded to detect_sv_from_cigar.
    max_del_len -- forwarded to detect_sv_from_cigar.
    input       -- path of the BAM file to read.
    output      -- path of the final sorted BAM; the unsorted intermediate
                   is written to "<output>.temp.bam" (it is not removed).
    ref_genome  -- reference FASTA opened with pysam.Fastafile.
    gtf, splice_bin -- forwarded to extract_splice_sites.

    For each read passing the filters (mapq, not secondary, no XA tag) the
    CIGAR/position returned by detect_sv_from_cigar replaces the original
    and the original is stored in an 'OA' tag.  If the new CIGAR contains a
    deletion that looks like a splice junction, the original alignment is
    restored and the reason stored in a 'JM' tag: 'shift' (splice_checker
    returned true), 'GTF' (junction end covered in the extract_splice_sites
    result) or the matching splice motif.  All reads are written out.

    Exits with status 1 if the reference, the GTF or samtools sort fails,
    or on ValueError while reading the BAM.
    """
    temp_bam_path = '{}.temp.bam'.format(output)
    bwa_bam = pysam.AlignmentFile(input, 'rb')
    output_bam = pysam.AlignmentFile(temp_bam_path, 'wb', template=bwa_bam)

    # for RNAseq
    splice_motif = ['GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT']

    try:
        fastafile = pysam.Fastafile(ref_genome)
    except IOError as e:
        print('read reference genome ' + ref_genome + ' error!', e)
        sys.exit(1)
    try:
        cvg = extract_splice_sites(gtf, splice_bin)
    except IOError as e:
        print('read GTF file ' + gtf + ' error!', e)
        sys.exit(1)

    try:
        for read in bwa_bam.fetch(until_eof=True):
            if (read.mapq >= mapq_cutoff and not read.is_secondary
                    and not read.has_tag('XA')):
                chrm = bwa_bam.getrname(read.rname)
                newcigar, newpos = detect_sv_from_cigar(
                    chrm, read, mapq_cutoff, max_del_len)
                if newcigar != 'NA' and newcigar != read.cigar:
                    old_cigarstring, old_cigar, old_pos = (
                        read.cigarstring, read.cigar, read.pos)
                    read.cigar, read.pos = newcigar, newpos
                    if 'D' in read.cigarstring:
                        strand = '-' if read.is_reverse else '+'
                        # Deletion boundaries: end of the first aligned
                        # block and start of the second.
                        junc_start = read.blocks[0][1]
                        junc_end = read.blocks[1][0]
                        htpos1 = HTSeq.GenomicPosition(chrm, junc_start, '.')
                        htpos2 = HTSeq.GenomicPosition(chrm, junc_end, '.')
                        if splice_checker(fastafile, chrm, junc_start,
                                          junc_end, strand):
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', 'shift')
                            output_bam.write(read)
                            continue
                        if cvg[htpos1] > 0 or cvg[htpos2] > 0:
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', 'GTF')
                            output_bam.write(read)
                            continue
                        m1 = fastafile.fetch(chrm, junc_start, junc_start + 2)
                        m2 = fastafile.fetch(chrm, junc_end - 2, junc_end)
                        motif = m1.upper() + m2.upper()
                        if motif in splice_motif:
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', motif)
                            output_bam.write(read)
                            continue
                    # Keep the original alignment (1-based position).
                    read.setTag('OA', str(old_pos + 1) + ',' + old_cigarstring)
            output_bam.write(read)
    except ValueError as e:
        print('Bam index file is not found!', e, file=sys.stderr)
        sys.exit(1)

    bwa_bam.close()
    output_bam.close()
    fastafile.close()

    # Argument lists instead of shell strings: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    try:
        subprocess.check_call(
            ["samtools", "sort", temp_bam_path, "-o", output],
            stdin=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print('Execution failed for samtools:', e, file=sys.stderr)
        sys.exit(1)

    subprocess.check_call(["samtools", "index", output])
#!/usr/bin/python
# Groups the alignment pairs of a SAM file by read name, as a first step
# towards comparing it with a second (masked) alignment of the same reads.
import HTSeq as h
from collections import defaultdict

#reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned_masked.sam")
#reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned.sam")
reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022_masked/Aligned.out.filtered.new.1017680.sam")
reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/Aligned.out.filtered.new1mb.sam")

it_p = iter(h.pair_SAM_alignments(reader))
it_p_m = iter(h.pair_SAM_alignments(reader_masked))

# Comparison tallies; nothing in this part of the script updates them yet.
same_aligned = 0
one_same_pos = 0
both_same_pos = 0
masked_more_pos = 0
simple_more_pos = 0
#cur_read = {}
#cur_m_read = {}
not_in_simple = 0
not_in_masked = 0

# read name -> list of (mate1, mate2) alignment pairs
n_m = defaultdict(list)
i = 0
for r1, r2 in h.pair_SAM_alignments(reader):
    # pair_SAM_alignments yields None for a missing mate; take the read
    # name from whichever mate is present.
    named = r1 if r1 is not None else r2
    n_m[named.read.name].append((r1, r2))
    i += 1
    if i % 10000 == 0:
        print("%d  lines" % i)
#for k,v in n_m.items():
def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):
    """Tally splicing evidence from '<infile>/<infile>.bam' into five tables.

    infile      -- sample name; also the directory holding the BAM and
                   receiving the output tables.
    targets     -- genomic array indexed by positions/intervals, yielding
                   sets of "<gene>;<number>..." strings (the code splits on
                   ';' and uses fields 0 and 1).
    intron_set  -- iterable of intron ids; defines the rows of the per-intron
                   tables.
    fiveSS, threeSS -- mappings keyed by (chrom, position, strand); values
                   are indexed [0] and [1].
                   NOTE(review): [0] looks like a gene id and [1] like an
                   intron number -- confirm against the caller.
    Branches, Branchto3ss -- genomic arrays indexed by a GenomicPosition,
                   yielding sets of intron ids.

    Only first mates that are aligned with aQual > 5 and whose start
    position has no entry in *targets* are examined.  Writes
    *_splicing_counts.txt, *_concordant_splicing_counts.txt,
    *_lariat_int_counts.txt, *_branch_to3ss_counts.txt and
    *_junction_counts.txt into the *infile* directory.  Returns None.
    """
    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)

    bam_reader = HTSeq.BAM_Reader('%s/%s.bam' % (infile, infile))
    for f, s in HTSeq.pair_SAM_alignments_with_buffer(bam_reader):
        if f is None or not f.aligned or not f.aQual > 5:
            continue

        chrome = f.iv.chrom
        start = f.iv.start
        end = f.iv.end
        strand = f.iv.strand
        if strand == '+':
            geneint = HTSeq.GenomicPosition(chrome, start, strand)
        else:
            geneint = HTSeq.GenomicPosition(chrome, end, strand)
        if len(targets[geneint]) != 0:
            continue

        introns = set()
        junctions = set()
        last_op = len(f.cigar) - 1
        for i, cigop in enumerate(f.cigar):
            if cigop.type == 'M':
                for iv, val in targets[cigop.ref_iv].steps():
                    introns |= val
            elif cigop.type == 'N':
                # A junction needs > 3 matched bases on both sides.  The
                # bounds test keeps i - 1 from wrapping to the last
                # operation and i + 1 from running off the end.
                if (0 < i < last_op
                        and f.cigar[i - 1].type == 'M'
                        and f.cigar[i - 1].size > 3
                        and f.cigar[i + 1].type == 'M'
                        and f.cigar[i + 1].size > 3):
                    for iv, val in targets[cigop.ref_iv].steps():
                        junctions |= val

                    chrom = cigop.ref_iv.chrom
                    if cigop.ref_iv.strand == '+':
                        first = cigop.ref_iv.end
                        second = cigop.ref_iv.start + 1
                        strand = "+"
                    else:
                        first = cigop.ref_iv.start + 1
                        second = cigop.ref_iv.end
                        strand = '-'

                    if ((chrom, first, strand) in fiveSS
                            and (chrom, second, strand) in threeSS):
                        up = fiveSS[chrom, first, strand]
                        down = threeSS[chrom, second, strand]
                        if up[0] == down[0]:
                            if up[1] == down[1]:
                                junction_counts[(infile, up[0], int(up[1]),
                                                 int(down[1]) + 1,
                                                 "Constituitive")] += 1
                            else:
                                junction_counts[(infile, up[0], int(up[1]),
                                                 int(down[1]) + 1,
                                                 "Exon Skipping")] += 1
                    # NOTE(review): in the two branches below `up`/`down` are
                    # NOT looked up for this junction -- they hold whatever
                    # an earlier junction assigned (or are undefined on the
                    # first one, raising NameError).  The intended values
                    # cannot be inferred from this function; left unchanged.
                    elif (chrom, first, strand) in fiveSS:
                        junction_counts[(infile, up[0], int(up[1]),
                                         int(down[1]) + 1,
                                         "Alternative 3'")] += 1
                    elif (chrom, second, strand) in threeSS:
                        junction_counts[(infile, up[0], int(up[1]),
                                         int(down[1]) + 1,
                                         "Alternative 5'")] += 1

        # Choose one intron and one junction id: the one whose second
        # ';'-field is largest.
        # NOTE(review): that field is compared as a string ("10" < "9");
        # confirm whether numeric ordering was intended.
        intron_num_mat = {}
        intron_num_pre = {}
        intron = ''
        junction = ''
        if len(introns) > 0:
            for i in introns:
                a = i.split(';')
                intron_num_pre[i] = a[1]
            intron = max(intron_num_pre.items(), key=lambda x: x[1])
            intron = intron[0]
        if len(junctions) > 0:
            for i in junctions:
                a = i.split(';')
                intron_num_mat[i] = a[1]
            junction = max(intron_num_mat.items(), key=lambda x: x[1])
            junction = junction[0]

        # The same id as both intron and junction is contradictory: drop it.
        if junction == intron:
            intron = ''
            junction = ''
        # Otherwise keep only the one with the larger second field.
        if junction and intron:
            if junction.split(';')[1] > intron.split(';')[1]:
                intron = ''
            else:
                junction = ''

        candidate_genes = set()
        for i in introns:
            candidate_genes.add(i.split(';')[0])
        for i in junctions:
            candidate_genes.add(i.split(';')[0])

        if len(candidate_genes) == 1:
            # Second mate usable: present, aligned, properly paired, aQual > 5.
            mate_ok = (s is not None and s.aligned and s.proper_pair
                       and s.aQual > 5)

            if junction:
                SI_counts[('mature', junction)] += 1
            if intron:
                SI_counts[('premature', intron)] += 1

            if f.proper_pair and mate_ok:
                if junction:
                    SI_counts[('concordant_mature', junction)] += 1
                if intron:
                    SI_counts[('concordant_premature', intron)] += 1

            # Counts starting position of read 2's that fall within specified
            # lariat intermediate and branch to 3'SS windows.  `intron` is a
            # string, so it is tested for non-emptiness (the original
            # `intron > 0` raises TypeError on Python 3).
            if intron and mate_ok:
                chrome = s.iv.chrom
                start = s.iv.start
                end = s.iv.end
                strand = s.iv.strand
                if strand == '+':
                    geneint = HTSeq.GenomicPosition(chrome, start, strand)
                else:
                    geneint = HTSeq.GenomicPosition(chrome, end, strand)
                if intron in Branches[geneint] and len(Branches[geneint]) == 1:
                    SI_counts[('lariat_int', intron)] += 1
                if (intron in Branchto3ss[geneint]
                        and len(Branchto3ss[geneint]) == 1):
                    SI_counts[('branch_to3ss', intron)] += 1

    with open('%s/%s_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron,
                                        SI_counts[('mature', intron)],
                                        SI_counts[('premature', intron)]))

    with open('%s/%s_concordant_splicing_counts.txt' % (infile, infile),
              'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' %
                      (intron,
                       SI_counts[('concordant_mature', intron)],
                       SI_counts[('concordant_premature', intron)]))

    with open('%s/%s_lariat_int_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron, SI_counts[('lariat_int', intron)]))

    with open('%s/%s_branch_to3ss_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron,
                                    SI_counts[('branch_to3ss', intron)]))

    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write('%s\t%d\t%d\t%s\t%d\n' %
                      (junc[1], junc[2], junc[3], junc[4],
                       junction_counts[junc]))
# Verify taxids against nodes.dmp: collect the first '|'-separated field of
# every line (whitespace-stripped) as the set of known taxids.
taxids = set()
with open('nodes.dmp') as nodes:
    taxids.update(row.split("|")[0].strip() for row in nodes)

# Map mmp_id from downloaded sequences to lineages and write to stdout in
# Kaiju preferred format.
processed_mmp = set()
warnings = defaultdict(int)
for root, dirs, files in os.walk('genomes/'):
    for filename in files:
        if not filename.endswith(".faa"):
            continue
        counter = 1
        for seq in HTSeq.FastaReader(os.path.join(root, filename)):
            # The accession is whatever follows the last underscore.
            mmp_id = seq.name.split("_")[-1]

            # Work out why (if at all) this accession must be rejected.
            # The checks run in this order and stop at the first hit.
            reason = None
            if mmp_id in processed_mmp:
                # duplicate: this accession was already processed
                reason = 'duplicate'
            elif not mapdict[mmp_id]:
                # no taxonomic lineage recorded for this accession
                reason = 'nolineage'
            elif mapdict[mmp_id] not in taxids:
                # taxonomic id is not listed in nodes.dmp
                reason = 'notax'

            if reason is not None:
                warnings[reason] += 1
                # stops reading the remaining sequences of this file
                break
def parse_vcf(vcf_file, snp_data, min_reads, min_af, min_qual, annotations,
              seqs, options, line_num=100000):
    """
    Parse VCF file counts synonymous and non-synonymous SNPs

    :param file vcf_file: file handle to a VCF file
    :param dict snp_data: dictionary from :func:`init_count_set` with per
        sample SNPs information
    :param int min_reads: minimum number of reads to accept a SNP
    :param float min_af: minimum allele frequency to accept a SNP
    :param int min_qual: minimum quality (Phred score) to accept a SNP
    :param dict annotations: annotations grouped by their reference sequence
    :param dict seqs: reference sequences
    :param options: object with a `bcftools_vcf` attribute; when true the
        samples carrying an allele are read from the per-sample `GT`
        fields, otherwise from the '-'-separated `set` INFO field
    :param int line_num: the interval in number of lines at which progress
        will be printed

    Records on sequences without annotations are ignored.  Every accepted
    (record, alternative allele) combination is passed to
    :func:`check_snp_in_set`; nothing is returned.
    """
    vcf_handle = HTSeq.VCF_Reader(compressed_handle(vcf_file))

    vcf_handle.parse_meta()
    vcf_handle.make_info_dict()

    # total number of SNPs accepted
    count_tot = 0
    # number of SNPs skipped for low depth
    skip_dp = 0
    # number of SNPs skipped for low allele frequency
    skip_af = 0
    # number of SNPs skipped for low quality
    skip_qual = 0
    # indels
    skip_indels = 0

    for vcf_record in vcf_handle:
        # the SNP is a sequence with no annotations
        if vcf_record.chrom not in annotations:
            continue

        if float(vcf_record.qual) < min_qual:
            # low quality SNP
            skip_qual += 1
            continue

        # unpack info records (needed for vcf_record.info to be a dictionary)
        vcf_record.unpack_info(vcf_handle.infodict)

        if vcf_record.info['INDEL']:
            skip_indels += 1
            continue

        if not isinstance(vcf_record.info['DP'], int):
            LOG.warning(vcf_record.info['DP'])

        if vcf_record.info['DP'] < min_reads:
            # not enough reads (depth) for the SNP
            skip_dp += 1
            continue

        # Samtools mpileup -> bcftools call doesn't output the allele freq.
        # it can be calculated with AC/AN for each ALT nucleotide
        # checked on bfctools (roh command) manual
        # https://samtools.github.io/bcftools/bcftools.html
        try:
            allele_freqs = vcf_record.info['AF']
        except KeyError:
            # float() so the ratio is never truncated by integer division
            allele_number = float(vcf_record.info['AN'])
            if isinstance(vcf_record.info['AC'], list):
                allele_freqs = [
                    AC / allele_number for AC in vcf_record.info['AC']
                ]
            else:
                allele_freqs = vcf_record.info['AC'] / allele_number

        # if the allele frequency is a single value (of any numeric type),
        # make it a list, so the iteration below works anyway
        if not isinstance(allele_freqs, (list, tuple)):
            allele_freqs = [allele_freqs]

        # alt is the nucleotidic change
        iter_data = zip(allele_freqs, vcf_record.alt)
        for alt_index, (allele_freq, change) in enumerate(iter_data):
            if allele_freq < min_af:
                # the allele frequency for the SNP is too low, it'll be
                # skipped
                skip_af += 1
                continue

            if options.bcftools_vcf:
                samples = set()
                for sample_id, sample_info in vcf_record.samples.items():
                    # prepare the genotype list, to make the comparison
                    # easier: the genotype separator is normalised to '/'
                    # only, to use only one type of split
                    sample_info_gt = sample_info['GT'].replace('|', '/')
                    for genotype in sample_info_gt.split('/'):
                        if genotype == '.':
                            continue
                        # genotype 0 is the reference; ALT alleles are
                        # numbered from 1
                        if int(genotype) == (alt_index + 1):
                            samples.add(sample_id)
            else:
                # the samples that contain the SNP is a string separated
                # by '-'
                samples = vcf_record.info['set'].split('-')

            check_snp_in_set(samples, snp_data, vcf_record.pos.start, change,
                             annotations[vcf_record.chrom],
                             seqs[vcf_record.chrom])

            # increase the total number of snps available
            count_tot += 1

        if vcf_handle.line_no % line_num == 0:
            LOG.info(
                "Line %d, SNPs passed %d; skipped for: qual %d, " +
                "depth %d, freq %d, indels %d", vcf_handle.line_no, count_tot,
                skip_qual, skip_dp, skip_af, skip_indels)
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \
        filename_read_names_gene_names,filename_read_names_gene_names_amb_unique):
    """
    Main function to count reads in features, i.e. genes.

    Input:
    + sam_filename: input alignment with all the ambiguously mapped reads
      ("-" reads from stdin)
    + gff_filename: GTF containing all genes for a given species
    + stranded: specify whether data are stranded - see -s option
      ("no" builds an unstranded feature array, "reverse" inverts strands)
    + overlap_mode: mode to handle reads overlapping more than one feature -
      see -m option: choices = ("union", "intersection-strict",
      "intersection-nonempty")
    + feature_type: see -t option
    + id_attribute: see -i option
    + quiet: see -q option
    + minaqual: see -a option
    + samout: SAM output file storing disambiguated reads (see -o option);
      an empty string disables it
    + filename_read_names_gene_names: filename for the output file containing
      the mappings readName to geneNames for multimapped reads
    + filename_read_names_gene_names_amb_unique: filename for the output file
      containing the mappings readName to geneNames for ambiguously mapped
      reads

    Output:
    + Writes the readName to geneNames outputs.
    + Writes the SAM output file for disambiguated uniquely mapped reads.
    + Writes to stdout the genes and their read counts per read type:
      non-ambiguous unique, multimapped and ambiguous unique, followed by
      the summary counters. In the main peakRescue pipeline this output is
      redirected to a file and used at a later stage to rescue the reads
      present in the readName to geneNames mappings.

    NOTE(review): this block was recovered from a whitespace-collapsed
    source; the nesting of the statements and the extent of a few comments
    are reconstructed from syntax and data flow - confirm against the
    upstream file. `debug` and
    `tag_report_instances_same_multiread_on_same_gene` are not defined in
    this function and are presumably module-level names.
    """
    # Output filehandles for readName to geneNames mappings
    fh_read_names_gene_names = open(filename_read_names_gene_names, 'w')
    fh_read_names_gene_names_amb_unique = open(filename_read_names_gene_names_amb_unique, 'w')

    def write_to_samout( r, assignment ):
        # Append an XF:Z:<assignment> tag to every read of the fragment and
        # write it to the SAM output (no-op when no output was requested).
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write( read.original_sam_line.rstrip() +
                    "\tXF:Z:" + assignment + "\n" )

    if quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )

    if samout != "":
        samoutfile = open( samout, "w" )
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
    ## Hash table to store unique reads per exon (if modified GTF)
    counts = {}
    ## Hash table to store original non unique reads per gene
    dict_nonunique = {}
    ## Hash table to store all unique reads as per original GTF
    dict_gene_unique_counts = {}
    ## Hash table to store ambiguous read count for unique reads
    dict_gene_unique_counts_ambiguous = {}
    ## Hash table to store all non-unique reads including shared reads
    ## (either split reads or read pair matching on two distinct exons, same gene)
    dict_gene_nonunique_counts = {}
    ## Hash to store the non-unique read-names as key and genes names as values (fragments)
    dict_read_name_genes_names = {}
    ## Hash to store the non-unique read-names as key and genes names as values (fragments)
    ## including instances of a given multimapped read on same gene
    dict_read_name_genes_names_final = {}
    dict_read_name_genes_names_ambiguous = {}
    ## @todo: tag_gff - parameter to be removed - only deal with gene level information
    ## tag_gff: type to specify whether it contains gene or exons information
    tag_gff = "gene_gff"

    # Try to open samfile and fail early in case it is not there
    if sam_filename != "-":
        open( sam_filename ).close()

    gff = HTSeq.GFF_Reader( gff_filename )
    exons = HTSeq.GenomicArrayOfSets( "auto", stranded=False )
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                exons[ f.iv ] += f # added to get exon interval data
                try:
                    feature_id = f.attr[ id_attribute ]
                except KeyError:
                    sys.exit( "Feature %s does not contain a '%s' attribute" %
                        ( f.name, id_attribute ) )
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit( "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        ( f.name, f.iv ) )
                features[ f.iv ] += feature_id
                counts[ f.attr[ id_attribute ] ] = 0
                # -- Initialisation
                feature_name = f.attr[ id_attribute ]
                # -- Added tag_gff for GFF type
                if tag_gff == "gene_gff":
                    # Original GTF (genes): register the feature in every per-gene counter
                    dict_nonunique = initialise_counts_per_feature(dict_nonunique, feature_name)
                    dict_gene_unique_counts = initialise_counts_per_feature(dict_gene_unique_counts, feature_name)
                    dict_gene_nonunique_counts = initialise_counts_per_feature(dict_gene_nonunique_counts, feature_name)
                    dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(dict_gene_unique_counts_ambiguous, feature_name)
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d GFF lines processed.\n" % i )
    except:
        # Report where the GFF reader was, then propagate the original error
        sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d GFF lines processed.\n" % i )

    if len( counts ) == 0 and not quiet:
        sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

    # Peek at the first read to find out whether the input is paired-end
    try:
        if sam_filename != "-":
            read_seq = HTSeq.SAM_Reader( sam_filename )
            first_read = iter(read_seq).next()
        else:
            # stdin cannot be re-read: put the consumed first read back in front
            read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
            first_read = read_seq.next()
            read_seq = itertools.chain( [ first_read ], read_seq )
        pe_mode = first_read.paired_end
        #pe_mode = 1 ## Added by us
    except:
        sys.stderr.write( "Error occured when reading first line of sam file.\n" )
        raise

    ###################################################################################################
    try:
        if pe_mode:
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments( read_seq )
        empty = 0
        ambiguous = 0
        ambiguous_tag=0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        nonunique_nonamb_to_be_rescued = 0
        temp_read_name="NA"
        previous_read_name="NA"
        temp_interval_r0="NA"
        temp_interval_r1="NA"
        counter_fragment = 0
        flag_result = 0
        i = 0
        pe_mode_for_SE = 0
        ## -- Added pe_mode on for SE files so that multireads reads will be accounted for
        if not pe_mode: # real SE
            pe_mode_for_SE = 1
            #
            read_seq_pe_file = read_seq
            pe_mode=1
        ## -- End
        index_fragment = 0
        for r in read_seq:
            prev_index_fragment = index_fragment
            tag_nonunique_NH = 0
            tag_overlapping_genes = 0
            flag_aln_not_unique = 0
            #
            flag_ambiguous = 0
            #
            #-- LOOP OVER ALL READS IN INPUT BAM FILE
            if pe_mode_for_SE:
                # single-end reads are wrapped as a pair with a missing mate
                r = (r, None)
            counter_fragment += 1
            i += 1
            if not pe_mode: # -- SINGLE_END mode
                # NOTE(review): pe_mode is forced to 1 above for single-end
                # input, so this branch appears to be unreachable.
                if not r.aligned:
                    notaligned += 1
                    #write_to_samout( r, "not_aligned" )
                    continue
                try:
                    if r.optional_field( "NH" ) > 1:
                        # --- Rescue multimappers in single-end mode
                        #write_to_samout( r, "alignment_not_unique" )
                        #nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    #write_to_samout( r, "too_low_aQual" )
                    continue
                if stranded != "reverse":
                    iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
                else:
                    iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
            else: # -- PAIRED-END
                # Collect the matched ("M") intervals of both mates; the
                # second mate gets the opposite strand treatment of the first
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain( iv_seq,
                            ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
                    else:
                        iv_seq = itertools.chain( iv_seq,
                            ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
                else:
                    if ( r[0] is None ) or not ( r[0].aligned ):
                        #write_to_samout( r, "not_aligned" )
                        notaligned += 1
                        continue
                try:
                    # NH > 1 on either mate marks the fragment as multimapped
                    if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                            ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                        tag_nonunique_NH = 1
                        # -- only the first mate is present
                        if ( r[0] is not None and r[1] is None ):
                            result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag, exons)
                            if result:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                    temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes) != 0:
                                    (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                        temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                        # -- only the second mate is present
                        if ( r[0] is None and r[1] is not None ):
                            result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)
                            if result:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                    temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes) != 0:
                                    (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                        temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                        # -- both mates are present
                        if ( r[0] is not None and r[1] is not None ):
                            result1, fs_genes1, fs_exons1,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag,exons)
                            result2, fs_genes2, fs_exons2,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)
                            # Prefer the genes shared by both mates; fall back
                            # to the union when they share none
                            if len(fs_genes1.intersection(fs_genes2)) > 0:
                                fs_genes = fs_genes1.intersection(fs_genes2)
                            elif len(fs_genes1.intersection(fs_genes2))==0:
                                fs_genes = fs_genes1.union(fs_genes2)
                            if result1 and not result2:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
                                    temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            elif result2 and not result1:
                                flag_result = 1
                                (dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
                                    temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
                            else:
                                if len(fs_genes1) != 0 or len(fs_genes2) != 0:
                                    flag_result = 1
                                    # Count only when this is not the same read at the
                                    # same intervals as the previous multimapped fragment
                                    if ( ( ((temp_interval_r0 != str(r[0].iv)) or (temp_interval_r1 != str(r[1].iv))) or (temp_read_name != r[0].read.name) ) ):
                                        (dict_nonunique)= add_non_unique_counts_per_feature(fs_genes, dict_nonunique)
                                        dict_read_name_genes_names = _populate_read_name_gene_name(dict_read_name_genes_names, fs_genes, r[0].read.name, tag_report_instances_same_multiread_on_same_gene)
                                        flag_aln_not_unique = 1
                        #write_to_samout( r, "alignment_not_unique" )
                        nonunique += 1
                        if flag_result:
                            if r[0] is not None and r[1] is None:
                                non_uniq_read_name = r[0].read.name
                            elif r[0] is None and r[1] is not None:
                                non_uniq_read_name = r[1].read.name
                            elif r[0] is not None and r[1] is not None:
                                non_uniq_read_name= r[0].read.name
                            non_uniq_read_name2 = dict_read_name_genes_names.keys()[0]
                            if flag_aln_not_unique:
                                nonunique_nonamb_to_be_rescued += 1
                            # -- Re-initialise hash
                            # previous_read_name: read which falls into at least one gene interval
                            # tmp_read_name: the previous read in the bam file
                            # BAM is sorted by read name hence each multimapper will be arranged one after another
                            if previous_read_name == "NA":
                                previous_read_name = non_uniq_read_name
                            if non_uniq_read_name != previous_read_name:
                                # A new read name starts: flush the mapping of the previous one
                                if previous_read_name in dict_read_name_genes_names.keys():
                                    fs_genes_names = dict_read_name_genes_names[previous_read_name]
                                    fh_read_names_gene_names.write("%s\t%s\n" % (previous_read_name, "\t".join(list(fs_genes_names)) ))
                                previous_read_name = non_uniq_read_name
                                tmp_dict = {}
                                if non_uniq_read_name in dict_read_name_genes_names.keys():
                                    #print "non_uniq_read_name IN dict_read_name_genes_names.keys()"
                                    tmp_dict[non_uniq_read_name] = dict_read_name_genes_names[non_uniq_read_name]
                                dict_read_name_genes_names.clear() # only one read stored
                                dict_read_name_genes_names = tmp_dict
                        flag_result = 0
                        flag_aln_not_unique = 0
                        #
                        (temp_read_name, temp_interval_r0, temp_interval_r1) = initalize_read_name_and_interval(r[0], r[1])
                        continue
                # except KeyError:
                except KeyError:
                    # no NH tag: the fragment is treated as uniquely mapped
                    pass
                if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
                    lowqual += 1
                    #write_to_samout( r, "too_low_aQual" )
                    continue
            try:
                # -- Resolve the set of features overlapped by the fragment
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq: # interval from bam file for each fragment
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            #if debug:
                            #print "****Unique_feature %s and feature_interval %s" %(fs2,iv2)
                            fs = fs.union( fs2 )
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection( fs2 )
                else:
                    sys.exit( "Illegal overlap mode." )
                fs_genes = fs
                if fs_genes is None or len( fs_genes ) == 0:
                    #write_to_samout( r, "no_feature" )
                    empty += 1
                # ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)...
                # elif len( fs ) > 1:
                elif len( fs_genes ) > 1:
                    ###############################################################
                    ## AMBIGUOUS UNIQUE
                    ###############################################################
                    is_disambiguated = 0
                    if not tag_nonunique_NH:
                        # -- only the first mate is present
                        if ( r[0] is not None and r[1] is None ):
                            result, fs_genes, fs_exons,dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
                            if result:
                                (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            if ambiguous_tag:
                                (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                # write in the file ambiguous read name gene name data...
                                fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
                        # -- only the second mate is present
                        if ( r[0] is None and r[1] is not None ):
                            result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
                            if result:
                                (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            if ambiguous_tag:
                                (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[1].read.name, "\t".join(list(fs_genes)) ))
                        # -- both mates are present
                        if ( r[0] is not None and r[1] is not None ):
                            result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
                            result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
                            if debug:
                                print "IN UNIQUE DISAMBIGUATION -->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % (r[0].read.name,result1, result2, fs_genes1, fs_genes2)
                            if len(fs_genes1.intersection(fs_genes2))==1:
                                # exactly one gene shared by both mates: disambiguated
                                fs_genes = fs_genes1.intersection(fs_genes2)
                                (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
                                is_disambiguated = 1
                            elif len(fs_genes1.intersection(fs_genes2)) > 1:
                                # several genes shared by both mates: still ambiguous
                                fs_genes = fs_genes1.intersection(fs_genes2)
                                (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
                                flag_ambiguous = 1
                                fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
                            elif len(fs_genes1.intersection(fs_genes2))==0:
                                fs_genes = fs_genes1.union(fs_genes2)
                                if (fs_genes1 == set([]) or fs_genes2 == set([])) and len(fs_genes) == 1:
                                    ## Disambiguate the uniquely mapped to the single gene it maps on
                                    (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
                                    is_disambiguated = 1
                                elif (fs_genes1 != set([]) or fs_genes2 != set([])):
                                    ## Add fragment to the RN-GN for ambiguous uniquely mapped based on
                                    ## union of both fs_genes (fs_genes1 & fs_genes2) > 1
                                    (dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
                                    flag_ambiguous = 1
                                    fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
                    if flag_ambiguous:
                        ambiguous += 1
                        #write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                    if is_disambiguated:
                        write_to_samout( r, list(fs_genes)[0] )
                else:
                    # exactly one overlapping feature
                    if debug:
                        #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs
                        pass
                    write_to_samout( r, list(fs)[0] )
                    rr2 = r[0] if r[0] is not None else r[1]
                    if not tag_nonunique_NH:
                        (dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                        "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                        ( rr.read.name, iv.chrom ) )
            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )
            flag_ambiguous = 0 ## re-initialise....
            index_fragment += 1
        #########################
        # This is to store the last read/fragment since it will not pass in previous condition:
        # => if non_uniq_read_name != previous_read_name:
        # -- At same level as the for loop (outside of the for loop)
        #fh_read_names_gene_names.close()
        if dict_read_name_genes_names.keys() != []:
            #print "dict_read_name_genes_names passing"
            non_uniq_read_name = dict_read_name_genes_names.keys()[0]
            fs_genes_names = dict_read_name_genes_names[non_uniq_read_name]
            fh_read_names_gene_names.write("%s\t%s\n" % (non_uniq_read_name, "\t".join(list(fs_genes_names)) ))
        # --
        fh_read_names_gene_names.close()
        fh_read_names_gene_names_amb_unique.close()
    ###################################################################################################
    #except UnboundLocalError:
    except AttributeError:
        #except:
        # Report where the SAM reader was, then propagate the original error
        if not pe_mode:
            sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
        else:
            sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

    if samoutfile is not None:
        samoutfile.close()

    # Per-gene report on stdout: unique, multimapped and ambiguous-unique counts
    if tag_gff == "gene_gff":
        tuples_genenames_exontag = [(fn, fn) for fn in dict_gene_unique_counts.keys()]
    tuples_genenames_exontag.sort()
    previous_gene_name = "NA"
    for gene_name, fn in tuples_genenames_exontag:
        gene_name = gene_name.strip()
        fn = fn.strip()
        if tag_gff == "gene_gff":
            #
            if gene_name in dict_gene_unique_counts.keys():
                print "%s\t%i\t%i\t%s" % ( fn, dict_gene_unique_counts[gene_name], dict_nonunique[gene_name],dict_gene_unique_counts_ambiguous[gene_name] )
            else:
                # -- No non-unique reads for that gene_name
                print "%s\t%i\t%i\t%i" % ( fn, dict_gene_unique_counts[gene_name], 0,dict_gene_unique_counts_ambiguous[gene_name] )
        # -- Re-initialise gene name
        previous_gene_name = gene_name
    # Summary counters
    print "no_feature\t%d" % empty
    print "ambiguous\t%d" % ambiguous
    print "too_low_aQual\t%d" % lowqual
    print "not_aligned\t%d" % notaligned
    print "alignment_not_unique\t%d" % nonunique
    print "nonunique_nonamb_to_be_rescued:\t%d" % nonunique_nonamb_to_be_rescued
def sciRNA_count_parallel(gtf_file, input_folder, sample_ID, core_number):
    """Build the annotation files and run ``sciRNAseq_count`` on every sample.

    The GTF file is read once to build stranded genomic arrays for exons and
    genes plus a dictionary of transcript end positions per gene. Three
    annotation files are written into *input_folder*
    (``gene_name_annotate.txt``, ``cell_annotate.txt`` and
    ``report_annotate.txt``), then ``sciRNAseq_count`` is mapped over the
    sample IDs with a process pool.

    Args:
        gtf_file: path of the GTF file; the ``gene`` features are expected to
            carry ``gene_id``, ``gene_biotype`` and ``gene_name`` attributes.
        input_folder: folder receiving the annotation files; it is also
            forwarded to ``sciRNAseq_count``.
        sample_ID: path of a header-less CSV file whose first column holds
            the sample IDs.
        core_number: number of worker processes (converted with ``int``).
    """
    # read in the gtf file, and then construct the genome interval for exons,
    # genes, and gene end dictionary
    gtf_file = HTSeq.GFF_Reader(gtf_file, end_included=True)
    gene_annotat_file = input_folder + "/gene_name_annotate.txt"
    cell_annotat_file = input_folder + "/cell_annotate.txt"
    report_annotate_file = input_folder + "/report_annotate.txt"

    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    gene_end = {}
    exon_n = 0
    gene_n = 0
    transcript_n = 0
    gene_count = 0

    print("Start generating exon genomic arrays....")
    print("Start generating gene genomic arrays....")
    print("Start calculating transcript end of genes....")

    # The context manager guarantees the annotation file is closed (and
    # flushed) before pandas reads it back, even if parsing the GTF fails.
    with open(gene_annotat_file, "w") as gene_annotat_out:
        for feature in gtf_file:
            if feature.type == "exon":
                exon_n += 1
                exons[feature.iv] += feature.attr["gene_id"]
            elif feature.type == "gene":
                gene_n += 1
                genes[feature.iv] += feature.attr["gene_id"]

                # every gene gets two consecutive indexes: one for its
                # exonic entry and one for its intronic entry
                # (for human and mouse gtf file)
                gene_count += 1
                message = (feature.attr["gene_id"] + ","
                           + feature.attr["gene_biotype"] + ","
                           + "exon" + ","
                           + feature.attr["gene_name"] + ","
                           + str(gene_count) + "\n")
                gene_annotat_out.write(message)

                gene_count += 1
                message = (feature.attr["gene_id"] + "_intron" + ","
                           + feature.attr["gene_biotype"] + ","
                           + "intron" + ","
                           + feature.attr["gene_name"] + "_intron" + ","
                           + str(gene_count) + "\n")
                gene_annotat_out.write(message)
            elif feature.type == "transcript":
                transcript_n += 1
                # collect every transcript end position of the gene
                gene_end.setdefault(feature.attr["gene_id"], set()).add(
                    feature.iv.end_d)

    print("Detected gene number: ", gene_n)
    print("Detected transcript number: ", transcript_n)
    print("Detected exon number: ", exon_n)

    # reload the gene annotation as a table indexed by its first column
    gene_annotat = pd.read_csv(gene_annotat_file, header=None)
    gene_annotat.index = gene_annotat[0]

    sample_ID = list(pd.read_csv(sample_ID, header=None)[0])

    # generate the cell ID annotate file (1-based index per sample)
    with open(cell_annotat_file, "w") as cell_annotat:
        for cell_count, sample in enumerate(sample_ID, 1):
            cell_annotat.write(sample + "," + str(cell_count) + "\n")

    # generate the report annotate file
    with open(report_annotate_file, "w") as report_annotate:
        report_annotate.writelines([
            "1, Perfect intersect exon match\n",
            "2, Nearest intersect exon match\n",
            "3, Perfect combine exon match\n",
            "4, Nearest combine exon match\n",
            "5, Perfect intersect gene match\n",
            "6, Nearest intersect gene match\n",
            "7, Perfect combine gene match\n",
            "8, Nearest combine gene match\n",
            "9, ambiguous match for exons\n",
            "10, ambiguous match for genes\n",
            "11, No match\n",
        ])

    # run the per-sample counting in parallel
    func = partial(sciRNAseq_count, input_folder=input_folder, exons=exons,
                   genes=genes, gene_end=gene_end, gene_annotat=gene_annotat,
                   sample_ID=sample_ID)
    p = Pool(processes=int(core_number))
    try:
        p.map(func, sample_ID)
    finally:
        # always release the worker processes, even when a sample fails
        p.close()
        p.join()
    print("All analysis done~")
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, mapping_file, scale_method):
    """Count the reads overlapping each feature and print the abundances.

    Features of type *feature_type* are loaded from the GFF file into an
    unstranded genomic array, keyed by their *id_attribute* (features without
    that attribute are ignored). Every alignment (or alignment pair) is then
    assigned to at most one feature according to *overlap_mode*. When
    *mapping_file* is given, the per-feature counts are aggregated into the
    categories it lists. The abundances are printed to stdout and the
    summary counters to stderr.

    Args:
        sam_filename: alignment file, or "-" to read from stdin.
        gff_filename: GFF file with the features.
        samtype: "sam" or "bam".
        order: "name" or "position"; only used for paired-end input.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count on.
        id_attribute: GFF attribute used as feature ID.
        quiet: suppress the progress messages when true.
        minaqual: alignments with a lower quality are skipped.
        mapping_file: optional tab-separated file with four columns
            (feature, category, length, organism); falsy to disable.
        scale_method: 'none' to use raw counts, anything else scales each
            count with ``scale_abundance`` and the feature length.

    Raises:
        ValueError: for an unknown *samtype* or *order*, or a mapping file
            row that does not have exactly four columns.
    """
    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    # Try to open mapping file to fail early in case it is not there
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    # features without the ID attribute are skipped
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:
        # add the position in the GFF file, then propagate the error
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
                         .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
                         .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        raise ValueError("Unknown input format {} specified.".format(samtype))

    # Peek at the first read to find out whether the input is paired-end
    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
        else:
            # stdin cannot be re-read: put the consumed read back in front
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading SAM/BAM file.\n")
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} SAM alignment record{} processed.\n"
                                 .format(i, "s" if not pe_mode else " pairs"))
            i += 1

            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # no NH tag: treated as uniquely aligned
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain(
                        iv_seq,
                        (co.ref_iv for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or \
                        (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or \
                        overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or \
                                    overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # chromosome absent from the GFF file: counted as no feature
                empty += 1
    except:
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
                         .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} SAM {} processed.\n"
                         .format(i, "alignments " if not pe_mode
                                 else "alignment pairs"))

    # map to higher order features if applicable
    if mapping_file:
        abundances = {}
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'"
                                     .format(mapping_file))
                    raise
                if feature not in counts:
                    continue
                if not feature_category:
                    # no category given: the feature stands for itself
                    feature_category = feature
                if scale_method == 'none':
                    abund = counts[feature]
                else:
                    abund = scale_abundance(counts[feature],
                                            int(feature_length))
                # a feature may belong to several comma-separated categories;
                # split() yields the single category when there is no comma
                for category in feature_category.split(','):
                    abundances[category] = abundances.get(category, 0) + abund

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                             "make sure the mapping file is formatted "
                             "correctly.\n")

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source. The test compares feature IDs with category names, so a
        # feature mapped to a differently named category is presumably also
        # added to UNMAPPED - confirm the intent.
        for feature in counts:
            if feature not in abundances:
                abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0)
                                          + counts[feature])
    else:
        abundances = counts

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty
                              + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances.keys()):
        print("{}\t{!s}".format(fn, abundances[fn]))
    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
def _record_non_unique_alignment(anu_dict, pair):
    """Update the per-read bookkeeping for one multi-mapped alignment pair.

    ``anu_dict`` maps a read ID (first SAM column) to a small dict.  When both
    mates are present the entry remembers the reference name and start
    position (SAM columns 3 and 4) of each mate as first seen, plus the flags
    ``al_unique1`` / ``al_unique2`` which are cleared as soon as a later
    record of the same read places that mate somewhere else.  When only one
    mate is present the entry is replaced by a single cleared flag.

    Missing keys are looked up with ``.get`` so that an entry created by the
    single-mate branch can never raise ``KeyError`` here.
    """
    first, second = pair
    if first is not None and second is not None:
        cols1 = first.original_sam_line.split('\t')
        cols2 = second.original_sam_line.split('\t')
        read_id = cols1[0]
        entry = anu_dict.get(read_id)
        if entry is None:
            # The read is not indexed yet
            anu_dict[read_id] = {
                'chr1': cols1[2], 'start1': cols1[3], 'al_unique1': True,
                'chr2': cols2[2], 'start2': cols2[3], 'al_unique2': True,
            }
            return
        # Read already indexed: a different position means that mate has at
        # least two alignments.
        if entry.get('al_unique1') and (
                entry['chr1'] != cols1[2] or entry['start1'] != cols1[3]):
            entry['al_unique1'] = False
        if entry.get('al_unique2') and (
                entry['chr2'] != cols2[2] or entry['start2'] != cols2[3]):
            entry['al_unique2'] = False
    elif first is not None:
        # Only r[0] is present
        anu_dict[first.original_sam_line.split('\t')[0]] = {'al_unique1': False}
    else:
        # Only r[1] is present
        anu_dict[second.original_sam_line.split('\t')[0]] = {'al_unique2': False}


def _count_reads_with_stats(sam_filename, gff_filename, stranded, overlap_mode,
                            feature_type, id_attribute, quiet, minaqual,
                            samoutfile, custom_stat_file):
    """Do the actual counting for ``count_reads_in_features``.

    ``samoutfile`` and ``custom_stat_file`` are already-open file objects or
    ``None``; the caller owns them and closes them.
    """

    def write_to_samout(r, assignment):
        # Echo the read (or every present mate) with an XF tag naming its fate.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    collect_stats = custom_stat_file is not None

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand information but you are "
                             "running htseq-count in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately bare: also annotates sys.exit() with the GFF position,
        # and always re-raises.
        sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0 and not quiet:
        sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

    try:
        if sam_filename != "-":
            sam_reader = HTSeq.SAM_Reader(sam_filename)
            read_seq = sam_reader
            first_read = next(iter(read_seq))
        else:
            # stdin cannot be rewound, so push the peeked record back in front.
            sam_reader = HTSeq.SAM_Reader(sys.stdin)
            read_seq = iter(sam_reader)
            first_read = next(read_seq)
            read_seq = itertools.chain([first_read], read_seq)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam file.\n")
        raise

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0

    # MB: detailed stats (only reported when a custom stat file was requested)
    skipped = 0
    assigned_reads = 0
    assigned_reads_s = 0
    assigned_reads_p = 0
    assigned_genes = 0
    empty_s = 0
    empty_p = 0
    ambiguous_s = 0
    ambiguous_p = 0
    anu_dict = {}

    i = 0
    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout(r, "not_aligned")
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        write_to_samout(r, "alignment_not_unique")
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M")
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M")
                # A single-end record is never a pair; it must not be indexed.
                both_mates = False
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M")
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is taken on the opposite strand of the first.
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M"))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (co.ref_iv for co in r[1].cigar if co.type == "M"))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                # Only the NH lookup is guarded: a missing NH tag on a present
                # mate skips the uniqueness test, exactly as before, but an
                # error in the bookkeeping below can no longer be mistaken for
                # a missing tag and let the pair be counted.
                try:
                    multi_mapped = (
                        (r[0] is not None and r[0].optional_field("NH") > 1) or
                        (r[1] is not None and r[1].optional_field("NH") > 1))
                except KeyError:
                    multi_mapped = False
                if multi_mapped:
                    nonunique += 1
                    write_to_samout(r, "alignment_not_unique")
                    if collect_stats:
                        _record_non_unique_alignment(anu_dict, r)
                    continue
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    write_to_samout(r, "too_low_aQual")
                    continue
                both_mates = r[0] is not None and r[1] is not None

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(r, "no_feature")
                    empty += 1
                    if both_mates:
                        empty_p += 1
                    else:
                        empty_s += 1
                elif len(fs) > 1:
                    write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                    ambiguous += 1
                    if both_mates:
                        ambiguous_p += 1
                    else:
                        ambiguous_s += 1
                else:
                    feature_id = list(fs)[0]
                    write_to_samout(r, feature_id)
                    counts[feature_id] += 1
                    if counts[feature_id] == 1:
                        # first read for this feature
                        assigned_genes += 1
                    assigned_reads += 1
                    if both_mates:
                        assigned_reads_p += 1
                    else:
                        assigned_reads_s += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                skipped += 1
                if not quiet:
                    sys.stderr.write(("Warning: Skipping read '%s', because chromosome " +
                                      "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                                     (rr.read.name, iv.chrom))

            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))
    except:
        # Report the position through the SAM_Reader itself: in stdin mode the
        # iterated object is an itertools.chain, which has no line counter.
        sys.stderr.write("Error occured in %s.\n" % sam_reader.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d sam %s processed.\n" %
                         (i, "lines " if not pe_mode else "line pairs"))

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)

    # MB: Adding stats in the custom_stat file
    if collect_stats:
        fmt = "{:,}".format
        if sam_filename != "-":
            with open(sam_filename) as sam_handle:
                input_lines = fmt(sum(1 for line in sam_handle
                                      if not line.startswith('@')))
        else:
            # stdin has been consumed and cannot be re-read for a line count.
            input_lines = "n/a"

        # Counting the 'alignment_not_unique' with one or both mates multiply aligned
        simpl = 0
        multipl = 0
        for entry in anu_dict.values():
            if ('al_unique1' in entry and 'al_unique2' in entry and
                    (entry['al_unique1'] or entry['al_unique2'])):
                simpl += 1
            else:
                multipl += 1

        report = [
            ("Input SAM file line count\t", input_lines, "\n\n"),
            ("SAM lines (pairs or singles) processed\t", fmt(i), "\n\n"),
            ("Skipped pairs (chr.not found)\t", fmt(skipped), "\n\n"),
            ("Assigned_genes\t", fmt(assigned_genes), "\n\n"),
            ("Assigned_reads\t", fmt(assigned_reads), "\n"),
            ("\tSingle reads\t", fmt(assigned_reads_s), "\n"),
            ("\tPaired reads\t", fmt(assigned_reads_p), "\n\n"),
            ("No_features\t", fmt(empty), "\n"),
            ("\tSingle reads\t", fmt(empty_s), "\n"),
            ("\tPaired reads\t", fmt(empty_p), "\n\n"),
            ("Ambiguous\t", fmt(ambiguous), "\n"),
            ("\tSingle reads\t", fmt(ambiguous_s), "\n"),
            ("\tPaired reads\t", fmt(ambiguous_p), "\n\n"),
            ("Alignment_not_unique\t", fmt(nonunique), "\n"),
            ("\tSAM lines (pairs or singles)\t", fmt(len(anu_dict)), "\n"),
            ("\tOne_mate_uniquely_mapped\t", fmt(simpl), "\n"),
            ("\tTwo_mates_multiply_mapped\t", fmt(multipl), "\n"),
        ]
        for label, value, terminator in report:
            custom_stat_file.write(label + value + terminator)


def count_reads_in_features(sam_filename, gff_filename, stranded, overlap_mode,
                            feature_type, id_attribute, quiet, minaqual, samout,
                            custom_stat):
    """Count SAM reads per GFF feature and optionally append detailed stats.

    Parameters:
        sam_filename: path of the SAM file, or "-" to read from stdin.
        gff_filename: path of the GFF file providing the features.
        stranded: "no", "reverse", or any other value for same-strand counting.
        overlap_mode: "union", "intersection-strict" or "intersection-nonempty".
        feature_type: only GFF records whose type equals this are used.
        id_attribute: GFF attribute used as the feature ID.
        quiet: suppress progress output and HTSeq warnings when true.
        minaqual: reads with an alignment quality below this are skipped.
        samout: path of an annotated SAM output file, or "" for none.
        custom_stat: path of a statistics file opened in append mode, or ""
            for none.

    The per-feature counts and the special counters are printed to stdout.
    Exits via ``sys.exit`` on an unusable feature or an illegal overlap mode;
    any other error is reported on stderr and re-raised.  Both output files
    are closed on every exit path.
    """
    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None
    custom_stat_file = None
    try:
        if custom_stat != "":
            custom_stat_file = open(custom_stat, "a")
        _count_reads_with_stats(sam_filename, gff_filename, stranded,
                                overlap_mode, feature_type, id_attribute,
                                quiet, minaqual, samoutfile, custom_stat_file)
    finally:
        if custom_stat_file is not None:
            custom_stat_file.close()
        if samoutfile is not None:
            samoutfile.close()
def count_reads_in_features(sam_filenames, colnames, gff_filename, opts):
    """Hacked version of htseq count.py that counts several SAM/BAM files.

    Parameters:
        sam_filenames: paths of the alignment files, one per output column.
        colnames: column names, parallel to ``sam_filenames``.
        gff_filename: path of the GFF file providing the features.
        opts: options object; the attributes read here are ``quiet``,
            ``stranded``, ``mapqMin``, ``feature_type``, ``id_attribute``,
            ``filter_extras`` and ``mode``.

    Also reads the names ``sam_exts`` and ``sam_bais`` (indexed like
    ``sam_filenames``), which are not defined in this function and must exist
    at module level.

    Returns:
        ``(counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered,
        nreads)`` where ``counts`` maps a feature ID to a list holding one
        count per input file; the other values are totals over all files.
    """
    if opts.quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")
    features = HTSeq.GenomicArrayOfSets("auto", opts.stranded != "no")
    mapqMin = int(opts.mapqMin)
    counts = {}
    nreads = 0
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    filtered = 0  # new filter_extras - need a better way to do this - independent filter tool?

    def carries_filter_tag(read):
        # True when any of the requested optional fields is present and truthy.
        # A tag that is simply absent must not abort the remaining checks.
        for extra in (opts.filter_extras or ()):
            try:
                if read.optional_field(extra):
                    return True
            except KeyError:
                continue
        return False

    gff = HTSeq.GFF_Reader(gff_filename)
    gff_lines = 0  # stays 0 for an empty GFF instead of leaving the name unbound
    try:
        for gff_lines, f in enumerate(gff, 1):
            if f.type == opts.feature_type:
                try:
                    feature_id = f.attr[opts.id_attribute]
                except KeyError:
                    try:
                        feature_id = f.attr['gene_id']
                    except KeyError:
                        sys.exit("Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                                 (gff_lines, f.name, opts.id_attribute))
                if opts.stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand information but you are "
                             "running htseq-count in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                # one slot per input file; sami indexes into it later
                counts[feature_id] = [0] * len(colnames)
    except:
        # Deliberately bare: annotates every failure (including sys.exit) with
        # the GFF position and always re-raises.
        sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
        raise

    if not opts.quiet:
        sys.stdout.write("%d GFF lines processed.\n" % gff_lines)

    if len(counts) == 0 and not opts.quiet:
        sys.stdout.write("Warning: No features of type '%s' found.\n" % opts.feature_type)

    for sami, sam_filename in enumerate(sam_filenames):
        colname = colnames[sami]
        isbam = sam_exts[sami] == 'bam'
        hasbai = sam_bais[sami] > ''
        if hasbai:
            # Hard-link BAM and index under matching names in the working
            # directory.  NOTE(review): these links are never removed here.
            tempname = os.path.splitext(os.path.basename(sam_filename))[0]
            tempbam = '%s_TEMP.bam' % tempname
            tempbai = '%s_TEMP.bai' % tempname
            os.link(sam_filename, tempbam)
            os.link(sam_bais[sami], tempbai)
        try:
            if isbam:
                reader = HTSeq.BAM_Reader(tempbam if hasbai else sam_filename)
            else:
                reader = HTSeq.SAM_Reader(sam_filename)
            first_read = next(iter(reader))
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading first line of %s file %s colname=%s \n\n" %
                             ("bam" if isbam else "sam", sam_filename, colname))
            raise

        seqi = 0
        try:
            read_seq = HTSeq.pair_SAM_alignments(reader) if pe_mode else reader
            for seqi, r in enumerate(read_seq, 1):
                nreads += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        continue
                    if carries_filter_tag(r):
                        # skip the read itself, and count it once
                        filtered += 1
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < mapqMin:
                        lowqual += 1
                        continue
                    if opts.stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    # NOTE(review): filter_extras is not applied to pairs,
                    # matching the original behaviour.
                    if r[0] is not None and r[0].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                      if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if opts.stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M" and co.size > 0))
                    else:
                        if r[0] is None or not r[0].aligned:
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < mapqMin) or (r[1] and r[1].aQual < mapqMin):
                        lowqual += 1
                        continue

                try:
                    if opts.mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or opts.mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode %s" % opts.mode)
                    if fs is None or len(fs) == 0:
                        empty += 1
                    elif len(fs) > 1:
                        ambiguous += 1
                    else:
                        ck = list(fs)[0]
                        counts[ck][sami] += 1  # end up with counts for each sample as a list
                except UnknownChrom:
                    if not pe_mode:
                        rr = r
                    else:
                        rr = r[0] if r[0] is not None else r[1]
                    empty += 1
                    if not opts.quiet:
                        sys.stdout.write(("Warning: Skipping read '%s', because chromosome " +
                                          "'%s', to which it has been aligned, did not appear in the GFF file.\n") %
                                         (rr.read.name, iv.chrom))
        except:
            sys.stderr.write("Error occured in %s.\n" % reader.get_line_number_string())
            raise

        if not opts.quiet:
            sys.stdout.write("%d sam %s processed for %s.\n" %
                             (seqi, "lines " if not pe_mode else "line pairs", colname))
    return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
def next_pair(self):
    """Yield the mate pairs produced from ``self.read_iter``.

    Each item coming out of ``ht.pair_SAM_alignments`` is unpacked into its
    two members and re-emitted as a ``(first, second)`` tuple.
    """
    paired_records = ht.pair_SAM_alignments(self.read_iter)
    for mates in paired_records:
        first, second = mates
        yield (first, second)
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
    """Count SAM reads per GFF feature (htseq-count with CR/SB additions).

    Reads the features of type ``feature_type`` from ``gff_filename``, then
    walks the SAM file (``sam_filename``, or stdin when it is "-") and prints
    a "Gene / Unique_reads" table followed by the special counters to stdout.
    When ``samout`` is not "", every processed read is also written there with
    an ``XF:Z:`` tag naming its assignment.

    On top of stock htseq-count, paired records rejected as non-unique
    (NH > 1) are tallied per feature in ``dict_nonunique`` with the help of
    ``is_read_in_gene_interval`` (defined elsewhere).  That tally is neither
    printed nor returned by the code visible here.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed copy of this function; the placement of the
    ``dict_nonunique[...] += 1`` lines relative to the surrounding ``if``
    should be confirmed against the original file.
    """

    def write_to_samout( r, assignment ):
        # Append the assignment as an XF tag to each present read of the record.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write( read.original_sam_line.rstrip() +
                    "\tXF:Z:" + assignment + "\n" )

    if quiet:
        warnings.filterwarnings( action="ignore", module="HTSeq" )

    # NOTE(review): this handle is only closed on the success path below.
    if samout != "":
        samoutfile = open( samout, "w" )
    else:
        samoutfile = None

    features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
    counts = {}
    ## added by CR
    # per-feature tally of multi-mapped pairs, filled in the paired branch
    dict_nonunique = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open( sam_filename ).close()

    gff = HTSeq.GFF_Reader( gff_filename )
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[ id_attribute ]
                except KeyError:
                    sys.exit( "Feature %s does not contain a '%s' attribute" %
                        ( f.name, id_attribute ) )
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit( "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        ( f.name, f.iv ) )
                features[ f.iv ] += feature_id
                counts[ f.attr[ id_attribute ] ] = 0
                ##added by CR
                dict_nonunique[ f.attr[ id_attribute ] ] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d GFF lines processed.\n" % i )
    except:
        # bare except on purpose: report where the GFF parse stopped, re-raise
        sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d GFF lines processed.\n" % i )

    if len( counts ) == 0 and not quiet:
        sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

    try:
        if sam_filename != "-":
            read_seq = HTSeq.SAM_Reader( sam_filename )
            first_read = iter(read_seq).next()
        else:
            # stdin: peek at the first record, then chain it back in front
            read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
            first_read = read_seq.next()
            read_seq = itertools.chain( [ first_read ], read_seq )
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write( "Error occured when reading first line of sam file.\n" )
        raise

    try:
        if pe_mode:
            read_seq_pe_file = read_seq
            read_seq = HTSeq.pair_SAM_alignments( read_seq )
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        #added by SB
        # name / interval of the last multi-mapped read that was tallied
        temp_read_name="NA"
        temp_interval_r0="NA"
        temp_interval_r1="NA"
        ## added by CR
        # NOTE(review): nonunique2 is never read in the visible code.
        nonunique2 = 0
        #added by SB
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    write_to_samout( r, "not_aligned" )
                    continue
                try:
                    if r.optional_field( "NH" ) > 1:
                        write_to_samout( r, "alignment_not_unique" )
                        nonunique += 1
                        continue
                except KeyError:
                    # no NH tag: treat the read as unique
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    write_to_samout( r, "too_low_aQual" )
                    continue
                if stranded != "reverse":
                    iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
                else:
                    iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
                    else:
                        iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # second mate contributes with the opposite strand handling
                    if stranded != "reverse":
                        iv_seq = itertools.chain( iv_seq,
                            ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
                    else:
                        iv_seq = itertools.chain( iv_seq,
                            ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
                else:
                    if ( r[0] is None ) or not ( r[0].aligned ):
                        write_to_samout( r, "not_aligned" )
                        notaligned += 1
                        continue
                # NOTE(review): everything inside this try is covered by the
                # `except KeyError: pass` below, so a KeyError raised by the
                # dict_nonunique updates would let the pair fall through to
                # normal counting after nonunique was already incremented.
                try:
                    if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                            ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                        nonunique += 1
                        # NOTE(review): the `is not` tests below compare object
                        # identity, not interval equality -- confirm intent.
                        if ( r[0] is not None and r[1] is None ):
                            # only the first mate is present
                            result, fs_new = is_read_in_gene_interval(r[0], features)
                            if result:
                                if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    dict_nonunique[ list(fs_new)[0]] += 1
                        if ( r[0] is None and r[1] is not None ):
                            # only the second mate is present
                            result, fs_new = is_read_in_gene_interval(r[1], features)
                            if result:
                                if ((temp_read_name != r[1].read.name) and ( temp_interval_r1 is not r[1].iv) ):
                                    temp_read_name=r[1].read.name
                                    temp_interval_r1=r[1].iv
                                    dict_nonunique[ list(fs_new)[0]] += 1
                        if ( r[0] is not None and r[1] is not None ):
                            # both mates present: query each one separately
                            result1, fs_new1 = is_read_in_gene_interval(r[0], features)
                            result2, fs_new2 = is_read_in_gene_interval(r[1], features)
                            if result1 and not result2:
                                if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                                    temp_interval_r0=r[0].iv
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    dict_nonunique[ list(fs_new1)[0]] += 1
                            elif result2 and not result1:
                                if ((temp_read_name != r[1].read.name)and ( temp_interval_r1 is not r[1].iv)):
                                    temp_read_name=r[1].read.name
                                    temp_interval_r1=r[1].iv
                                    dict_nonunique[ list(fs_new2)[0]] += 1
                            elif result1 and result2:
                                if ((temp_read_name != r[0].read.name) and (temp_interval_r0 is not r[0].iv ) and \
                                        ( temp_interval_r1 is not r[1].iv) ):
                                    temp_read_name=r[0].read.name
                                    temp_interval_r0=r[0].iv
                                    temp_interval_r1=r[1].iv
                                    # tally each distinct feature once
                                    if list(fs_new1)[0] != list(fs_new2)[0]:
                                        dict_nonunique[ list(fs_new1)[0]] += 1
                                        dict_nonunique[ list(fs_new2)[0]] += 1
                                    else:
                                        dict_nonunique[ list(fs_new1)[0]] += 1
                        # (SB: an earlier union-over-steps variant of this
                        # tally was disabled here)
                        write_to_samout( r, "alignment_not_unique" )
                        continue
                except KeyError:
                    pass
                if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
                    lowqual += 1
                    write_to_samout( r, "too_low_aQual" )
                    continue

            try:
                if overlap_mode == "union":
                    # every feature touched by any aligned segment
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            fs = fs.union( fs2 )
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    # features common to the steps; "nonempty" ignores steps
                    # that carry no feature at all
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection( fs2 )
                else:
                    sys.exit( "Illegal overlap mode." )
                if fs is None or len( fs ) == 0:
                    write_to_samout( r, "no_feature" )
                    empty += 1
                elif len( fs ) > 1:
                    write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
                    ambiguous += 1
                else:
                    write_to_samout( r, list(fs)[0] )
                    counts[ list(fs)[0] ] += 1
            except UnknownChrom:
                # the read is skipped without touching any counter
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                if not quiet:
                    sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                        "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                        ( rr.read.name, iv.chrom ) )

            if i % 100000 == 0 and not quiet:
                sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

    except:
        # NOTE(review): in stdin mode read_seq / read_seq_pe_file is an
        # itertools.chain, which presumably lacks get_line_number_string.
        if not pe_mode:
            sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
        else:
            sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
        raise

    if not quiet:
        sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

    if samoutfile is not None:
        samoutfile.close()

    print "Gene\tUnique_reads"
    for fn in sorted( counts.keys() ):
        print "%s\t%d" % ( fn, counts[fn])
    print "no_feature\t%d" % empty
    print "ambiguous\t%d" % ambiguous
    print "too_low_aQual\t%d" % lowqual
    print "not_aligned\t%d" % notaligned
    print "alignment_not_unique\t%d" % nonunique
choices=['single_end', 'paired_end'], type=str, default="paired_end", help=""" whether the data is from a single-end read library, or a paired-end library. Default is paired-end. """) args = parser.parse_args() # define some test files: samfile = '/home/antqueen/booster/PRO_Odontomachus/trinity_denovo_normalized_camponotus/Star/Cplan_Q2_16Aligned.out.sam' gtffile = '/home/antqueen/genomics/experiments/analyses/PRO20160405_camponotus/trinity_denovo_normalized_camponotus/Transdecoder_ss/merge_genesets/Cpla_td_gff.Apr21_11.15.families.gtf' # create gtf iterator print "\nReading gtf file %s..." % (args.gtf_file[0]), gtf = hts.GFF_Reader(args.gtf_file[0]) print " done." # create genomic array and populate with exon features (transcripts and genes) print "Populating genomic array with GTF features...", sys.stdout.flush() if args.stranded == 'yes': feature_array = hts.GenomicArrayOfSets("auto", stranded=True) elif args.stranded == 'no': feature_array = hts.GenomicArrayOfSets("auto", stranded=False) for feature in gtf: if feature.type == args.type: feature_array[feature.iv] += feature.name
# Sort your SAM file by read ID, so that multiple mappings of a read are in
# adjacent lines, then pipe it through this script to keep only the best one.
# Written by Simon Anders
import re
import sys

# Matches the NH (number of reported alignments) optional field of a SAM line.
_NH_TAG = re.compile(r"NH:i:\d+")


def choose_best_alignment(bundle):
    """Return the alignment with the strictly highest ``aQual`` in *bundle*.

    Returns ``None`` when *bundle* is empty or when the highest quality is
    shared by more than one alignment, in which case the read is better
    skipped.  A tie is tracked with a flag rather than by forgetting the
    current best, so a later, worse alignment can never win by default.
    """
    best = None
    tied = False
    for almt in bundle:
        if best is None or almt.aQual > best.aQual:
            best = almt
            tied = False
        elif almt.aQual == best.aQual:
            tied = True
    return None if tied else best


def set_nh_to_one(sam_line):
    """Return *sam_line* with every ``NH:i:<n>`` field rewritten to ``NH:i:1``."""
    return _NH_TAG.sub("NH:i:1", sam_line)


def filter_best_alignments(instream, outstream):
    """Write the best alignment of each read found in *instream*.

    Reads SAM records from *instream*, bundles the alignments of each read
    and, for every read with a single best alignment, writes that line to
    *outstream* with its NH field set to 1.
    """
    # Imported here so the helpers above stay usable without HTSeq installed.
    import HTSeq

    insam = HTSeq.SAM_Reader(instream)
    # Go through all reads, with their alignments bundled up:
    for bundle in HTSeq.bundle_multiple_alignments(insam):
        best = choose_best_alignment(bundle)
        if best is not None:
            # Change the NH field to 1 and print the line
            outstream.write(set_nh_to_one(best.original_sam_line) + "\n")


if __name__ == "__main__":
    filter_best_alignments(sys.stdin, sys.stdout)

# Call this script with the command:
#   sort samfile.sam | python chooseBest.py > filtered.sam
if line.startswith( '@' ): # count lines starting with '@' so extra newlines at the end don't throw off the read count n_reads = n_reads + 1 else: with open(filepath) as f: for line in f: if line.startswith('@'): n_reads = n_reads + 1 return n_reads n_reads_r1 = read_count(args.r1) # read in fastq file: fastq_r1 = HTSeq.FastqReader(args.r1) #n_reads_r1=len(list(fastq_r1)) # this was accurate, but way too memory intensive for large fastqs. PAIRED_END = False # default is to process single-end reads if args.r2: fastq_r2 = HTSeq.FastaReader(args.r2) PAIRED_END = True #n_reads_r2=len(list(fastq_r2)) n_reads_r2 = read_count(args.r2) if not n_reads_r1 == n_reads_r2: sys.exit("r1 and r2 have different read counts!") # determine how many reads to return: if args.percent:
def main():
    """Count SAM alignments per GFF feature, with extra per-read filters.

    Parses the command line, reads the GFF features through ``gff_reader``,
    then walks the SAM records (paired by name or by position).  A read only
    contributes its matched ("M") CIGAR intervals when it passes the minimum
    length, maximum soft-clipped fraction, minimum aligner score and minimum
    percent-identity filters.  The union of all features overlapped by the
    surviving intervals decides whether the record is counted for a feature,
    reported as ambiguous, or reported as having no feature.

    Optional side outputs: an annotated SAM file (``--samout``), a flat file
    of rejected reads (``--not_aligned``), FASTA files of uniquely assigned
    (``--fasta``) and ambiguous (``--ambiguous_out``) reads, and the count
    table (``--out``).

    Exits with status 1 when no input file or no GFF file is given.
    """
    exe_parser = argparse.ArgumentParser()
    exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]')
    exe_parser.add_argument("-u", "--not_aligned",
                            help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
                            type=str)
    exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str)
    exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].",
                            type=str)
    exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true")
    exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
    exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str)
    exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).",
                            type=int)
    exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).", type=int)
    exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).", type=int)
    exe_parser.add_argument("-c", "--max_clip",
                            help="proportion of bases clipped from read for alignment. (default = 0.3).", type=float)
    exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).",
                            type=str, choices=["y", "n", "reverse"], default="n")
    exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str)
    exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).",
                            type=str)
    # Parsed as int: a string threshold never compares meaningfully with the
    # integer aQual of an alignment.
    exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).", type=int)
    exe_parser.add_argument("-p", "--paired_end_mode",
                            help="input is paired end sorted by name (n) or position (p) . (default = p).",
                            type=str, choices=["p", "n"], default="p")
    exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str)
    args = exe_parser.parse_args()

    # The parser only admits 'p' or 'n', so the input is always treated as
    # paired-end; the single-end branch further down is kept for completeness.
    pe_order = args.paired_end_mode
    paired_end = pe_order in ('p', 'n')

    if not args.infile:
        print("no input file given. exiting...")
        sys.exit(1)
    try:
        if args.infile == '-':
            # get sam on a stream
            alignments = HTSeq.SAM_Reader(sys.stdin)
        else:
            alignments = HTSeq.SAM_Reader(args.infile)
            if paired_end:
                # Pull the first record so an empty/unreadable file fails here,
                # then put it back in front of the remaining records.
                read_seq_iter = iter(alignments)
                first_read = next(read_seq_iter)
                alignments = itertools.chain([first_read], read_seq_iter)
        # Pair the raw alignments exactly once (the records must not be run
        # through a pairing function twice).
        if not paired_end:
            reader = alignments
        elif pe_order == 'p':
            reader = HTSeq.pair_SAM_alignments_with_buffer(alignments)
        else:
            reader = HTSeq.pair_SAM_alignments(alignments)
    except Exception:
        print("failed processing SAM/BAM file")
        raise

    if args.gff:
        gff_file = args.gff
    else:
        print("no gff file given. exiting...")
        sys.exit(1)

    verbose = bool(args.verbose)
    # "is None" tests so that an explicit 0 on the command line is honoured.
    min_read_len = 60 if args.min_read_length is None else args.min_read_length
    max_clip_ = 0.3 if args.max_clip is None else float(args.max_clip)
    min_id = 80.0 if args.min_id is None else float(args.min_id)
    min_score = 0 if args.min_score is None else int(args.min_score)
    minaqual = 0 if args.minaqual is None else args.minaqual
    stranded = {'n': 'no', 'y': 'yes', 'reverse': 'reverse'}[args.stranded]
    id_attribute = args.idattr or "GeneID"
    feature_type = args.type or 'CDS'

    # parse GFF file
    features, counts = gff_reader(gff_file, feature_type, id_attribute, verbose, stranded)

    samoutfile = open(args.samout, "w") if args.samout else None
    ambiguousfile = open(args.ambiguous_out, "w") if args.ambiguous_out else None
    fastafile = open(args.fasta, "w") if args.fasta else None
    not_aligned_file = open(args.not_aligned, "w") if args.not_aligned else None
    outfile = open(args.out, "w") if args.out else None

    def log_rejection(record, samout_label, read_names, flat_label=None):
        """Report a rejected record to the SAM and flat-file side outputs."""
        if samoutfile is not None:
            write_to_samout(samoutfile, paired_end, record, samout_label)
        if not_aligned_file is not None:
            label = samout_label if flat_label is None else flat_label
            for name in read_names:
                not_aligned_file.write(name + '\t' + label + '\n')

    def match_intervals(read):
        """Reference intervals of the matched CIGAR operations of *read*."""
        if stranded == "reverse":
            return (invert_strand(cigar_operation.ref_iv) for cigar_operation in read.cigar
                    if cigar_operation.type == "M" and cigar_operation.size > 0)
        return (cigar_operation.ref_iv for cigar_operation in read.cigar
                if cigar_operation.type == "M" and cigar_operation.size > 0)

    def assess(record, read):
        """Apply the length/clipping/score/identity filters to one aligned read.

        *record* is what gets written to the SAM side output (the whole pair
        in paired-end mode).  Returns (passed, clipped_fraction, score,
        percent_id).
        """
        name = read.read.name
        length = len(read.read.seq)
        # just the cigar string (6th SAM column) without the HTSeq additions
        cigar_string = parse_cigar(read.original_sam_line.split('\t')[5])
        # NOTE(review): the original unpacked the 3rd and the 5th returned
        # value into the same "insertions" variable, so the 5th one won.  That
        # is preserved here -- confirm against parse_cigar_alignment.
        soft_clipped, _matched, _overwritten, deletions, insertions = parse_cigar_alignment(cigar_string)
        # get alignment data from md string
        score, md_matches, _md_deletions, _md_mismatches = parse_opt_fields(read.optional_fields)
        percent_id = 100.0 * (float(md_matches) / float(length - soft_clipped + insertions + deletions))
        clipped = float(soft_clipped) / float(length)
        passed = False
        # Reads shorter than the minimum are dropped without a report.
        if int(length) >= int(min_read_len):
            if clipped > float(max_clip_):
                log_rejection(record, 'too_many_bases_clipped_from_read=' + str(soft_clipped), [name])
            elif int(score) < int(min_score):
                log_rejection(record, 'alignment_score_too_low=' + str(score), [name])
            elif float(percent_id) < float(min_id):
                log_rejection(record, 'percent_id_too_low=' + str(percent_id), [name])
            else:
                passed = True
        return passed, clipped, score, percent_id

    def fasta_record(name, tag, clipped, score, percent_id, seq):
        """Format one FASTA entry whose header carries the alignment stats."""
        return ('>' + name + '_' + tag + '_clipped_' + str(clipped) + '_score_' + str(score)
                + '_percent_id_' + str(percent_id) + '\n' + seq + '\n')

    # counter variables
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    read_counter = 0

    for alignment in reader:
        # Per-record state is reset here so nothing leaks over from the
        # previous record (stale intervals or stale "passed" flags).
        iv_seq = tuple()
        # (name, sequence, clipped_fraction, score, percent_id) of every read
        # of this record that passed all filters.
        passing = []

        if not paired_end:
            if read_counter % 1000000 == 0 and verbose:
                print('%d non paired-end alignments processed' % read_counter)
            read_name = alignment.read.name
            if not alignment.aligned:
                notaligned += 1
                log_rejection(alignment, "not_aligned", [read_name])
            else:
                try:
                    multi_mapped = alignment.optional_field("NH") > 1
                except KeyError:
                    multi_mapped = False
                # NOTE(review): as in the original, multi-mapped and
                # low-quality single-end reads are reported but not skipped.
                if multi_mapped:
                    nonunique += 1
                    log_rejection(alignment, "alignment_not_unique", [read_name])
                if alignment.aQual < minaqual:
                    lowqual += 1
                    log_rejection(alignment, "too_low_aQual", [read_name])
                passed, clipped, score, percent_id = assess(alignment, alignment)
                if passed:
                    iv_seq = match_intervals(alignment)
                    passing.append((read_name, alignment.read.seq, clipped, score, percent_id))
        else:
            if read_counter % 100000 == 0 and verbose:
                print('%d alignment pairs processed' % read_counter)
            first, second = alignment[0], alignment[1]
            # NOTE(review): as in the original, a pair with an unaligned mate
            # is counted as not aligned and then still falls through to the
            # feature lookup below, where it is also reported as no_feature.
            if first is None or not first.aligned:
                notaligned += 1
                read_1_name = first.read.name if first is not None else 'None'
                log_rejection(alignment, "not_aligned", [read_1_name])
            elif second is None or not second.aligned:
                notaligned += 1
                read_2_name = second.read.name if second is not None else 'None'
                log_rejection(alignment, "not_aligned", [read_2_name])
            else:
                read_1_name = first.read.name
                read_2_name = second.read.name

                passed_1, clipped_1, score_1, percent_1_id = assess(alignment, first)
                if passed_1:
                    iv_seq = itertools.chain(iv_seq, match_intervals(first))
                    passing.append((read_1_name, first.read.seq, clipped_1, score_1, percent_1_id))

                passed_2, clipped_2, score_2, percent_2_id = assess(alignment, second)
                if passed_2:
                    iv_seq = itertools.chain(iv_seq, match_intervals(second))
                    passing.append((read_2_name, second.read.seq, clipped_2, score_2, percent_2_id))
                    # NOTE(review): as in the original, the uniqueness and
                    # mapping-quality checks only run when read 2 passed its
                    # own filters, and a rejected pair skips the read counter.
                    try:
                        multi_mapped = (first.optional_field("NH") > 1) or (second.optional_field("NH") > 1)
                    except KeyError:
                        multi_mapped = False
                    if multi_mapped:
                        # By default these reads are discarded.
                        nonunique += 1
                        log_rejection(alignment, "alignment_not_unique", [read_1_name, read_2_name],
                                      flat_label='not_aligned')
                        continue
                    if first.aQual < minaqual or second.aQual < minaqual:
                        lowqual += 1
                        log_rejection(alignment, "too_low_aQual", [read_1_name, read_2_name],
                                      flat_label='not_aligned')
                        continue

        read_counter += 1

        # "union" overlap: a hit is counted even if the read is mapped across
        # an intron or contains an insertion.
        feature_set = set()
        try:
            for iv in iv_seq:
                # the chromosome/contig of the alignment must be known from the GFF
                if iv.chrom not in features.chrom_vectors:
                    raise UnknownChrom
                for _step_iv, step_features in features[iv].steps():
                    feature_set.update(step_features)
        except Exception:
            # Best effort, as before: any failure during the lookup (typically
            # an unknown chromosome) counts the record as having no feature.
            # Unlike a bare "except:", this lets KeyboardInterrupt/SystemExit
            # through and no longer hides errors from the output code below.
            if samoutfile is not None:
                write_to_samout(samoutfile, paired_end, alignment, "__no_feature")
            empty += 1
            continue

        if not feature_set:
            log_rejection(alignment, "no_feature", ['None'])
            empty += 1
        elif len(feature_set) > 1:
            tag = "ambiguous[" + '+'.join(feature_set) + "]"
            if samoutfile is not None:
                write_to_samout(samoutfile, paired_end, alignment, tag)
            if ambiguousfile is not None:
                # Each read is reported with its own stats.
                for name, seq, clipped, score, percent_id in passing:
                    ambiguousfile.write(fasta_record(name, tag, clipped, score, percent_id, seq))
            ambiguous += 1
        else:
            feature_id = list(feature_set)[0]
            if samoutfile is not None:
                write_to_samout(samoutfile, paired_end, alignment, feature_id)
            if fastafile is not None:
                for name, seq, clipped, score, percent_id in passing:
                    fastafile.write(fasta_record(name, feature_id, clipped, score, percent_id, seq))
            counts[feature_id] += 1

    print('total %d alignments processed' % read_counter)

    # Close every side output that was opened.
    for handle in (samoutfile, fastafile, ambiguousfile, not_aligned_file):
        if handle is not None:
            handle.close()

    if outfile is not None:
        for feature in sorted(counts.keys()):
            outfile.write("%s\t%d\n" % (feature, counts[feature]))
        outfile.write("no_feature\t%d\n" % empty)
        outfile.write("ambiguous\t%d\n" % ambiguous)
        outfile.write("too_low_aQual\t%d\n" % lowqual)
        outfile.write("not_aligned\t%d\n" % notaligned)
        outfile.write("alignment_not_unique\t%d\n" % nonunique)
        outfile.close()
def build_gene_model(g, GFF_dict):
    """Return the per-base coding model of gene *g*.

    ``GFF_dict[g]`` must provide ``exonCount``, comma-separated ``exonStarts``
    and ``exonEnds``, ``cdsStart``, ``cdsEnd``, ``strand`` and ``chrom``.

    The result is an unstranded ``HTSeq.GenomicArrayOfSets`` in which every
    coding base carries the label ``"<exon>_<codon>_<cDNA pos>_<codon pos>"``:
    exon number, 1-based codon number, 1-based position within the coding
    sequence and 0-based position within the codon.  Bases are numbered in
    transcription direction, i.e. from high to low coordinates on the minus
    strand.

    Raises IndexError when no exon contains the start or the stop codon.
    """
    gene = GFF_dict[g]
    gene_model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    exon_count = int(gene['exonCount'])
    exon_start = [int(j) for j in gene['exonStarts'].split(",")[:exon_count]]
    exon_end = [int(j) for j in gene['exonEnds'].split(",")[:exon_count]]
    minus = gene['strand'] == "-"

    if minus:
        # On the minus strand translation starts at the high coordinate, so
        # the exons are walked in reverse order.
        start_codon = int(gene['cdsEnd'])
        stop_codon = int(gene['cdsStart'])
        exons = [[s, e] for s, e in zip(reversed(exon_start), reversed(exon_end))]
        start_exon = [x for x in exons if x[0] < start_codon and x[1] >= start_codon][0]
        pre_exon = len([x for x in exons if x[1] >= start_codon])
        end_exon = [x for x in exons if x[0] <= stop_codon and x[1] > stop_codon][0]
    else:
        start_codon = int(gene['cdsStart'])
        stop_codon = int(gene['cdsEnd'])
        exons = [[s, e] for s, e in zip(exon_start, exon_end)]
        start_exon = [x for x in exons if x[0] <= start_codon and x[1] > start_codon][0]
        pre_exon = len([x for x in exons if x[0] <= start_codon])
        end_exon = [x for x in exons if x[0] < stop_codon and x[1] >= stop_codon][0]

    start_index = exons.index(start_exon)
    end_index = exons.index(end_exon)

    # Coding part of every exon, in transcription order.
    if start_index == end_index:
        # The whole CDS lies inside one exon: it must be walked once, bounded
        # by both codons.  Treating it as separate "start" and "end" exons
        # would label the UTR bases and label the CDS bases twice.
        if minus:
            exons_cood = [(stop_codon, start_codon)]
        else:
            exons_cood = [[start_codon, stop_codon]]
    else:
        if minus:
            first_part = (start_exon[0], start_codon)
            last_part = (stop_codon, end_exon[1])
        else:
            first_part = [start_codon, start_exon[1]]
            last_part = [end_exon[0], stop_codon]
        exons_cood = [first_part]
        exons_cood.extend(exons[start_index + 1:end_index])
        exons_cood.append(last_part)

    chrom = gene['chrom']
    cDNA_part = 0
    exon_no = pre_exon - 1  # incremented before use, so numbering starts at pre_exon
    codon_n = 1
    codon_partition = 0
    for segment in exons_cood:
        exon_no += 1
        positions = range(segment[0], segment[1])
        if minus:
            positions = reversed(positions)
        for location in positions:
            cDNA_part += 1
            if codon_partition == 3:
                codon_n += 1
                codon_partition = 0
            in_name = str(exon_no) + "_" + str(codon_n) + "_" + str(cDNA_part) + "_" + str(codon_partition)
            codon_partition += 1
            gene_model[HTSeq.GenomicInterval(chrom, location, location + 1)] += in_name
    return gene_model
def intersectcirc(self, circ_file, modified_gtf_file):
    """Map circle start/end intervals to the exon ids they overlap.

    *circ_file* is a BED-like file (per the original note: the result file of
    print_start_end_file) and *modified_gtf_file* a GTF whose attributes hold
    a ``custom_exon_id``.  Equivalent to::

        intersectBed -a start.bed -b <gtf> -wa -wb -loj

    Returns a dict keyed by ``HTSeq.GenomicInterval`` (circle start or end
    interval) whose values are sets of ``custom_exon_id`` strings.  Intervals
    without any overlapping exon are left out.
    """
    import pybedtools

    circles = pybedtools.BedTool(circ_file)
    annotation = pybedtools.BedTool(modified_gtf_file)
    joined = circles.intersect(annotation, wa=True, wb=True, loj=True)

    circExons = {}
    for record in joined:
        fields = str(record).split('\t')
        # A '.' in the attribute column marks a left-outer-join row that found
        # no overlapping GTF feature.
        if fields[11].strip('\n') == '.':
            continue
        interval = HTSeq.GenomicInterval(fields[0], int(fields[1]), int(fields[2]), fields[9])
        exon_id = HTSeq.parse_GFF_attribute_string(fields[11])['custom_exon_id']
        if interval not in circExons:
            circExons[interval] = set()
        circExons[interval].add(exon_id)
    return circExons
def add_raw_reads_to_a_peak_region(self, peak, ga, range_to_mark=None):
    """Return per-base values for the 2000-base window around a peak centre.

    *peak* is a (start, end) pair; the window runs from 1000 bases left of its
    centre to 1000 bases right of it.  When the centre is within 1000 bases of
    coordinate 1, the window starts at 1 and the result is left-padded with
    zeros so that the centre keeps its offset in the returned list.

    Without *range_to_mark* the values are read from *ga* (indexed with
    ``HTSeq.GenomicInterval`` objects built from ``self.chrom`` and
    ``self.strand``).  With *range_to_mark* (a (start, end) pair) *ga* is not
    used and the list holds 1 for window positions inside the range and 0
    elsewhere.

    Returns ``[0]`` when the centre is less than 2 bases from the left border.
    """
    half_width = 1000
    peak_center = int(float(peak[1] + peak[0]) / 2.)
    if peak_center <= half_width:
        left_border = 1
        # The window is cut short on the left: make up for the missing bases
        # (the original computed this length with the sign inverted, so the
        # padding was always empty).
        left_pad = [0] * (half_width - peak_center + 1)
    else:
        left_border = peak_center - half_width
        left_pad = []
    right_border = peak_center + half_width
    if peak_center - left_border < 2:
        return [0]

    if range_to_mark is not None:
        # Clamp the range to the window; an empty clamp result means the two
        # do not overlap.  Clamping also handles a range that encloses the
        # whole window, which has neither endpoint inside it.
        left_marks_border = max(range_to_mark[0], left_border)
        right_marks_border = min(range_to_mark[1], right_border)
        if right_marks_border <= left_marks_border:
            return left_pad + [0] * (int(right_border) - int(left_border))
        marks = list(left_pad)
        marks += [0] * (left_marks_border - left_border)
        marks += [1] * (right_marks_border - left_marks_border)
        marks += [0] * (right_border - right_marks_border)
        return marks

    left_iv = HTSeq.GenomicInterval(self.chrom, int(left_border), peak_center, self.strand)
    right_iv = HTSeq.GenomicInterval(self.chrom, peak_center, peak_center + half_width, self.strand)
    peak_raw = list(left_pad)
    # Expand the steps of constant value into one entry per base.
    for iv, score in ga[left_iv].steps():
        peak_raw += [score] * (iv.end - iv.start)
    for iv, score in ga[right_iv].steps():
        peak_raw += [score] * (iv.end - iv.start)
    return peak_raw
def count_reads(features, counts, pe_mode, read_seq, order, stranded, overlap_mode, quiet, minaqual, write_to_samout):
    """Assign SAM alignments to features and print the count table.

    Parameters:
      features        -- genomic array of feature-id sets; must offer
                         ``chrom_vectors`` and ``features[iv].steps()``
      counts          -- dict feature id -> count, updated in place
      pe_mode         -- True when *read_seq* holds paired-end alignments
      read_seq        -- iterable of SAM alignments
      order           -- "name" or "pos": sort order of paired-end input
      stranded        -- "reverse" flips the strand of the first mate (or of
                         a single-end read); any other value flips the second
                         mate instead
      overlap_mode    -- "union", "intersection-strict" or
                         "intersection-nonempty"
      quiet           -- suppress the progress messages on stderr
      minaqual        -- alignments with a lower aQual are skipped
      write_to_samout -- callback ``(read_or_pair, assignment)``

    Prints one ``id<TAB>count`` line per feature plus the special counters to
    stdout.  Raises ValueError for an illegal *order* (paired-end only) and
    exits via ``sys.exit`` for an illegal *overlap_mode*.
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            raise ValueError("Illegal order specified.")

    # Reject a bad mode up front rather than in the middle of the run, after
    # part of the input has already been processed and reported.
    if overlap_mode not in ("union", "intersection-strict", "intersection-nonempty"):
        sys.exit("Illegal overlap mode.")

    def matched_intervals(almnt, invert):
        """Reference intervals of the matched ("M") CIGAR operations."""
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar if co.type == "M" and co.size > 0)

    reverse = stranded == "reverse"
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n" % (i, "s" if not pe_mode else " pairs"))
        i += 1

        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # no NH tag: treat the alignment as unique
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            iv_seq = matched_intervals(r, reverse)
        else:
            if r[0] is not None and r[0].aligned:
                iv_seq = matched_intervals(r[0], reverse)
            else:
                iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
                # the second mate is flipped when the first one is not, and vice versa
                iv_seq = itertools.chain(iv_seq, matched_intervals(r[1], not reverse))
            else:
                if (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
            try:
                if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                        (r[1] is not None and r[1].optional_field("NH") > 1):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            else:
                # intersection modes; "nonempty" ignores steps without features
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        if len(fs2) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)
            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
                empty += 1
            elif len(fs) > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                ambiguous += 1
            else:
                write_to_samout(r, list(fs)[0])
                counts[list(fs)[0]] += 1
        except UnknownChrom:
            write_to_samout(r, "__no_feature")
            empty += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" % (i, "alignments " if not pe_mode else "alignment pairs"))

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
# Script set-up: read the command-line options, load the reference sequence,
# open the input BAM file, start the tabix queries for the variant files and
# create the output BAM file.
args = parser.parse_args()
sample_name = args.sample_name
chr_no = args.chr_no
bam_file = args.bam_file
ref_file = args.ref_file
print("chr_no ", chr_no)
snp_file = args.snp_file
indel_file = args.indel_file
# Chromosome label used for the output file name: optional prefix + number.
# NOTE(review): this name shadows the builtin chr().
if (args.chr_prefix):
    chr = args.chr_prefix + str(chr_no)
else:
    chr = str(chr_no)
# Load every sequence of the reference FASTA file, keyed by sequence name.
sequence = {}
for s in HTSeq.FastaReader(ref_file):
    sequence[s.name] = s
# NOTE(review): the lookup always uses the literal "chr" prefix, independent
# of args.chr_prefix -- confirm this matches the naming in the reference file.
reference_seq = sequence["chr" + str(chr_no)]
pos_ref = 0
samfile = pysam.Samfile(bam_file, "rb")
# Run tabix on the SNP and indel files for this chromosome; the output of
# each process is available through its stdout pipe.
# NOTE(review): chr_no is passed to tabix as-is (without the prefix) and must
# be a string for Popen -- confirm how the parser defines it.
haplotyped_snp_file = subprocess.Popen(['tabix', snp_file, chr_no], stdout=subprocess.PIPE)
haplotyped_indel_file = subprocess.Popen(['tabix', indel_file, chr_no], stdout=subprocess.PIPE)
#d={'hc':0,'hd':0,'bt':0,'ot':0,'rf':0,'fr':0}
# Output BAM file; it takes its header from the input file (template=samfile).
haplotypeC_bam = pysam.Samfile("haplotypeC_" + chr + ".bam", "wb", template=samfile)
def count_reads_onto_prebuilt_features(
    sam_filename, features, feature_ids, stranded, overlap_mode, quiet, minaqual, samout, umis=False
):
    """
    Count the reads (or read pairs) of a SAM file per feature, using a prebuilt feature array.

    :param sam_filename: path of the SAM file, or "-" to read from stdin
    :param features: interval-indexed feature array; must offer ``chrom_vectors``
        and ``features[iv].steps()`` yielding ``(interval, set_of_feature_ids)``
    :param feature_ids: iterable of every feature id that gets an entry in the result
    :param stranded: "reverse" inverts the strand of single reads / first mates;
        any other value inverts the strand of second mates instead
    :param overlap_mode: "union", "intersection-strict" or "intersection-nonempty"
    :param quiet: suppress HTSeq warnings and progress messages on stderr
    :param minaqual: reads with an alignment quality below this value are skipped
    :param samout: path of an annotated SAM output file, or "" for none
    :param umis: if true, report per feature the number of distinct UMI sequences
        (taken from ``:UMI:<seq>:`` in the read name) instead of the read count
    :returns: ``(feats, counts)``: the sorted feature ids followed by the five
        summary categories, and the matching list of counts
    :raises EmptySamError: if the SAM input contains no read at all
    """

    def write_to_samout(r, assignment):
        """Write the read (or both mates) to the SAM output with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n")

    def aligned_intervals(aln, invert):
        """Return the reference intervals of the non-empty 'M' CIGAR operations of *aln*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar if co.type == "M" and co.size > 0)

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    reverse = stranded == "reverse"
    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally so the SAM output handle is closed on every exit path.
    try:
        if umis:
            umi_re = re.compile(r":UMI:(\w+):")
            umi_counts = {}

            def count_umis(fs, read_name):
                umi_seq = umi_re.search(read_name).group(1)
                umi_counts[fs][umi_seq] += 1

            for feature_id in feature_ids:
                umi_counts[feature_id] = Counter()
        else:

            def count_umis(x, y):
                return None

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        counts = {}
        for feature_id in feature_ids:
            counts[feature_id] = 0

        # Peek at the first read to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the first read back in front.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            raise EmptySamError(sam_filename)

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = aligned_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = aligned_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is strand-inverted relative to the first.
                        iv_seq = itertools.chain(iv_seq, aligned_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or (
                            r[1] is not None and r[1].optional_field("NH") > 1
                        ):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + "+".join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                        # In paired-end mode r is a (mate1, mate2) tuple, so take
                        # the read name from whichever mate is present.
                        if not pe_mode:
                            named_read = r
                        else:
                            named_read = r[0] if r[0] is not None else r[1]
                        count_umis(feature_id, named_read.read.name)
                except UnknownChrom:
                    # Reads on chromosomes absent from the feature array count as
                    # "no_feature" and are reported as such in the SAM output too.
                    write_to_samout(r, "no_feature")
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))
        except:
            # Report where the input failed; the exception itself is re-raised.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string()
            )
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # sorted feature list. features+counts
    feats = sorted(counts)
    if umis:
        feature_counts = [len(umi_counts[fn]) for fn in feats]
    else:
        feature_counts = [counts[fn] for fn in feats]

    # cat statistics summary to feature+count list
    feats = feats + ["no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique"]
    feature_counts = feature_counts + [empty, ambiguous, lowqual, notaligned, nonunique]
    return (feats, feature_counts)
# Deal with any GFF file reading errors except ValueError as e: e.args += ( gff.get_line_number_string(), ) raise try: # Get the first read to see if we're dealing with paired-end data read_seq = HTSeq.SAM_Reader(options.sam) first_read = iter(read_seq).next() pe_mode = first_read.paired_end # Re-initialize read_seq depending on if it's paired-end data or not read_seq = HTSeq.SAM_Reader(options.sam) if pe_mode: read_seq = HTSeq.pair_SAM_alignments(read_seq) # Read counter, for feedback to user i = 0 total = 0 # Here we go, through each read... for r in read_seq: spliced = False if not pe_mode: if not r.aligned: continue total += 1 iv_seq = [] # Check to see if it's spliced for co in r.cigar:
def htseq_count(data):
    """
    adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    Count the reads of the sample's alignment file per gene ("gene_id" of
    "exon" features, overlap mode "union") and write two tab-separated files:
    the per-feature counts (``out_file``) and the summary statistics
    (``stats_file``). Both paths come from ``_get_files(data)``.

    :param data: sample dictionary; ``data["config"]`` is used to determine strandedness
    :returns: path of the count file; if it already exists nothing is recomputed
    """

    def aligned_intervals(aln, invert):
        """Return the reference intervals of the non-empty 'M' CIGAR operations of *aln*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar if co.type == "M" and co.size > 0)

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0
    reverse = stranded == "reverse"
    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s." %
                (os.path.basename(sam_filename),
                 os.path.basename(gff_filename),
                 _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Report the offending GFF position; the exception is re-raised.
        sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

    # Peek at the first read to find out whether the data is paired-end.
    try:
        align_reader = htseq_reader(sam_filename)
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # Single-end data is iterated directly from the reader.
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = aligned_intervals(r, reverse)
            else:
                if r[0] is not None and r[0].aligned:
                    iv_seq = aligned_intervals(r[0], reverse)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    # The second mate is strand-inverted relative to the first.
                    iv_seq = itertools.chain(iv_seq, aligned_intervals(r[1], not reverse))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reads on chromosomes absent from the annotation count as "no_feature".
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n" %
                                 (i, "lines " if not pe_mode else "line pairs"))
    except:
        # The reader object tracks the input position in both modes.
        sys.stderr.write("Error occured in %s.\n" % align_reader.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet, minaqual,
                            samout, include_non_annotated, htseq_no_ambiguous,
                            outputDiscarded):
    """
    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    Every read is tagged with an XF assignment and written either to `samout`
    (annotated reads) or, when `outputDiscarded` is given, to that file
    (filtered reads).

    :param sam_filename: input alignment file
    :param gff_filename: annotation file
    :param samtype: "sam" or "bam"
    :param order: not used by this function
    :param stranded: "no", "reverse" or another value for forward-stranded data
    :param overlap_mode: "union", "intersection-strict" or "intersection-nonempty"
    :param feature_type: GFF feature type to use
    :param id_attribute: GFF attribute used as feature id
    :param quiet: not used by this function
    :param minaqual: reads with a lower alignment quality are filtered out
    :param samout: output file for the annotated reads
    :param include_non_annotated: if true, "__no_feature" reads are kept in `samout`
    :param htseq_no_ambiguous: if true, "__ambiguous" reads are filtered out
    :param outputDiscarded: output file for the filtered reads, or None
    :returns: number of records written to `samout`
    :raises ValueError: on a feature without id attribute or strand, or an unknown samtype
    :raises RuntimeError: if no feature is found, the input cannot be opened,
        or the overlap mode is illegal

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under the terms of
    the GNU General Public License as published by the Free Software Foundation,
    either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    The full text of the GNU General Public License, version 3,
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Set up the filters
    count_reads_in_features.filter_htseq = \
        ["__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM/BAM output file(s), using the input file's header as template
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded = pysam.AlignmentFile(
                outputDiscarded, flag_write, template=saminfile)
    finally:
        saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(read, assignment):
        # Creates the PySAM record
        # to_pysam_AlignedSegment is the new method in HTSeq>=0.7.0 that
        # uses the latest Pysam API and reports the correct sequences
        sam_record = read.to_pysam_AlignedSegment(count_reads_in_features.samoutfile)
        sam_record.set_tag("XF", assignment, "Z")
        discarded = assignment in count_reads_in_features.filter_htseq or (
            count_reads_in_features.filter_htseq_no_ambiguous
            and "__ambiguous" in assignment)
        if not discarded:
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1
        elif outputDiscarded is not None:
            count_reads_in_features.samdiscarded.write(sam_record)

    # try/finally so the output files are closed on every exit path,
    # including failures while parsing the annotation.
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        gff = HTSeq.GFF_Reader(gff_filename)
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError("Feature %s does not contain a '%s' attribute"
                                     % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError("Feature %s at %s does not have strand information but you are "
                                     "running htseq-count in stranded mode. Use '--stranded=no'."
                                     % (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0

        if len(counts) == 0:
            raise RuntimeError("No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
        except Exception:
            raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

        reverse = stranded == "reverse"
        for r in read_seq:
            if not r.aligned:
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    write_to_samout(r, "__alignment_not_unique")
                    # Stop here: otherwise the read would also receive a feature
                    # assignment below and be written (and counted) a second time.
                    continue
            except KeyError:
                pass
            if r.aQual < minaqual:
                write_to_samout(r, "__too_low_aQual")
                continue
            if not reverse:
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None:
                    continue
                elif len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                else:
                    write_to_samout(r, list(fs)[0])
            except UnknownChrom:
                # Reads on chromosomes absent from the annotation are dropped.
                pass
    finally:
        count_reads_in_features.samoutfile.close()
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded.close()

    return count_reads_in_features.annotated
def count_reads_in_features(sam_filename, gff_filename, stranded, overlap_mode,
                            feature_type, id_attribute, quiet, minaqual, samout):
    """
    Count reads per feature and print counts, feature length and RPKM to stdout.

    The annotation is loaded through ``parse_gff`` and the RPKM / feature
    length of each feature is obtained from ``get_rpkm``; the per-read logic
    follows htseq-count.

    :param sam_filename: path of the SAM file, or "-" to read from stdin
    :param gff_filename: annotation file handed to ``parse_gff``
    :param stranded: "no", "reverse" or another value for forward-stranded data
    :param overlap_mode: "union", "intersection-strict" or "intersection-nonempty"
    :param feature_type: GFF feature type to count on
    :param id_attribute: GFF attribute used as feature id
    :param quiet: suppress HTSeq warnings and progress messages on stderr
    :param minaqual: reads with a lower alignment quality are skipped
    :param samout: path of an annotated SAM output file, or "" for none
    """

    def write_to_samout(r, assignment):
        """Write the read (or both mates) to the SAM output with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def aligned_intervals(aln, invert):
        """Return the reference intervals of the non-empty 'M' CIGAR operations of *aln*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar if co.type == "M" and co.size > 0)

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    reverse = stranded == "reverse"
    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally so the SAM output handle is closed on every exit path.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        counts, colgenes = parse_gff(gff_filename, features, feature_type,
                                     id_attribute, stranded, quiet, counts)

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        # Peek at the first read to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the first read back in front.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading first line of sam file.\n")
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = aligned_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = aligned_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is strand-inverted relative to the first.
                        iv_seq = itertools.chain(iv_seq, aligned_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                           (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            # collect everything found within the genomic interval iv
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # Reads on chromosomes absent from the annotation count as
                    # "no_feature" and are reported as such in the SAM output too.
                    write_to_samout(r, "no_feature")
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" %
                                     (i, "lines " if not pe_mode else "line pairs"))
        except:
            # Ask the reader object (not the stdin chain / pairing wrapper) for
            # the input position; the exception itself is re-raised.
            sys.stderr.write("Error occured in %s.\n" % read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # The total does not change while printing, so compute it once.
    total_on_features = sum(counts.values())
    for fn in sorted(counts.keys()):
        # NOTE(review): this line is emitted before every feature row and so
        # interleaves with the table -- looks like leftover debug output; confirm.
        print("%s %s" % (i, total_on_features))
        rpkm, feature_len = get_rpkm(counts[fn], i, colgenes[fn])
        print("%s\t%d\t%d\t%d" % (fn, counts[fn], feature_len, rpkm))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
"""Write long unaligned reads of a SAM file to FASTQ and plot unaligned read lengths.

Usage: <script> input.sam output.fastq
The histogram of the unaligned read lengths is saved as <output.fastq>.png.
"""
import sys
import matplotlib.pyplot as plt

USAGE = "Please enter input file (.sam) and output file (.fastq)!"
# Only unaligned reads longer than this many bases are written to the FASTQ file.
MIN_READ_LENGTH = 200

# Validate the command line before doing any work; sys.exit(message) prints
# the message to stderr and terminates with a non-zero status.
if len(sys.argv) < 3:
    sys.exit(USAGE)
input_file = sys.argv[1]
output_file = sys.argv[2]
if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    sys.exit(USAGE)

import HTSeq
import numpy as np

alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []
# NOTE(review): the length filter is applied to unaligned reads only; the
# nesting was reconstructed from a whitespace-mangled source -- confirm.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if aln.aligned:
            continue
        read_length = len(aln.read.seq)
        len_reads.append(read_length)
        if read_length > MIN_READ_LENGTH:
            myread = HTSeq.SequenceWithQualities(aln.read.seq, aln.read.name, aln.read.qualstr)
            myread.write_to_fastq_file(my_fastq_file)

# Histogram of the lengths of all unaligned reads (10 bins), saved next to the FASTQ file.
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) if len( set_of_gene_names ) == 0: counts[ '_empty' ] += 1 elif len( set_of_gene_names ) > 1: counts[ '_ambiguous' ] +=1 else: for f in rs: counts[ f.name ] += 1 num_reads += 1 if num_reads % 100000 == 0: sys.stderr.write( "%d reads processed.\n" % num_reads ) else: # paired-end num_reads = 0 for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): rs = set() if af and ar and not af.aligned and not ar.aligned: counts[ '_notaligned' ] += 1 continue if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: counts[ '_lowaqual' ] += 1 continue if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): for cigop in af.cigar: if cigop.type != "M": continue if reverse: cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) for iv, s in features[cigop.ref_iv].steps(): rs = rs.union( s )
def annotate_table(exp_design_name, mouse_seq, ref_peaks_list, transcript_list, PATH_PEAKS, col_name):
    """
    Read the table of peaks and add for every peak:
        - annotation of the overlapping transcript
        - motif presence score
        - relative position on transcript
        - overlapping refpeaks
        - classification flag

    INPUT: PATH_PEAKS + col_name + '/' + exp_design_name + '_All.txt'
    OUTPUT: PATH_PEAKS + col_name + '/' + exp_design_name + '_' + col_name + '_Summary_All.txt'
            (every peak)
            PATH_PEAKS + '/' + exp_design_name + '/' + exp_design_name + '_' + col_name + '_Summary.txt'
            (only peaks with an annotated transcript)

    :param exp_design_name: name of the experimental design
    :param mouse_seq: mapping from chromosome name to a sequence object that can be sliced
    :param ref_peaks_list: interval-indexed array of reference peaks (supports ``[iv].steps()``)
    :param transcript_list: interval-indexed array of transcript features (supports ``[iv].steps()``)
    :param PATH_PEAKS: base folder of the peak tables
    :param col_name: name of the sub-folder / column the tables belong to
    :return: None
    """
    # NOTE(review): the "U" open mode is no longer accepted by Python >= 3.11.
    with open(PATH_PEAKS + col_name + '/' + exp_design_name + '_All.txt', "rU") as table_all, \
            open(PATH_PEAKS + col_name + '/' + exp_design_name + '_' + col_name + '_Summary_All.txt', "w") as annot_index_file, \
            open(PATH_PEAKS + '/' + exp_design_name + '/' + exp_design_name + '_' + col_name + '_Summary.txt', "w") as annot_final_file:
        csv_table_all = csv.DictReader(table_all, delimiter='\t')
        df_annot = pd.read_csv(m6a_utils.PATH_ANNOT + 'gencodeVM13/gencode.vM13.annotation.entrez.uniprot_clean.txt',
                               index_col=0, sep='\t')
        list_data = exp_design.get_data_list('m6aExpDesign_' + exp_design_name)

        headers = ['WindowId']
        headers.extend(list_data)
        headers.extend(['WindowId', 'Motif', 'Relative_pos', 'Ref_Peaks', 'Nb_ref_peaks', 'Classification'])
        headers.extend(['chromo_window', 'begin_window', 'end_window', 'strand_window', 'type_transcript', 'index_window'])
        headers.append('Transcript_ID')
        headers.extend(df_annot.columns)
        annot_index_file.write('\t'.join(headers) + '\n')
        annot_final_file.write('\t'.join(headers) + '\n')

        for row in csv_table_all:
            window_id = row['Peak_id']
            peak = HTSeq.GenomicInterval(row['chr'], int(row['start']), int(row['end']), ".")

            # Keep the last transcript feature overlapping the peak, provided
            # it is present in the annotation table.
            transcript_id = ''
            for iv, value in transcript_list[peak].steps():
                if type(value) is HTSeq.GenomicFeature:
                    transcript_id = value.attr['transcript_id']
            if transcript_id in df_annot.index:
                transcript_annot = df_annot.loc[transcript_id]
                transcript_iv = HTSeq.GenomicInterval(transcript_annot['chr'],
                                                      int(transcript_annot['begin']),
                                                      int(transcript_annot['end']),
                                                      transcript_annot['strand'])
            else:
                transcript_id = ''

            # Calculate motif presence score
            sequence_score = 0
            sequence = mouse_seq[peak.chrom][peak.start:peak.end].seq
            for motif in m6a_utils.MOTIF_METH:
                if motif in sequence:
                    sequence_score += m6a_utils.MOTIF_METH[motif]

            # relative position on transcript (0.5 when no transcript is known)
            relative_pos = 0.5
            if transcript_id != '':
                diff_start = float(peak.start + peak.length / 2) - transcript_iv.start
                if df_annot['strand'][transcript_id] == '-':
                    # NOTE(review): the '+' branch uses the peak centre
                    # (start + length / 2) while this one uses end + length / 2
                    # -- confirm this is intended.
                    diff_start = float(peak.end + peak.length / 2) - transcript_iv.end
                    diff_start = - diff_start
                relative_pos = diff_start / transcript_iv.length

            # Look at overlapping refpeaks
            ref_peaks = ''
            for iv, value in ref_peaks_list[peak].steps():
                if len(value):
                    ref_peaks = ref_peaks + str(value) + ';;'
            nb_ref_peaks = len(ref_peaks.split(';;')) - 1

            # apply classification: peak near a transcript end with a motif score above 1
            classification = 0
            if (relative_pos < 0.3) or (relative_pos > 0.7):
                if sequence_score > 1:
                    classification = 1

            new_row = [window_id]
            new_row.extend(row[dataset] for dataset in list_data)
            # NOTE(review): the value written under the 'strand_window' header
            # is the peak length -- confirm against consumers of the table.
            new_row.extend([window_id, str(sequence_score), str(relative_pos), ref_peaks, str(nb_ref_peaks),
                            classification, peak.chrom, str(peak.start), str(peak.end), str(peak.length)])
            if transcript_id != '':
                new_row.extend(['protein_coding', '1', transcript_id])
                for header in df_annot.columns:
                    if header == 'UniprotIDs':
                        uniprot = df_annot[header][transcript_id]
                        # A float here is a missing value (NaN); keep only the first id otherwise.
                        if not uniprot == 'None' and not isinstance(uniprot, float):
                            new_row.append(uniprot.split(';')[0])
                        else:
                            new_row.append('none')
                    else:
                        new_row.append(df_annot[header][transcript_id])
                # NOTE(review): nesting reconstructed from a whitespace-mangled
                # source: annotated peaks go to both files, all others only to
                # the '_Summary_All' file -- confirm.
                annot_final_file.write('\t'.join([str(i) for i in new_row]) + '\n')
            annot_index_file.write('\t'.join([str(i) for i in new_row]) + '\n')
"""this is to cut promoter into totalbins bins and count the coverage in each bin""" bins = numpy.linspace(promoter.start, promoter.end, totalbins + 1) for i in range(totalbins): bin_range = HTSeq.GenomicInterval(promoter.chrom, int(bins[i]), int(bins[i + 1]), '.') hm_list.append( int( sum( numpy.fromiter(coverage[bin_range], dtype='i', count=(int(bins[i + 1]) - int(bins[i])))) / (reads / 1e6))) return hm_list coverage = HTSeq.GenomicArray('auto', stranded=False, typecode='i') bedfile = open(sys.argv[1]) reads = 0 if sys.argv[6] == 'fragment': while True: line1 = bedfile.readline().rstrip() if not line1: break reads += 1 line2 = bedfile.readline().rstrip() items1 = line1.split() items2 = line2.split() if items1[0] == items2[0]: chr = items1[0] start = min(int(items1[1]), int(items2[1])) end = max(int(items1[2]), int(items2[2]))
# Sort your SAM file by read ID, so that multiple mappings of a read are on
# adjacent lines, then use this script to keep only the best one.
# Written by Simon Anders
#
# Call this script with the command
#     sort samfile.sam | python chooseBest.py > filtered.sam
import re
import sys


def choose_best_alignment(bundle):
    """Return the alignment of *bundle* with the strictly highest aQual.

    Returns None if the bundle is empty or if the highest alignment quality
    is shared by more than one alignment (the read is then better skipped).
    """
    best = None
    tied = False
    for almt in bundle:
        if best is None or almt.aQual > best.aQual:
            best = almt
            tied = False
        elif almt.aQual == best.aQual:
            # Remember the tie but keep the current best as reference, so a
            # later, worse alignment cannot take over.
            tied = True
    return None if tied else best


def main():
    """Filter the SAM records on stdin, printing the best alignment of each read."""
    # Imported here so that choose_best_alignment() is usable without HTSeq.
    import HTSeq

    insam = HTSeq.SAM_Reader(sys.stdin)
    # Go through all reads, with their alignments bundled up:
    for bundle in HTSeq.bundle_multiple_alignments(insam):
        best_almt = choose_best_alignment(bundle)
        if best_almt is not None:
            # Change the NH field to 1 and print the line; strip any trailing
            # newline first so records are never separated by blank lines.
            line = re.sub(r"NH:i:\d+", "NH:i:1", best_almt.original_sam_line)
            sys.stdout.write(line.rstrip("\n") + "\n")


if __name__ == "__main__":
    main()
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet, minaqual, samout):
    """
    Count the reads (or read pairs) of a SAM/BAM file per feature and print the table to stdout.

    :param sam_filename: path of the alignment file, or "-" to read from stdin
    :param gff_filename: path of the GFF annotation file
    :param samtype: "sam" or "bam"
    :param order: for paired-end data, "name" or "pos" (selects the mate-pairing routine)
    :param stranded: "no", "reverse" or another value for forward-stranded data
    :param overlap_mode: "union", "intersection-strict" or "intersection-nonempty"
    :param feature_type: GFF feature type to count on
    :param id_attribute: GFF attribute used as feature id
    :param quiet: suppress progress messages on stderr
    :param minaqual: reads with a lower alignment quality are skipped
    :param samout: path of an annotated SAM output file, or "" for none
    :raises ValueError: on a feature without id attribute or strand information,
        an unknown `samtype` or an illegal `order`
    """

    def write_to_samout(r, assignment):
        """Write the read (or both mates) to the SAM output with an XF:Z tag."""
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def aligned_intervals(aln, invert):
        """Return the reference intervals of the non-empty 'M' CIGAR operations of *aln*."""
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar if co.type == "M" and co.size > 0)

    reverse = stranded == "reverse"
    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally so the SAM output handle is closed on every exit path.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError("Feature %s does not contain a '%s' attribute" %
                                         (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError("Feature %s at %s does not have strand information but you are "
                                         "running htseq-count in stranded mode. Use '--stranded=no'." %
                                         (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[f.attr[id_attribute]] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Report the offending GFF position; the exception is re-raised.
            sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                             gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first read to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so push the first read back in front.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = aligned_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = aligned_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is strand-inverted relative to the first.
                        iv_seq = itertools.chain(iv_seq, aligned_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                           (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # Reads on chromosomes absent from the annotation count as "__no_feature".
                    write_to_samout(r, "__no_feature")
                    empty += 1
        except:
            # Report where the input failed; the exception itself is re-raised.
            sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
def centipede_footprint(bed_file, bam_file, sites, sample_name, plots_dir,
                        fragmentsize=1, orientation=True, duplicates=True,
                        strand_specific=True):
    """
    Build a per-motif read-coverage matrix and pass it to
    ``centipede_call_footprints``.

    :param bed_file: Path to a BED file of motif intervals. The motif name is
        taken from the file name up to its first ".".
    :type bed_file: str
    :param bam_file: Path to the BAM file handed to ``HTSeq.BAM_Reader``; it is
        queried by interval, so presumably it must be sorted and indexed.
    :type bam_file: str
    :param sites: Unused by this function; kept for interface compatibility.
    :param sample_name: Used as prefix of the output plot file name.
    :type sample_name: str
    :param plots_dir: Directory in which the plot file path is built.
    :type plots_dir: str
    :param fragmentsize: Length every alignment interval is set to before it
        is placed in the window.
    :type fragmentsize: int
    :param orientation: If True, positions of reads over "-" strand motifs are
        mirrored so that all windows are motif-oriented.
    :type orientation: bool
    :param duplicates: If False, alignments flagged as PCR/optical duplicates
        are skipped.
    :type duplicates: bool
    :param strand_specific: If True, "+" and "-" strand signal are stored side
        by side (matrix width is twice the window length).
    :type strand_specific: bool
    :returns: Whatever ``centipede_call_footprints`` returns when its length
        matches the number of intervals; otherwise (or on any error in that
        call) a zero vector with one entry per interval.
    """
    import os

    import HTSeq
    import numpy as np
    import pybedtools

    # read in bedfile
    motifs = pybedtools.BedTool(bed_file)

    # Take the basename *first*: splitting the whole path on "." yields an
    # empty name for paths such as "./motifs/CTCF.bed".
    motif_name = os.path.basename(bed_file).split(".")[0]

    # get motif length (length of first interval)
    motif_length = motifs[0].length

    # Materialise the converted intervals: len() and indexing are needed
    # below, and map() is a lazy iterator on Python 3.
    intervals = list(map(bedtools_interval_to_genomic_interval, motifs))

    # Handle bam file
    bam = HTSeq.BAM_Reader(bam_file)

    # exclude bad chroms
    chroms_exclude = ['chrM', 'chrX', 'chrY']

    # dimensions of the matrix: one row per interval, window length taken
    # from the first interval
    n = len(intervals)
    m = intervals[0].length

    # "strand_specific" stores both strands independently, concatenated
    width = m * 2 if strand_specific else m
    coverage = np.zeros((n, width), dtype=np.float64)

    # Loop through intervals, get coverage, increment matrix count
    for i, feature in enumerate(intervals):
        # progress counter (remaining intervals)
        if i % 1000 == 0:
            print(n - i)

        if feature.chrom in chroms_exclude:
            continue

        for aln in bam[feature]:
            if not aln.aligned:
                continue
            if not duplicates and aln.pcr_or_optical_duplicate:
                continue

            aln.iv.length = fragmentsize  # adjust reads to specified size

            # position relative to window; mirrored for non-"+" motifs when
            # motif orientation is requested
            if orientation and feature.strand not in ("+", "."):
                start_in_window = feature.length - abs(feature.start - aln.iv.end) - 1
                end_in_window = feature.length - abs(feature.start - aln.iv.start) - 1
            else:
                start_in_window = aln.iv.start - feature.start - 1
                end_in_window = aln.iv.end - feature.start - 1

            # the resized fragment may fall outside the window
            if start_in_window < 0 or end_in_window > feature.length:
                continue

            # add +1 to all positions overlapped by read within window;
            # "-" strand reads go to the second half in strand-specific mode
            if strand_specific and aln.iv.strand != "+":
                coverage[i, m + start_in_window:m + end_in_window] += 1
            else:
                coverage[i, start_in_window:end_in_window] += 1

    # Call footprints, get posterior probabilities
    try:
        probs = centipede_call_footprints(
            coverage, np.ones([len(coverage), 1]), motif_length,
            os.path.join(plots_dir, sample_name + "." + motif_name + ".pdf"))
        if len(probs) != len(coverage):
            probs = np.zeros(len(coverage))
    except Exception:
        # Best effort: any failure in the footprint call yields zeros, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        probs = np.zeros(len(coverage))
    return probs
def modifHTSeq(bam_filename, gff_filename, out_file, overlap_mode, feature_type,
               id_attribute, minaqual, exclude_start_distance,
               exclude_stop_distance, min_len, max_len):
    """Count reads per feature, skipping reads close to start/stop codons.

    Features of type ``feature_type`` are read from ``gff_filename`` and keyed
    by ``id_attribute`` (falling back to ``gene_id`` when the attribute is
    absent). Reads from ``bam_filename`` are assigned to features using
    ``overlap_mode`` ("union", "intersection-strict" or
    "intersection-nonempty"). A uniquely assigned read is *not* counted when
    its directional start lies within ``exclude_start_distance`` of the
    feature's start codon, or its directional end within
    ``exclude_stop_distance`` of the stop codon. Reads whose sequence length is
    outside ``[min_len, max_len]`` are ignored entirely.

    The result is written to ``out_file`` as a tab-separated table followed by
    the ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique`` summary rows.

    Calls ``sys.exit`` on an unknown ``overlap_mode``.
    """
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    counts = {}
    start_codon_sites = {}
    stop_codon_sites = {}

    # ---- annotation -------------------------------------------------------
    gff = HTSeq.GFF_Reader(gff_filename, end_included=True)
    i = 0
    for f in gff:
        if f.type == feature_type:
            # some records lack the requested attribute; gene_id is used then
            if id_attribute in f.attr:
                feature_id = f.attr[id_attribute]
            else:
                feature_id = f.attr['gene_id']
            features[f.iv] += feature_id
            counts[feature_id] = 0

        # With multiple codon records per gene keep the most 5' start codon
        # and the most 3' stop codon (direction depends on strand).
        if f.type == "start_codon" and id_attribute in f.attr:
            gname = f.attr[id_attribute]
            if gname not in start_codon_sites:
                start_codon_sites[gname] = f.iv.start_d
            elif f.iv.strand == "+":
                start_codon_sites[gname] = min(f.iv.start_d, start_codon_sites[gname])
            else:
                start_codon_sites[gname] = max(f.iv.start_d, start_codon_sites[gname])

        if f.type == "stop_codon" and id_attribute in f.attr:
            gname = f.attr[id_attribute]
            if gname not in stop_codon_sites:
                stop_codon_sites[gname] = f.iv.end_d
            elif f.iv.strand == "+":
                stop_codon_sites[gname] = max(f.iv.end_d, stop_codon_sites[gname])
            else:
                stop_codon_sites[gname] = min(f.iv.end_d, stop_codon_sites[gname])

        i += 1
        if i % 100000 == 0:
            sys.stderr.write("%d GFF lines processed.\n" % i)

    # ---- alignments -------------------------------------------------------
    read_seq = HTSeq.BAM_Reader(bam_filename)
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0:
            sys.stderr.write("%d SAM alignment record processed.\n" % i)
        i += 1

        if not r.aligned:
            notaligned += 1
            continue
        try:
            if r.optional_field("NH") > 1:
                nonunique += 1
                continue
        except KeyError:
            # No NH tag on this record: cannot tell multiplicity, so the read
            # is kept instead of aborting the whole run.
            pass
        if r.aQual < minaqual:
            lowqual += 1
            continue
        if len(r.read.seq) < min_len or len(r.read.seq) > max_len:
            continue

        iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)

        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
        elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
        else:
            sys.exit("Illegal overlap mode.")

        if fs is None or len(fs) == 0:
            empty += 1
        elif len(fs) > 1:
            ambiguous += 1
        else:
            gene = list(fs)[0]
            try:
                if abs(start_codon_sites[gene] - r.iv.start_d) < exclude_start_distance:
                    continue
                elif abs(r.iv.end_d - stop_codon_sites[gene]) < exclude_stop_distance:
                    continue
                else:
                    counts[gene] += 1
            except KeyError:
                # gene has no recorded start or stop codon: no exclusion
                # window can be applied, so the read is counted
                counts[gene] += 1

    # ---- output -----------------------------------------------------------
    with open(out_file, "w") as fout:
        fout.write("%s\t%s\n" % (id_attribute.strip(), "count"))
        for fn in sorted(counts.keys()):
            fout.write("%s\t%s\n" % (fn, counts[fn]))
        fout.write("__no_feature\t%d\n" % empty)
        fout.write("__ambiguous\t%d\n" % ambiguous)
        fout.write("__too_low_aQual\t%d\n" % lowqual)
        fout.write("__not_aligned\t%d\n" % notaligned)
        fout.write("__alignment_not_unique\t%d\n" % nonunique)
def _scirna_gene_sets(alnmt, annotation):
    """Return (intersection, union) of the gene-id sets found under the
    "M" CIGAR blocks of ``alnmt`` in ``annotation``."""
    intersect = set()
    combine = set()
    first_step = True
    for cigop in alnmt.cigar:
        if cigop.type != "M":
            continue
        for _iv, val in annotation[cigop.ref_iv].steps():
            combine |= val
            if first_step:
                # the first step seeds the running intersection
                intersect |= val
                first_step = False
            else:
                intersect &= val
    return intersect, combine


def _scirna_pick_gene(alnmt, intersect, combine, gene_end):
    """Choose a gene from the intersection set, else from the union set.

    Returns ``(gene_id, kind)`` with kind 0/1 = single/nearest gene from the
    intersection and 2/3 = single/nearest gene from the union, or
    ``(None, None)`` when both sets are empty.
    """
    for tier, candidates in enumerate((intersect, combine)):
        if len(candidates) == 1:
            return list(candidates)[0], 2 * tier
        if len(candidates) > 1:
            nearest = find_nearest_gene(alnmt.iv.end_d, candidates, gene_end)
            return nearest, 2 * tier + 1
    return None, None


def sciRNAseq_count(sample, input_folder, exons, genes, gene_end, gene_annotat, sample_ID):
    """Assign each alignment of ``<input_folder>/<sample>.sam`` to a gene.

    Every aligned read is first matched against ``exons``; if no exon matches
    it is matched against ``genes`` and the gene id gets the suffix
    ``"_intron"``. Within each pass a single gene in the intersection of all
    aligned blocks wins, then the nearest gene (``find_nearest_gene``) of that
    intersection, then the same two rules on the union. Reads that are
    unaligned or on a chromosome absent from ``genes`` are tallied as
    ``_unmapped``; reads without any match as ``_no_feature``.

    Writes ``<sample>.report`` (one "category,cell_ID,count" line for each of
    the 11 categories) and ``<sample>.count`` ("annotation,cell_ID,count" per
    gene, where the annotation is ``gene_annotat.loc[gene, 4]``). ``cell_ID``
    is the 1-based position of ``sample`` in ``sample_ID``.

    Returns 0.
    """
    sam_file = input_folder + "/" + sample + ".sam"
    report_path = input_folder + "/" + sample + ".report"
    count_path = input_folder + "/" + sample + ".count"

    counts = collections.Counter()
    almnt_file = HTSeq.SAM_Reader(sam_file)
    cell_ID = sample_ID.index(sample) + 1

    # match_tally[k] counts category k+1 of the report:
    # 0-3 exon pass, 4-7 gene pass; within a pass: perfect intersect,
    # nearest intersect, perfect combine, nearest combine.
    match_tally = [0] * 8
    passes = ((exons, ""), (genes, "_intron"))

    print("Start read the input file: " + sam_file + "....")
    for alnmt in almnt_file:
        if not alnmt.aligned or alnmt.iv.chrom not in genes.chrom_vectors:
            counts["_unmapped"] += 1
            continue

        for pass_index, (annotation, suffix) in enumerate(passes):
            intersect, combine = _scirna_gene_sets(alnmt, annotation)
            gene_id, kind = _scirna_pick_gene(alnmt, intersect, combine, gene_end)
            if kind is None:
                continue  # nothing in this annotation; try the next pass
            key = gene_id + suffix if suffix else gene_id
            counts[key] += 1
            match_tally[4 * pass_index + kind] += 1
            break
        else:
            counts["_no_feature"] += 1

    labels = (
        "Perfect intersect exon match",
        "Nearest intersect exon match",
        "Perfect combine exon match",
        "Nearest combine exon match",
        "Perfect intersect gene match",
        "Nearest intersect gene match",
        "Perfect combine gene match",
        "Nearest combine gene match",
    )
    print("File name: ", sam_file)
    for number, (label, value) in enumerate(zip(labels, match_tally), start=1):
        print("%d: %s: " % (number, label), value)
    print("9: ambiguous match for exons: ", counts["_ambiguous"])
    print("10: ambiguous match for genes: ", counts["_ambiguous_intron"])
    print("11: No match: ", counts["_no_feature"])
    print("Sam file analysis finished~")

    report_values = match_tally + [
        counts["_ambiguous"],
        counts["_ambiguous_intron"],
        counts["_no_feature"],
    ]
    with open(report_path, 'w') as report_file:
        for number, value in enumerate(report_values, start=1):
            report_file.write(str(number) + "," + str(cell_ID) + "," + str(value) + "\n")

    summary_keys = ("_unmapped", "_ambiguous", "_ambiguous_intron", "_no_feature")
    with open(count_path, 'w') as count_file:
        for gene in counts:
            if gene in summary_keys:
                continue
            line = str(gene_annotat.loc[gene, 4]) + "," + str(
                cell_ID) + "," + str(counts[gene]) + "\n"
            count_file.write(line)
    return 0
def _exon_concat_offset(block_iv, exons):
    """Return the offset of ``block_iv`` inside the concatenation of ``exons``.

    ``exons`` must already be ordered 5'->3'. The lengths of all exons before
    the one containing ``block_iv`` are summed, plus the distance from that
    exon's 5' end to the block. If no exon contains the block the result is
    the total length of all exons.
    """
    offset = 0
    for exon in exons:
        if block_iv.start >= exon.iv.start and block_iv.end <= exon.iv.end:
            if exon.iv.strand == "+":
                offset += block_iv.start - exon.iv.start
            else:
                offset += exon.iv.end - block_iv.end
            break
        offset += exon.iv.end - exon.iv.start
    return offset


def count_reads_in_features(sam_filename, gff_filename, stranded, overlap_mode,
                            feature_type, id_attribute, quiet, minaqual, samout,
                            allow_ambiguous, allow_nonunique):
    """Map aligned read blocks onto feature-relative coordinates.

    Features of type ``feature_type`` are loaded from ``gff_filename`` and
    grouped by ``id_attribute``; each group is sorted 5'->3' according to the
    strand of its first member. Reads come from ``sam_filename`` ("-" means
    stdin); single-end or paired-end mode is detected from the first record.

    For every read (pair) that passes the filters and is assigned to a
    feature set, one line ``feature<TAB>start<TAB>end<TAB>read_name`` is
    written to stdout for every (aligned block, overlapping feature)
    combination, with 0-based, inclusive offsets into the concatenated exons
    of that feature.

    :param stranded: "no", "yes" or "reverse".
    :param overlap_mode: "union", "intersection-strict" or
        "intersection-nonempty"; anything else calls ``sys.exit``.
    :param quiet: suppress progress messages and HTSeq warnings.
    :param minaqual: reads with alignment quality below this are skipped.
    :param samout: path of an annotated SAM output file, or "" for none.
    :param allow_ambiguous: "no" discards reads overlapping several features.
    :param allow_nonunique: "no" discards reads with an NH tag greater than 1.
    """

    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def aligned_blocks(alignment, invert):
        # lazily yields the reference interval of every "M" CIGAR operation
        if invert:
            return (invert_strand(co.ref_iv) for co in alignment.cigar if co.type == "M")
        return (co.ref_iv for co in alignment.cigar if co.type == "M")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        features_dict = defaultdict(list)
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        sys.exit("Feature %s does not contain a '%s' attribute" %
                                 (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                    features_dict[feature_id].append(f)
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # report the position, then let the original error (including
            # SystemExit from the checks above) propagate
            sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)
            sys.stderr.write("Sorting exons from GFF file.\n")

        # order each feature's exons 5'->3'
        for exon_list in features_dict.values():
            exon_list.sort(key=lambda feat: feat.iv.start,
                           reverse=(exon_list[0].iv.strand == "-"))

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        try:
            # read_seq_file always stays the reader object so that the error
            # handler below can ask it for the line number; on stdin the
            # iterable actually consumed is an itertools.chain, which has no
            # get_line_number_string().
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading first line of sam file.\n")
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            reverse = stranded == "reverse"
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if allow_nonunique == "no" and r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass  # no NH tag: multiplicity unknown, keep the read
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = aligned_blocks(r, reverse)
                else:
                    first_ok = r[0] is not None and r[0].aligned
                    second_ok = r[1] is not None and r[1].aligned
                    if not first_ok and not second_ok:
                        write_to_samout(r, "not_aligned")
                        notaligned += 1
                        continue
                    # the second mate is on the opposite strand of the
                    # fragment, hence the inverted strand logic
                    iv_seq = aligned_blocks(r[0], reverse) if first_ok else tuple()
                    if second_ok:
                        iv_seq = itertools.chain(iv_seq, aligned_blocks(r[1], not reverse))
                    try:
                        if allow_nonunique == "no" and (
                                (r[0] is not None and r[0].optional_field("NH") > 1) or
                                (r[1] is not None and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    # materialise: the blocks are walked twice below
                    iv_seq = list(iv_seq)
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1 and allow_ambiguous == "no":
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        if pe_mode:
                            rname = r[0].read.name if r[0] is not None else r[1].read.name
                        else:
                            rname = r.read.name
                        for iv in iv_seq:
                            for iv2, fs2 in features[iv].steps():
                                for fsi in fs2:
                                    offset = _exon_concat_offset(iv2, features_dict[fsi])
                                    # output is 0-based, inclusive on both ends
                                    sys.stdout.write("%s\t%d\t%d\t%s\n" % (
                                        fsi, offset,
                                        offset + (iv2.end - iv2.start - 1), rname))
                except UnknownChrom:
                    # chromosome absent from the GFF: counted as no feature
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" %
                                     (i, "lines " if not pe_mode else "line pairs"))
        except:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))
    finally:
        # close the annotated SAM output on success and on error alike
        if samoutfile is not None:
            samoutfile.close()
def _primary_alignments_by_read(sam_path):
    """Return {read name: primary alignment} for ``sam_path``, dropping every
    read for which a supplementary (chimeric) record is seen afterwards."""
    alignments = {}
    for alnm in HTSeq.SAM_Reader(sam_path):
        qname = alnm.read.name
        if alnm.aligned and not alnm.not_primary_alignment and not alnm.supplementary:
            alignments[qname] = alnm
        if alnm.supplementary and qname in alignments:
            del alignments[qname]  # delete chimeric reads
    return alignments


def intron_retention(outfile, ref_t):
    """Estimate intron-retention transition probabilities and write them to
    ``<outfile>_IR_markov_model``.

    Inputs are located from the ``outfile`` prefix:
    ``<outfile>_addedintron.gff3``, ``<outfile>_genome_alnm.sam`` and
    ``<outfile>_transcriptome_alnm.sam``. For every read with a primary
    alignment in both SAM files, the genome alignment is scanned for runs of
    aligned bases lying in introns of the transcript the read maps to; a run
    whose length equals an intron's length marks that intron as retained.
    The per-read sequence of intron states (retained or not) is tallied as
    first-intron counts and state-to-state transition counts, which are
    written out as rounded frequencies.

    ``ref_t`` is not used. The module-level names ``stranded``,
    ``invert_strand`` and ``UnknownChrom`` are read by this function.

    Raises ``ZeroDivisionError`` when any of the three rows has no
    observations; in that case no output file is created.
    """
    gff_file = outfile + "_addedintron.gff3"
    talnm_file = glob.glob(outfile + "_transcriptome_alnm.sam")[0]
    galnm_file = glob.glob(outfile + "_genome_alnm.sam")[0]

    # read intron information from GFF file
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Reading intron coordinates from GFF file\n")
    gff_features = HTSeq.GFF_Reader(gff_file, end_included=True)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    dict_intron_info = {}
    for feature in gff_features:
        if "Parent" in feature.attr:
            info = feature.attr["Parent"].split(':')
            if info[0] == "transcript":
                feature_id = info[1]
                if feature_id not in dict_intron_info:
                    dict_intron_info[feature_id] = []
        # NOTE(review): uses the feature_id set from a "transcript:" Parent of
        # this or an earlier record -- assumes every intron record carries
        # such a Parent; confirm against the GFF producer.
        if feature.type == "intron":
            features[feature.iv] += feature_id
            dict_intron_info[feature_id].append(
                (feature.iv.start, feature.iv.end, feature.iv.length))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read primary genome alignment for each read\n")
    dict_g_alnm = _primary_alignments_by_read(galnm_file)

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read primary transcriptome alignment for each read\n")
    dict_t_alnm = _primary_alignments_by_read(talnm_file)

    # count the length of Intron retention events
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Calculating probabilites for each intron retention event\n")
    dict_first_intron_state = {False: 0, True: 0}
    dict_states = {(False, False): 0, (False, True): 0, (True, False): 0, (True, True): 0}
    covering_ops = ('M', '=', 'X', 'D')
    for qname, galnm in dict_g_alnm.items():
        if qname not in dict_t_alnm:
            continue
        primary_trx = dict_t_alnm[qname].iv.chrom.split(".")[0]
        if stranded != "reverse":
            iv_seq = (co.ref_iv for co in galnm.cigar
                      if co.type in covering_ops and co.size > 0)
        else:
            iv_seq = (invert_strand(co.ref_iv) for co in galnm.cigar
                      if co.type in covering_ops and co.size > 0)

        list_IR_positions = []
        pos = []
        ir_info = False
        try:
            length_IR = 0
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if primary_trx in fs2:
                        # still inside an intron of this transcript
                        length_IR += iv2.length
                        pos.append(iv2.start)
                        pos.append(iv2.end)
                    elif length_IR != 0:
                        # run ended: it is a retention event only if it spans
                        # a whole intron (matched by length)
                        for intron in dict_intron_info[primary_trx]:
                            if length_IR == intron[2]:
                                list_IR_positions.append(min(pos))
                                list_IR_positions.append(max(pos))
                                ir_info = True
                        length_IR = 0
                        pos = []
        except UnknownChrom:
            ir_info = False

        if not ir_info:
            # no retention: first intron and every following transition are
            # "not retained"
            introns = dict_intron_info.get(primary_trx, [])
            if introns:
                dict_first_intron_state[False] += 1
                dict_states[(False, False)] += len(introns) - 1
            continue

        previous_state = None
        for index, intron in enumerate(dict_intron_info[primary_trx]):
            current_state = any(intron[0] <= IR_pos <= intron[1]
                                for IR_pos in list_IR_positions)
            if index == 0:
                dict_first_intron_state[current_state] += 1
            else:
                dict_states[(previous_state, current_state)] += 1
            previous_state = current_state

    sum_first_introns = dict_first_intron_state[True] + dict_first_intron_state[False]
    sum_for_noIR = dict_states[(False, False)] + dict_states[(False, True)]
    sum_for_IR = dict_states[(True, False)] + dict_states[(True, True)]

    # Format every row before touching the file so that a failing division
    # cannot leave a truncated model behind.
    rows = (
        ("start", dict_first_intron_state[False], dict_first_intron_state[True], sum_first_introns),
        ("no_IR", dict_states[(False, False)], dict_states[(False, True)], sum_for_noIR),
        ("IR", dict_states[(True, False)], dict_states[(True, True)], sum_for_IR),
    )
    lines = ["succedent\tno_IR\tIR\n"]
    for name, count_no_ir, count_ir, total in rows:
        lines.append(name + "\t" + str(round(count_no_ir / float(total), 4)) + "\t"
                     + str(round(count_ir / float(total), 4)) + "\n")

    with open(outfile + "_IR_markov_model", 'w') as fout:
        fout.writelines(lines)
elif file[-13:-11] == 'A5': A5list = [x.split('\t')[1] for x in open(file).readlines()[1:]] elif file[-13:-11] == 'MX': MXlist = [x.split('\t')[1] for x in open(file).readlines()[1:]] elif file[-13:-11] == 'AL': ALlist = [x.split('\t')[1] for x in open(file).readlines()[1:]] elif file[-13:-11] == 'AF': AFlist = [x.split('\t')[1] for x in open(file).readlines()[1:]] GENE = Set(["gene", "pseudogene", "transposable_element_gene"]) EXON = Set(["exon", "pseudogenic_exon"]) num_lines = sum(1 for line in open(name_gff)) file_gff = open(name_gff, 'r') gff_file = HTSeq.GFF_Reader(file_gff) count = 0 transcript = set() lines = 0 gene_dict = {} for feature in gff_file: lines += 1 if feature.type in GENE or lines == num_lines: if len(transcript) == 2: count += 1 gene_dict[gene_cand.attr["ID"]] = len(transcript) gene_cand = feature transcript.clear() if feature.type in EXON:
def Get_IPAevent(input_tuple):
    """Search the introns of one annotated label for intronic polyadenylation.

    ``input_tuple`` is ``(label, all_bamfiles)``. Coverage for the label is
    collected from every BAM file with ``Get_label_information``. Each intron
    of ``annot[label]`` longer than 250 with more than 30 counts in at least
    one sample is then examined: the coverage profile is (optionally) trimmed
    at a skipped-exon end, the change point is located with
    ``Get_min_mseratio_list`` / ``Get_min_mseratio_list_refine``, and when the
    coverage thresholds are met and no splice start lies near the candidate
    site, the result of ``Get_IPAsite_IPUI`` is collected.

    Returns the list of collected results.
    """
    label, all_bamfiles = input_tuple
    min_count = 30
    IPA_result = []

    curr_label_all_gas = []
    curr_label_all_ga = []
    curr_label_all_gene_count = []
    for bamfile in all_bamfiles:
        bam_reader = HTSeq.BAM_Reader(bamfile)
        gas, ga, gene_count = Get_label_information(label, annot, bam_reader)
        curr_label_all_gas.append(gas)
        curr_label_all_ga.append(ga)
        curr_label_all_gene_count.append(gene_count)

    for feature, rank, chrom, start, end, strand, length, exon_rank_left, exon_rank_right in annot[label]:
        if feature != "intron" or int(length) <= 250:
            continue

        intron_start = start
        intron_end = end
        end_value = 15

        # samples with enough reads in this intron
        index_list = [sample for sample, sample_count in enumerate(curr_label_all_gene_count)
                      if sample_count[('intron', rank)] > min_count]
        if not index_list:
            continue

        iv = HTSeq.GenomicInterval(chrom, intron_start, intron_end, strand)
        IPAtype = "Composite"

        # per-sample coverage, oriented 5'->3'
        curr_label_all_cov = []
        for sample in index_list:
            profile = list(curr_label_all_ga[sample][iv])
            if strand == "-":
                profile = profile[::-1]
            curr_label_all_cov.append(profile)

        intron_region = chrom + ":" + str(intron_start) + "-" + str(intron_end)
        skipend_dict_list = [Get_Skipend_dict(intron_region, bam_path, strand)
                             for bam_path in all_bamfiles]

        # use the first well-supported skipped-exon end inside the intron
        skip_found = False
        for skipend_dict in skipend_dict_list:
            for key, value in skipend_dict.items():
                if int(start) + 50 < int(key) < int(end) - 50 and int(value) > 10:
                    if strand == "+":
                        skip_position = int(key) - int(start)
                    else:
                        skip_position = int(end) - int(key)
                    curr_label_all_cov = [cvg_region[skip_position:]
                                          for cvg_region in curr_label_all_cov]
                    IPAtype = "Skipped"
                    start = int(key)
                    end = int(key)
                    end_value = int(value)
                    skip_found = True
                    break
            if skip_found:
                break

        min_mseratio_list, min_mse_point_list = Get_min_mseratio_list(curr_label_all_cov)
        min_mseratio = min(min_mseratio_list)
        min_mseratio_index = min_mseratio_list.index(min_mseratio)
        if min_mseratio >= 0.5:
            continue

        min_mseratio_list_refine, min_mse_point_list_refine = Get_min_mseratio_list_refine(
            curr_label_all_cov, min_mse_point_list[min_mseratio_index])
        min_mseratio_refine = min(min_mseratio_list_refine)
        min_mseratio_index_refine = min_mseratio_list_refine.index(min_mseratio_refine)
        IPA_point = int(min_mse_point_list_refine[min_mseratio_index_refine])

        up_down_diff = max(np.mean(coverage[:IPA_point]) - np.mean(coverage[IPA_point:])
                           for coverage in curr_label_all_cov)
        # fraction of positions with depth > 5 on either side of the point
        upstream_cov = max(
            sum(1 for depth in coverage[:IPA_point] if depth > 5) / IPA_point
            for coverage in curr_label_all_cov)
        downstream_cov = np.mean([
            sum(1 for depth in coverage[IPA_point:] if depth > 5) / (len(coverage) - IPA_point)
            for coverage in curr_label_all_cov])

        passes_thresholds = (min_mseratio_refine < 0.5 and up_down_diff > 1
                             and upstream_cov > 0.8 and downstream_cov < 0.5)
        if not passes_thresholds:
            continue

        if strand == "+":
            IPA_location = int(start) + IPA_point
            IPA_inf = chrom + ":" + str(start) + "-" + str(IPA_location)
        else:
            IPA_location = int(end) - IPA_point
            IPA_inf = chrom + ":" + str(IPA_location) + "-" + str(end)

        # a splice start within 20 of the candidate site rejects it
        skipstart_dict = Get_Skipstart_dict(intron_region, all_bamfiles, strand)
        near_splice_start = any(
            IPA_location - 20 < int(key) < IPA_location + 20 and int(value) > end_value * 0.8
            for key, value in skipstart_dict.items())
        if near_splice_start:
            continue

        intronPA_inf = label + ";" + feature + "_" + str(rank) + ";" + IPA_inf + ";" + IPAtype
        IPA_information = Get_IPAsite_IPUI((intronPA_inf, curr_label_all_ga, gas))
        IPA_result.append(IPA_information)

    return IPA_result
def test_bam_inconsistent_mate():
    """Iterate over every record of the inconsistent-mate example BAM file;
    the test passes when the iteration finishes without raising."""
    print('Test inconsistent BAM file')
    reader = HTSeq.BAM_Reader("example_data/inconsistent_mate.bam")
    for _record in reader:
        # records are only consumed, not inspected
        pass
    print("Test passed")
try:
    import HTSeq
except ImportError:
    # HTSeq is required; explain where to get it and stop
    sys.stderr.write(
        "Could not import HTSeq. Please install the HTSeq Python framework\n"
        "available from http://www-huber.embl.de/users/anders/HTSeq\n")
    sys.exit(1)

# command line: <annotation GTF> <output file>
gtf_file, out_file = sys.argv[1], sys.argv[2]

# Step 1: Store all exons with their gene and transcript ID
# in a GenomicArrayOfSets

exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
for f in HTSeq.GFF_Reader(gtf_file):
    if f.type == "exon":
        # ":" in gene IDs is replaced by "_"
        f.attr['gene_id'] = f.attr['gene_id'].replace(":", "_")
        exons[f.iv] += (f.attr['gene_id'], f.attr['transcript_id'])

# Step 2: Form sets of overlapping genes

# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
# contains IDs of genes that overlap, i.e., share bases (on the same strand).
# The keys of 'gene_sets' are the IDs of all genes, and each key refers to
# the set that contains the gene.
# Each gene set forms an 'aggregate gene'.

gene_sets = collections.defaultdict(set)
def split_by_barcode(initial_filename, barcode_to_filename=None,
                     unmatched_filename='no_recognized_barcode_lane3.fastq'):
    """Split a FASTQ file into one file per barcode.

    The barcode of a read is ``read.seq[3:7]``. Reads whose barcode is a key
    of ``barcode_to_filename`` are written to the corresponding file; all
    other reads go to ``unmatched_filename``.

    :param initial_filename: FASTQ file to split.
    :param barcode_to_filename: mapping barcode -> output FASTQ path. Defaults
        to the three lane-3 barcodes (CAAT, AATA, TTAA).
    :param unmatched_filename: output path for reads without a known barcode.

    All output files are closed when the function returns or raises.
    """
    if barcode_to_filename is None:
        # NOTE: earlier revisions kept the lane 1, lane 2 and fbf1/fbf2
        # experiment mappings here as disabled code; pass them through
        # barcode_to_filename instead. One irregularity was recorded there:
        # the GEO dataset indicates GCCA/TCCG and GGTT barcodes (so AACC
        # would be expected), while the sp/oo samples use GGTT.
        barcode_to_filename = {
            'CAAT': 'n2_sp_lane3_rt3.fastq',
            'AATA': 'n2_sp_lane3_rt15.fastq',
            'TTAA': 'n2_sp_lane3_rt16.fastq',
        }

    outfiles = {}
    missingf = None
    try:
        for bc, filename in barcode_to_filename.items():
            outfiles[bc] = open(filename, 'w')
        missingf = open(unmatched_filename, 'w')

        fastq_file = HTSeq.FastqReader(initial_filename)
        total_reads = 0
        for read in fastq_file:
            total_reads += 1
            if not total_reads % 100000:
                print("Read: %i " % (total_reads))
            # direct lookup instead of scanning every barcode
            target = outfiles.get(read.seq[3:7], missingf)
            read.write_to_fastq_file(target)
    finally:
        for handle in outfiles.values():
            handle.close()
        if missingf is not None:
            missingf.close()
def get_profile(coverage, chrom, start, end, strand):
    """Return the values of ``coverage`` over ``chrom:start-end`` on ``strand``
    as a NumPy integer array of ``end - start`` elements."""
    return np.fromiter(
        coverage[HTSeq.GenomicInterval(chrom, start, end, strand)],
        dtype='i',
        count=end - start,
    )
def extract_np_arrays(cov_array, seqid, length):
    """Return the ``(plus, minus)`` strand arrays for one sequence.

    For each strand, ``cov_array`` is indexed with the interval
    ``[0, length)`` on ``seqid`` and the ``.array`` attribute of the result
    is taken. The plus-strand array comes first in the returned tuple.
    """
    per_strand = [
        cov_array[HTSeq.GenomicInterval(seqid, 0, length, strand)].array
        for strand in ("+", "-")
    ]
    return tuple(per_strand)
def main(argv):
    """Write per-variant-type bedgraph and GVF files from a variant call table.

    Args:
        argv: command-line arguments (without the program name), parsed with
            ``optparse``. The options -r/--chromsize, -v/--variantfile and
            -o/--outdir are all required; if any is missing the help text is
            printed and the process exits with status 1.

    The chromosome size file is tab-separated with the size in the first
    column and the chromosome name in the second. For every variant type
    returned by ``VariantCallTabReader`` two files are written into the
    output directory: ``<type>_dbVar.bedgraph`` and ``<type>_dbVar.gvf``.
    """
    parser = OptionParser()
    parser.add_option("-r", "--chromsize", action="store", type="string",
                      dest="chromsize", help="GRCh38 chromosome size file",
                      metavar="<str>")
    parser.add_option("-v", "--variantfile", action="store", type="string",
                      dest="variantfile", metavar="<file>",
                      help="the variant calls files in a specific format")
    parser.add_option("-o", "--outdir", action="store", type="string",
                      dest="outdir", metavar="<file>",
                      help="the directory to store the output files")
    (opt, args) = parser.parse_args(argv)

    # Check the parsed options rather than counting argv tokens: a token
    # count rejects the valid "--opt=value" form and accepts calls that are
    # long enough but still omit a required option.
    if None in (opt.chromsize, opt.variantfile, opt.outdir):
        parser.print_help()
        sys.exit(1)

    # Read chrom size information from the chrom_size_file.
    chrom_size = {}
    with open(opt.chromsize, 'r') as chrom_size_file:
        for line in chrom_size_file:
            pline = line.strip()
            if not pline:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            sline = pline.split('\t')
            chrom_size[sline[1]] = int(sline[0])

    # Parse the variant file once; element 0 maps variant type -> intervals
    # and element 1 maps variant type -> variant ids.
    variant_tables = VariantCallTabReader(opt.variantfile, chrom_size)
    var_types_ga = variant_tables[0]
    var_types_id = variant_tables[1]

    for var_type in var_types_ga.keys():
        # Create a 'Genomic Array' using HTSeq package
        ga = HTSeq.GenomicArray(chrom_size, stranded=False, typecode="i")
        nssd = HTSeq.GenomicArrayOfSets(chrom_size, stranded=False)
        variant_interval = var_types_ga[var_type]
        variant_id = var_types_id[var_type]

        # Get the count of variant calls in each region
        variant_num = len(variant_interval)
        print("For " + var_type + ", there are " + str(variant_num) +
              " variant calls from the clustersed studies...")

        for i, iv in enumerate(variant_interval):
            try:
                ga[iv] += 1
                nssd[iv] += variant_id[i]
            except Exception:
                # Best effort: intervals that cannot be stored are skipped.
                # The original handler held the no-op `iv.length == 0`,
                # which suggests zero-length intervals were the intended case.
                pass

        bedgraph = opt.outdir + '/' + var_type + '_dbVar.bedgraph'
        ga.write_bedgraph_file(bedgraph, strand=".", track_options="")
        gvf = opt.outdir + '/' + var_type + '_dbVar.gvf'
        write_to_gvf(ga, nssd, var_type, gvf)
def main():
    """Estimate feature (or feature-category) abundances from alignments.

    Parses the command line, loads features of the requested type from a
    GFF3 file into an ``HTSeq.GenomicArrayOfSets``, walks the SAM/BAM
    alignments (single- or paired-end, detected from the first record),
    assigns each read/pair to the feature(s) it overlaps according to the
    chosen overlap mode, and writes one tab-separated abundance table per
    requested unit to ``<outpref>.<unit>.csv[.gz|.bz2|.xz]``. Run statistics
    and warnings go to standard error.

    Exits with status 1 when no valid unit was requested, an output file
    cannot be opened, the GFF3 file cannot be processed or has no feature of
    the requested type, or the alignment file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'alignment_file', metavar='in.aln',
        help="input alignment file in SAM or BAM format. Use '-' to indicate "
             "that input should be taken from standard input (stdin)")
    parser.add_argument(
        'feature_file', metavar='in.gff3',
        help="input feature annotation file in GFF3 format. Use '-' to indicate "
             "that input should be taken from standard input (stdin)")
    parser.add_argument(
        '-m', '--mapping', metavar='in.json', dest='map_files',
        action=ParseSeparator, sep=',',
        help="input one or more relational databases, in JSON format, "
             "containing features mapped to feature categories, such as genes "
             "to gene families or exons to genes. Abundance estimates for the "
             "given feature category will be reported in place of features. "
             "Multiple input files can be provided by separating them with a "
             "comma and no spaces")
    parser.add_argument(
        '-c', '--category', metavar='FIELD', dest='category',
        help="field in the relational database representing how features "
             "are categorized. WARNING: if the value type of the selected field "
             "is a list, then the category abundance totals can be greater than "
             "the feature abundance totals")

    gff_group = parser.add_argument_group('GFF3 arguments')
    gff_group.add_argument(
        '-t', '--type', metavar='TYPE', dest='ftype', default='CDS',
        help="feature type (3rd column in GFF file) to estimate abundance for "
             "[default: CDS]. All features of other type will be ignored")
    gff_group.add_argument(
        '-a', '--attr', metavar='ATTRIBUTE', default="Name",
        help="GFF attribute to use as the ID for the calculated abundances "
             "[default: 'Name']. This value will also be used as the search "
             "ID in the relational database, if provided")

    aln_group = parser.add_argument_group('SAM/BAM arguments')
    aln_group.add_argument(
        '-f', '--format', metavar='FORMAT', dest='aformat',
        choices=['bam', 'sam'], default='bam',
        help="input alignment file format [default: bam]. Options are 'sam' "
             "or 'bam'")
    aln_group.add_argument(
        '-q', '--qual', metavar='THRESH', dest='minqual', type=int, default=2,
        help="skip all reads with alignment quality lower than the threshold "
             "[default: 2]")
    aln_group.add_argument(
        '-s', '--sorting', metavar='ORDER', dest='order',
        choices=["position", "name"], default='position',
        help="alignment file sorting scheme. Options are 'position' and "
             "'name' [default: position]. Alignments must be pre-sorted "
             "either by position/coordinates or by read name. This option "
             "will be ignored for single-end reads")
    aln_group.add_argument(
        '-b', '--buffer', metavar='BYTES', dest='buffer_size', type=int,
        default=3145728,
        help="buffer size for paired reads in the alignment file if sorted by "
             "position [default: 3145728 (3GB)]. This value should be "
             "increased if memory issues are encountered")

    count_group = parser.add_argument_group('quantification arguments')
    count_group.add_argument(
        '-e', '--mode', metavar='MODE',
        choices=["union", "intersection-strict", "intersection-nonempty"],
        default="union",
        help="mode for handling different alignment scenarios. Options are "
             "'union', 'intersection-strict', and 'intersection-nonempty' "
             "[default: union]. The modes will count alignments differently "
             "depending on whether a read/pair overlaps more than one feature "
             "or only partially aligns to a single feature. The most "
             "inclusive mode is 'union' when given with the nonunique flag, "
             "and the least inclusive is 'intersection-strict'")
    # The default must be a list: argparse does not run the custom action on
    # default values, so a bare string would later be iterated character by
    # character and every "unit" would be rejected as unknown.
    count_group.add_argument(
        '-u', '--units', metavar='UNITS', dest='norm',
        action=ParseSeparator, sep=',', default=['counts'],
        help="comma-separated list of units to output abundance estimates in "
             "[default: counts]. Options are 'counts', 'fpk' (fragments per "
             "kilobase of feature), 'fpkm' (fragements per kilobase of "
             "feature per million mapped fragments), 'tpm' "
             "(transcripts/fragments per million), 'prop', and 'custom'. If "
             "other than 'counts', features will be normalized by recruitment "
             "length, which will be calculated from the start and end fields "
             "of the GFF3 file. This is the sole normalization method used "
             "when transforming counts to FPK, and is useful to correct for "
             "differences in feature lengths within a sample. In addition to "
             "feature length, FPKM and TPM attempt to account for differences "
             "between samples in sequencing effort. An advantage of TMP over "
             "FPKM is that TPM is a proportional measurement, making it "
             "easier to identify the extent that the relative 'importance' of "
             "a given feature changes between samples. A custom transformation "
             "can also be performed when used with the -k/--coeff argument, in "
             "which case the length normalized proportion of a feature will be "
             "multiplied by the provided scaling factor.")
    count_group.add_argument(
        '-k', '--coeff', metavar='MUL', dest='sfactor', type=float, default=1,
        help="multiplier to use when 'custom' is given to -u/--units "
             "[default: 1]")
    count_group.add_argument(
        '--cdna', dest='transcripts', action='store_true',
        help="sequences represent cDNA [default: False]. Whether sequences are "
             "from gDNA or cDNA will determine how the length of a feature is "
             "calculated for normalization. If cDNA, effective length will "
             "serve as feature length")
    count_group.add_argument(
        '--nonunique', action='store_true',
        help="allow reads to align with more than one feature")

    output_group = parser.add_argument_group('output control arguments')
    output_group.add_argument(
        '-o', '--outpref', type=str, metavar='PREFIX', dest='outpref',
        default='sample',
        help="prefix for the output tabular files containing feature abundance "
             "estimates [default: sample]. File names will be appended with "
             "the units, file format, and compression algorithm, if relevant "
             "[e.g. sample.counts.csv.gz]")
    output_group.add_argument(
        '--filter', dest='cat_only', action='store_true',
        help="only output abundances for features with an associated feature "
             "category [default: output all]")
    compression_group = output_group.add_mutually_exclusive_group()
    compression_group.add_argument(
        '--gzip', dest='gzipped', action='store_true',
        help="compress output using the gzip algorithm")
    compression_group.add_argument(
        '--bzip2', dest='bzipped', action='store_true',
        help="compress output using the bzip2 algorithm")
    compression_group.add_argument(
        '--lzma', dest='lzma', action='store_true',
        help="compress output using the lzma algorithm")
    parser.add_argument(
        '--version', action='version', version='%(prog)s ' + __version__)
    args = parser.parse_args()

    # Argument sanity checks
    if (args.category and not args.map_files) or \
            (args.map_files and not args.category):
        parser.error("error: -m/--mapping and -c/--category must be supplied "
                     "together")
    if args.alignment_file == '-' and args.feature_file == '-':
        parser.error("error: standard input (stdin) can only be redirected to "
                     "a single positional argument")

    # Output run information
    all_args = sys.argv[1:]
    print("{} {!s}".format('count_features', __version__), file=sys.stderr)
    print(textwrap.fill("Command line parameters: {}"
                        .format(' '.join(all_args)), 79), file=sys.stderr)
    print("", file=sys.stderr)

    # Track program run-time
    start_time = time()

    # Assign variables based on user inputs
    if args.gzipped:
        compression = '.gz'
    elif args.bzipped:
        compression = '.bz2'
    elif args.lzma:
        compression = '.xz'
    else:
        compression = ''

    allowed_units = ["counts", "tpm", "custom", "prop", "fpk", "fpkm"]
    out_handles = {}
    for unit in args.norm:
        if unit not in allowed_units:
            print("warning: unknown metric of abundance '{}' provided to "
                  "-u/--unit. Please see the help message for a list of the "
                  "allowed units".format(unit), file=sys.stderr)
            continue
        outfile = "{}.{}.csv{}".format(args.outpref, unit, compression)
        try:
            out_h = open_io(outfile, mode='wb')
        except AttributeError:
            print("error: unable to write to '{}'".format(outfile),
                  file=sys.stderr)
            sys.exit(1)
        out_handles[unit] = out_h

    if not out_handles:
        print(
            "error: no output files can be created. Please re-run with one "
            "or more of the accepted units of abundance", file=sys.stderr)
        sys.exit(1)

    overlap_mode = args.mode
    minaqual = args.minqual
    feature_type = args.ftype
    id_field = args.attr
    category_field = args.category
    are_transcripts = args.transcripts
    category_only = args.cat_only
    multi_aln = args.nonunique
    # CIGAR operations that consume reference bases as (mis)matches
    match_types = ('M', '=', 'X')

    if args.aformat == "sam":
        align_reader = HTSeq.SAM_Reader
    else:  # must be BAM then
        align_reader = HTSeq.BAM_Reader

    if args.map_files:
        mapping = load_dbs(args.map_files, fields=[category_field], csv=False)
    else:
        mapping = None

    # Store features in genomic arrays
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}

    # Iterate over GFF3 file, storing features to estimate coverage for
    no_attr = 0
    f_totals = 0
    ftype_totals = 0
    try:
        if args.feature_file == '-':
            gff = HTSeq.GFF_Reader(sys.stdin)
        else:
            gff = HTSeq.GFF_Reader(args.feature_file)

        for f in gff:
            f_totals += 1

            try:
                feature_id = f.attr[id_field]
            except KeyError:
                # Placeholder ID; entries with this prefix are filtered out
                # again when the abundance tables are written.
                no_attr += 1
                feature_id = "unkwn_{:08}".format(no_attr)

            # Skip features of wrong type
            if feature_type:
                if f.type == feature_type:
                    ftype_totals += 1
                else:
                    continue

            # Store feature length for normalization
            feature_length = abs(f.iv.end - f.iv.start + 1)

            features[f.iv] += feature_id  # for mapping alignments
            counts[feature_id] = {'count': 0, 'length': feature_length}
    except Exception:
        print("error: problem occured when processing GFF3 file at line {}".
              format(gff.get_line_number_string()), file=sys.stderr)
        sys.exit(1)

    # Verify GFF3 file contains features of the specified type
    if ftype_totals == 0:
        print("error: no features of type '{}' found.\n".format(feature_type),
              file=sys.stderr)
        sys.exit(1)

    if no_attr > 0:
        print("warning: found {!s} features without a '{}' attribute.\n"
              .format(no_attr, id_field), file=sys.stderr)

    # Check alignment file formatting
    try:
        if args.alignment_file == '-':
            # stdin cannot be re-read, so put the peeked record back in
            # front of the remaining stream.
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq_file = align_reader(args.alignment_file)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
    except Exception:
        print(
            "error: unable to read the alignment file. Please verify that "
            "the formatting is correct.", file=sys.stderr)
        sys.exit(1)

    pe_mode = first_read.paired_end  # reads are paired-end or single-end
    if pe_mode:
        if args.order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        else:  # order is by position
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                read_seq, max_buffer_size=args.buffer_size)

    # Iterate over alignment file
    empty = 0  # reads aligned somewhere in the assembly, but not to a feature
    duplicate = 0  # reads are duplicates of other reads
    ambiguous = 0  # reads overlapping more than one feature
    notaligned = 0  # unaligned reads
    lowqual = 0  # reads not passing minimum threshold for alignment quality
    nonunique = 0  # reads having multiple alignments with similar score
    r_totals = 0  # total reads
    aln_totals = 0  # correctly mapped to a feature
    fld = []  # fragment length / insert-size distribution
    for r in read_seq:
        r_totals += 1

        if not pe_mode:  # single-end read mapping
            # Check if read aligned
            if not r.aligned:
                notaligned += 1
                continue

            # Check if the read aligned uniquely
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    # Report the read's name; the original passed the
                    # chromosome name into this "read '{}'" message.
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                pass

            # Check if the alignment passed the quality requirement
            if r.aQual < minaqual:
                lowqual += 1
                continue

            # Check whether the read was marked as a duplicate
            if r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Store read coordinates
            iv_seq = (co.ref_iv for co in r.cigar
                      if co.type in match_types and co.size > 0)

        else:  # paired-end read mapping
            # Store pair coordinates
            try:
                first_r, second_r = r
            except ValueError:
                notaligned += 1
                continue

            if first_r is None or second_r is None:
                notaligned += 1
                continue

            if first_r is not None and first_r.aligned:
                iv_seq = (co.ref_iv for co in first_r.cigar
                          if co.type in match_types and co.size > 0)
            else:
                iv_seq = tuple()

            if second_r is not None and second_r.aligned:
                iv_seq = itertools.chain(
                    iv_seq,
                    (co.ref_iv for co in second_r.cigar
                     if co.type in match_types and co.size > 0))
            else:
                if (first_r is None) or not (first_r.aligned):
                    notaligned += 1
                    continue

            # Check whether either read aligned more than once
            try:
                if (first_r.optional_field("NH") > 1) or \
                        (second_r.optional_field("NH") > 1):
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(first_r.read.name),
                          file=sys.stderr)
                    continue
            except KeyError:
                pass

            # Check if both reads passed the quality requirement
            if first_r.aQual < minaqual or second_r.aQual < minaqual:
                lowqual += 1
                continue

            # Check if the read pair was marked as a duplicate
            if first_r.pcr_or_optical_duplicate or \
                    second_r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Append fragment length/insert-size to distribution
            try:
                fld.append(first_r.inferred_insert_size)
            except AttributeError:
                pass

        # Handle case where reads might overlap more than one feature
        try:
            if overlap_mode == "union":
                fs = set()  # store feature names when reads align
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            else:  # intersection
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        if len(fs2) > 0 or \
                                overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)

            # If a read correctly mapped to a feature, increment its abundance
            if not fs:
                empty += 1
                continue
            elif len(fs) > 1:
                ambiguous += 1
                if not multi_aln:
                    continue
            else:
                aln_totals += 1

            for fsi in list(fs):
                counts[fsi]['count'] += 1

        except UnknownChrom:
            empty += 1

    unaln_totals = empty + ambiguous + lowqual + notaligned + nonunique + \
        duplicate
    nmapped = aln_totals + empty + ambiguous + nonunique + lowqual

    def sum_count_rates():
        """Return the sum over all features of count / length."""
        return sum(counts[j]['count'] / counts[j]['length'] for j in counts)

    # The length function does not depend on the unit, so choose it (and
    # emit its warning) once instead of once per output file.
    if are_transcripts and not pe_mode:
        print(
            "warning: unable to calculate effective length from single-end "
            "reads. Will use sequence length instead.\n", file=sys.stderr)
        calc_length = return_first_arg
    elif are_transcripts and pe_mode:
        calc_length = compute_effective_length
    else:
        calc_length = return_first_arg

    unkwn_feat = 0
    no_map = 0
    for unit in out_handles:
        # Set scaling function
        if unit == 'fpk':
            norm_method = scale_abundance_fpk
            scaling_factor = None
        elif unit == 'fpkm':
            norm_method = scale_abundance_fpkm
            # Scaling factor is all mapped reads
            scaling_factor = nmapped
            print("info: the total number of mapped reads used in calculation "
                  "of FPKM is {!s}.\n".format(nmapped), file=sys.stderr)
        elif unit == 'tpm':
            norm_method = scale_abundance_tpm
            rate_sum = sum_count_rates()
            print("info: the sum of all counts per bp rates used in "
                  "estimating fragment proportions is {:.2f}.\n"
                  .format(rate_sum), file=sys.stderr)
            # Scaling factor is sum of all reads per base rates
            scaling_factor = rate_sum
        elif unit == 'custom':
            norm_method = scale_abundance_prop
            rate_sum = sum_count_rates()
            print("info: the sum of all counts per bp rates used in "
                  "estimating fragment proportions is {:.2f}.\n"
                  .format(rate_sum), file=sys.stderr)
            scaling_factor = args.sfactor / rate_sum
        elif unit == 'prop':
            norm_method = scale_abundance_prop
            rate_sum = sum_count_rates()
            print("info: the sum of all counts per bp rates used in "
                  "estimating fragment proportions is {:.2f}.\n"
                  .format(rate_sum), file=sys.stderr)
            scaling_factor = 1 / rate_sum
        else:  # default is counts
            norm_method = scale_abundance_none
            scaling_factor = None

        out_h = out_handles[unit]

        # Abundance normalization
        abundances = {}
        unkwn_feat = 0
        no_map = 0
        for feature in counts:
            fcount = counts[feature]['count']
            flen = calc_length(counts[feature]['length'], fld)
            feature_abundance = norm_method(fcount, flen, scaling_factor)

            # Map to higher order features, if applicable
            if category_field:
                try:
                    # Ensure that feature has corresponding entry in database
                    feature_map = mapping[feature]
                except KeyError:
                    no_map += 1
                    if not category_only:
                        # Keep all features, even the uncategorized ones
                        abundances[feature] = abundances.get(feature, 0) + \
                            feature_abundance
                    continue

                try:
                    # Ensure that entry has relevant category field
                    category = feature_map[category_field]
                except KeyError:
                    unkwn_feat += 1
                    if not category_only:
                        abundances[feature] = abundances.get(feature, 0) + \
                            feature_abundance
                    continue

                # Handle case where feature has more than one category, such
                # as if a protein sequence is assigned to more than one gene
                # family
                categories = category if isinstance(category, list) \
                    else [category]
                for cat in categories:
                    # Read and write the same (stripped) key; looking up the
                    # unstripped name while storing the stripped one dropped
                    # previously accumulated abundance.
                    cat_key = cat.lstrip()
                    abundances[cat_key] = abundances.get(cat_key, 0) + \
                        feature_abundance
            else:
                abundances[feature] = abundances.get(feature, 0) + \
                    feature_abundance

        # "UNMAPPED" can be interpreted as a single unknown gene of length one
        # kilobase recruiting all reads that failed to map to input features
        #abundances['UNMAPPED'] = unaln_totals

        # Output abundances sorted by key name
        for fn in sorted(abundances):
            if not fn.startswith("unkwn_"):
                write_io(out_h, "{}\t{!s}\n".format(fn, abundances[fn]))

        out_h.close()

    # The counters are identical for every unit, so report them once.
    if unkwn_feat > 0:
        print("warning: found '{!s}' features without the '{}' field in the "
              "relational database.\n".format(unkwn_feat, category_field),
              file=sys.stderr)

    if no_map > 0:
        print("warning: found {!s} features without an entry in the "
              "relational database.\n".format(no_map), file=sys.stderr)

    # Output statistics
    print("Features processed:", file=sys.stderr)
    print(" - feature totals:\t{!s}".format(f_totals), file=sys.stderr)
    if feature_type:
        print(" - of relevant type:\t{!s}".format(ftype_totals),
              file=sys.stderr)
    print(" - unique features:\t{!s}".format(len(counts)), file=sys.stderr)
    print("Reads processed:", file=sys.stderr)
    print(" - read totals:\t{!s}".format(r_totals), file=sys.stderr)
    print(" - successfully mapped:\t{!s}".format(aln_totals),
          file=sys.stderr)
    if multi_aln:
        print(" - ambiguous alignment:\t{!s}".format(ambiguous),
              file=sys.stderr)
    print(" - unsuccessfully mapped:\t{!s}".format(unaln_totals),
          file=sys.stderr)
    print(" - no feature\t{!s}".format(empty), file=sys.stderr)
    if not multi_aln:
        print(" - ambiguous alignment\t{!s}".format(ambiguous),
              file=sys.stderr)
    print(" - too low alignment quality\t{!s}".format(lowqual),
          file=sys.stderr)
    print(" - not aligned\t{!s}".format(notaligned), file=sys.stderr)
    print(" - duplicate\t{!s}".format(duplicate), file=sys.stderr)
    print(" - alignment not unique\t{!s}".format(nonunique),
          file=sys.stderr)
    print("", file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to count {!s} fragments for {!s} features"
          .format(total_time, r_totals, f_totals), file=sys.stderr)
    print("", file=sys.stderr)
def htseq_read_gtf(fg):
    """Load the exons of a GTF/GFF file into a stranded genomic array of sets.

    Args:
        fg: the annotation file, handed to ``HTSeq.GFF_Reader``.

    Returns:
        An ``HTSeq.GenomicArrayOfSets`` (stranded, "auto" chromosomes) in
        which every interval of a feature of type ``'exon'`` carries that
        feature's ``gene_id`` attribute.

    Raises:
        KeyError: if an exon feature has no ``gene_id`` attribute.
    """
    gtf = HTSeq.GFF_Reader(fg)
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for feat in gtf:
        if feat.type == 'exon':
            exons[feat.iv] += feat.attr['gene_id']
    # The array used to be built and then dropped (implicit None return),
    # which left callers with no way to use the parsed annotation.
    return exons
def count_reads_in_features(sam_filenames, gff_filename, samtype, order,
                            max_buffer_size, stranded, overlap_mode,
                            multimapped_mode, secondary_alignment_mode,
                            supplementary_alignment_mode, feature_type,
                            id_attribute, additional_attributes, quiet,
                            minaqual, samouts):
    """Count, per input alignment file, the reads overlapping each feature.

    Features of type ``feature_type`` are loaded from ``gff_filename`` and
    keyed by their ``id_attribute``. Every alignment file is then scanned and
    each read (or read pair) is assigned to the feature set it overlaps. The
    result is printed to standard output as a tab-separated table: one row
    per feature ID (sorted), followed by the special rows ``__no_feature``,
    ``__ambiguous``, ``__too_low_aQual``, ``__not_aligned`` and
    ``__alignment_not_unique``, with one count column per input file.

    Args:
        sam_filenames: list of alignment file paths; ``'-'`` reads stdin.
        gff_filename: annotation file handed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; how paired-end input is sorted.
        max_buffer_size: forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order == "pos"``.
        stranded: ``"no"`` builds an unstranded feature array; ``"reverse"``
            inverts the strand used for each mate; any other value uses the
            read strand as is (second mate inverted).
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` (count only reads assigned to exactly
            one feature; skip reads with NH > 1) or ``"all"`` (count every
            overlapped feature).
        secondary_alignment_mode: ``'ignore'`` skips secondary alignments.
        supplementary_alignment_mode: ``'ignore'`` skips supplementary
            alignments.
        feature_type: GFF feature type to count on.
        id_attribute: GFF attribute used as the feature ID.
        additional_attributes: further GFF attributes echoed as extra columns
            (empty string when a feature lacks one).
        quiet: suppress progress messages on stderr.
        minaqual: reads with ``aQual`` below this are counted as low quality.
        samouts: list of per-input annotated SAM output paths, or ``[]``.

    Raises:
        ValueError: unknown ``samtype``; mismatched number of ``samouts``;
            a feature without ``id_attribute``; a strandless feature in
            stranded mode; or an illegal ``order`` for paired-end input.
        SystemExit: on an illegal ``overlap_mode`` or ``multimapped_mode``.
    """

    def write_to_samout(r, assignment, samoutfile):
        # Append the assignment as an 'XF' optional field and write the SAM
        # line(s). `pe_mode` is read from the enclosing scope, where it is
        # set per input file before any read is processed.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    if samouts != []:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                'Select the same number of {:} input and output files'.format(samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    # Number of GFF lines seen so far (all types), used for progress output.
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
                attributes[f.attr[id_attribute]] = [
                    f.attr[attr] if attr in f.attr else ''
                    for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Report where the GFF parser was, then let the error propagate.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    # Per-input-file results, one entry appended per alignment file.
    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, (sam_filename) in enumerate(sam_filenames):
        if samouts != []:
            samoutfile = open(samouts[isam], 'w')
        else:
            samoutfile = None

        try:
            if sam_filename == "-":
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
            else:
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
            read_seq_iter = iter(read_seq_file)
            # Catch empty BAM files
            try:
                first_read = next(read_seq_iter)
                pe_mode = first_read.paired_end
            except:
                first_read = None
                pe_mode = False
            if first_read is not None:
                # Put the peeked record back in front of the stream.
                read_seq = itertools.chain([first_read], read_seq_iter)
            else:
                read_seq = []
        except:
            sys.stderr.write(
                "Error occured when reading beginning of {:} file.\n".format(
                    samname))
            raise

        try:
            if pe_mode:
                if ((supplementary_alignment_mode == 'ignore') and
                        (secondary_alignment_mode == 'ignore')):
                    primary_only = True
                else:
                    primary_only = False
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(
                        read_seq,
                        primary_only=primary_only)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
                else:
                    raise ValueError("Illegal order specified.")
            # Counters for the special categories of this input file.
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d %s alignment record%s processed.\n" %
                        (i, samname, "s" if not pe_mode else " pairs"))
                    sys.stderr.flush()

                i += 1
                if not pe_mode:
                    # Single-end: r is one alignment record.
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned", samoutfile)
                        continue
                    if ((secondary_alignment_mode == 'ignore') and
                            r.not_primary_alignment):
                        continue
                    if ((supplementary_alignment_mode == 'ignore') and
                            r.supplementary):
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(
                                r,
                                "__alignment_not_unique",
                                samoutfile)
                            # Only in 'none' mode is the read dropped here;
                            # otherwise it is still assigned below.
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        # No NH field: treated as uniquely aligned.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv)
                                  for co in r.cigar
                                  if (co.type in com and co.size > 0))
                else:
                    # Paired-end: r is a pair (r[0], r[1]); either mate may
                    # be None. The second mate's strand is inverted relative
                    # to the first mate's treatment.
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                      if co.type in com and co.size > 0)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                    else:
                        # Second mate unusable: the pair is "not aligned"
                        # only if the first mate is unusable as well.
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned", samoutfile)
                            notaligned += 1
                            continue
                    if secondary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].not_primary_alignment:
                            continue
                        elif (r[1] is not None) and r[1].not_primary_alignment:
                            continue
                    if supplementary_alignment_mode == 'ignore':
                        if (r[0] is not None) and r[0].supplementary:
                            continue
                        elif (r[1] is not None) and r[1].supplementary:
                            continue
                    try:
                        if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                                (r[1] is not None and r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique", samoutfile)
                            if multimapped_mode == 'none':
                                continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual", samoutfile)
                        continue

                try:
                    if overlap_mode == "union":
                        # Every feature touched by any aligned position.
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode in ("intersection-strict",
                                          "intersection-nonempty"):
                        # Features common to the steps; 'nonempty' ignores
                        # steps that carry no feature at all.
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if ((len(fs2) > 0) or
                                        (overlap_mode == "intersection-strict")):
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]",
                                        samoutfile)
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0], samoutfile)

                    if fs is not None and len(fs) > 0:
                        if multimapped_mode == 'none':
                            if len(fs) == 1:
                                counts[list(fs)[0]] += 1
                        elif multimapped_mode == 'all':
                            for fsi in list(fs):
                                counts[fsi] += 1
                        else:
                            sys.exit("Illegal multimap mode.")

                except UnknownChrom:
                    # Chromosome absent from the annotation: no feature.
                    write_to_samout(r, "__no_feature", samoutfile)
                    empty += 1

        except:
            sys.stderr.write(
                "Error occured when processing %s input (%s):\n" %
                (samname, read_seq_file.get_line_number_string()))
            raise

        if not quiet:
            sys.stderr.write(
                "%d %s %s processed.\n" %
                (i, samname, "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        if samoutfile is not None:
            samoutfile.close()

        # Snapshot this file's counts, then reset the shared dict so the next
        # input file starts from zero.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Empty cells that keep the special rows aligned with the additional
    # attribute columns of the feature rows.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] + [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))