def main():
    """Write insert-size and indel-length histograms for an indexed BAM file.

    Command line: ``[-v] <bam file> <read length> <result dir>``.

    Every alignment returned by ``fetch()`` with a non-zero ``isize`` is
    counted.  If ``isize - 2 * read_length`` is positive, that value is added
    to the insert-size histogram.  Each insertion (CIGAR op 1) and deletion
    (CIGAR op 2) contributes its length to the corresponding histogram.

    Four files are written into the result directory:

    * ``histogram-data.insert-sizes``
    * ``histogram-data.length-deletion-splits``
    * ``histogram-data.length-insertion-splits``
    * ``histogram-data.meta`` -- one tab-separated line holding the number of
      alignments counted, the number containing at least one deletion and the
      number containing at least one insertion.

    Returns:
        1 if the number of positional arguments is not three (the help text
        is printed first), otherwise None.
    """
    import os

    parser = OptionParser(usage=usage)
    parser.add_option(
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Verbose. Prints regularly how many alignments have been processed.")
    (options, args) = parser.parse_args()
    if len(args) != 3:
        parser.print_help()
        return 1

    bam_reader = pysam.Samfile(args[0], "rb")
    try:
        read_length = int(args[1])
        result_dir = args[2]

        insert_sizes = CountData()  # insert size histogram
        length_deletion_splits = CountData()  # length of deletion splits histogram
        length_insertion_splits = CountData()  # length of insertion splits histogram

        # number of alignments found
        n_align = 0
        n_align_del_split = 0
        n_align_ins_split = 0

        for align in bam_reader.fetch():
            if align.isize == 0:
                # no insert size recorded: treated as unmapped / unusable
                continue
            inner_size = align.isize - 2 * read_length
            if inner_size > 0:
                insert_sizes.add(inner_size, 1)
            n_align += 1
            if options.verbose and n_align % 100000 == 0:
                print('Having processed %d alignments' % n_align,
                      file=sys.stderr)

            insertion_split_present = False
            deletion_split_present = False
            # walk through the cigar string (may be None when absent)
            for (cigar_type, cigar_length) in align.cigar or ():
                if cigar_type == 1:  # insertion
                    insertion_split_present = True
                    length_insertion_splits.add(cigar_length, 1)
                elif cigar_type == 2:  # deletion
                    deletion_split_present = True
                    length_deletion_splits.add(cigar_length, 1)
            if deletion_split_present:
                n_align_del_split += 1
            if insertion_split_present:
                n_align_ins_split += 1
    finally:
        bam_reader.close()

    # print results to file; os.path.join also copes with an empty or
    # slash-terminated result directory
    histograms = (
        ('histogram-data.insert-sizes', insert_sizes),
        ('histogram-data.length-deletion-splits', length_deletion_splits),
        ('histogram-data.length-insertion-splits', length_insertion_splits),
    )
    for file_name, histogram in histograms:
        with open(os.path.join(result_dir, file_name), 'w') as handle:
            histogram.print(handle)
    with open(os.path.join(result_dir, 'histogram-data.meta'), 'w') as handle:
        print("%d\t%d\t%d" % (n_align, n_align_del_split, n_align_ins_split),
              file=handle)
def readThread(samfile_path, Coords, Strand):
    """Count read pairs that thread into, or out of, a repeat element (TE).

    Parameters
    ----------
    samfile_path :
        BAM file to analyse (opened in "rb" mode and queried by region).
    Coords :
        3-element object: [0] chromosome, [1] start, [2] end of the element.
        The region queried is ``chr<chromosome>:<start>-<end>``.
    Strand :
        True = positive strand, False = negative strand.

    Returns
    -------
    tuple
        Positive strand: ``(upstream + forced upstream, downstream)``.
        Negative strand: ``(downstream + forced upstream, upstream)``, i.e.
        the positive-strand classification is flipped.

    Only reads that are paired, whose mate is on the same reference and whose
    mapping quality is > 0 are classified.  The mate's end is estimated as
    ``mate start + length of this read``.

    Upstream   = evidence of transcription going into the TE
    Downstream = evidence of transcription being generated in the TE

    Read classes relative to the element (``;`` marks the leftmost position of
    a read, ``====`` its aligned length)::

                  RepeatStart         RepeatEnd
                       |________TE_________|
           ;====       |                   |            Left
                     ;====                 |            LeftEdge
                       |      ;====        |            Internal
                       |                 ;====          RightEdge
                       |                   |   ;====    Right
                     ;========================          Span

    Decision table implemented below (assuming the positive strand):

    ==========  ====================================  ==============
    Read        Mate                                  Result
    ==========  ====================================  ==============
    Span        (not examined)                        Force upstream
    Left        (not examined)                        Discard
    Right       (not examined)                        Discard
    LeftEdge    spans the element                     Discard
    LeftEdge    ends right of RepeatEnd, not a span   Force upstream
    LeftEdge    ends at or left of RepeatEnd          Upstream
    RightEdge   left edge or span                     Discard
    RightEdge   completely left of the element        Force upstream
    RightEdge   starts right of RepeatStart           Downstream
    Internal    completely right of the element       Downstream
    Internal    completely left of the element        Upstream
    Internal    anything else                         Discard
    ==========  ====================================  ==============
    """
    # Import reads overlapping the element coordinates
    # [0] = Chromosome, [1] = Start, [2] = End
    ParsedCoord = 'chr{0[0]}:{0[1]}-{0[2]}'.format(Coords)
    RepeatStart = int(Coords[1])
    RepeatEnd = int(Coords[2])

    # Initialize output
    discardThread = 0  # counted for symmetry / debugging, not returned
    forceUpThread = 0
    upThread = 0
    downThread = 0

    samfile = pysam.Samfile(samfile_path, "rb")
    try:
        # Assume +ve strand, flip results at the end if negative strand
        for read in samfile.fetch(region=ParsedCoord):
            # Paired reads on the same chromosome with non-zero MAPQ only
            if not (read.is_paired and read.tid == read.rnext
                    and int(read.mapq) > 0):
                continue

            # read.pos / read.mpos are the leftmost positions of read / mate
            ReadLength = read.rlen
            ReadStart = read.pos
            ReadEnd = ReadStart + ReadLength
            MateStart = read.mpos
            MateEnd = MateStart + ReadLength

            if ReadStart <= RepeatStart:
                # Read starts at or left of the repeat start
                if ReadEnd <= RepeatStart:
                    # Read is completely upstream of the repeat --> Discard
                    discardThread += 1
                elif ReadEnd > RepeatEnd:
                    # Read spans the entire repeat --> Force Upstream
                    forceUpThread += 1
                elif MateEnd > RepeatEnd:
                    # Read is on the left edge, mate ends right of the repeat
                    if MateStart <= RepeatStart:
                        # Mate spans the repeat --> Discard
                        discardThread += 1
                    else:
                        # Mate is on the right edge or right of the repeat
                        # --> Force Upstream
                        forceUpThread += 1
                else:
                    # Read on left edge; mate is left, internal or on the
                    # left edge --> Upstream
                    upThread += 1
            elif ReadEnd > RepeatEnd:
                # Read starts right of the repeat start and ends right of
                # the repeat end
                if ReadStart > RepeatEnd:
                    # Read is completely downstream of the repeat --> Discard
                    discardThread += 1
                elif MateStart > RepeatStart:
                    # Read on right edge; mate is internal, on the right
                    # edge or right of the repeat --> Downstream
                    downThread += 1
                elif MateEnd > RepeatStart:
                    # Mate is on the left edge or spans the repeat --> Discard
                    discardThread += 1
                else:
                    # Mate is completely upstream --> Force Upstream
                    forceUpThread += 1
            else:
                # Read is internal to the repeat
                if MateStart > RepeatEnd:
                    # Mate is right of the repeat --> Downstream
                    downThread += 1
                elif MateEnd <= RepeatStart:
                    # Mate is left of the repeat --> Upstream
                    upThread += 1
                else:
                    # Mate is on an edge, internal or spans --> Discard
                    discardThread += 1
    finally:
        samfile.close()

    # Output
    if Strand:
        # True = Positive Strand Orientation
        upThread = upThread + forceUpThread
        localResults = (upThread, downThread)
    else:
        # False = Negative Strand
        downThread = downThread + forceUpThread
        localResults = (downThread, upThread)
    return localResults
bar = progressbar.ProgressBar(maxval=total_reads, widgets=[ ' [', progressbar.Timer(), '] ', progressbar.Bar(), ' (', progressbar.ETA(), ') ' ]).start() except: # if BAM not indexed or some other issue, don't show progress bar print( 'Could not get total reads (BAM may not be indexed), skipping progress bar...' ) show_progress_bar = False read_number = 0 for read in pysam.Samfile(bam): # Progress bar if show_progress_bar and (read_number % 100000 == 0 or read_number == total_reads - 1): bar.update(read_number) read_number += 1 # Ignore mapped reads (can't belong to guide transcripts...) ## unless the user wants all reads processed if not read.is_unmapped and not args.all_reads: continue seq = read.seq.upper() tags = dict(read.tags) cell = tags.get('CB', None)
# elif len(words)==2 and words[1].split("=")[0]=="/label": # region_names[location]=words[1].split("=")[1].replace('"','') # for region in regions: # try: # print regions[region], region_names[region], region_lengths[region] # except StandardError: # print regions[region], region, region_lengths[region] print "Finding regions in bam files" sys.stdout.flush() for filename in args: print "\t"+filename+"..." sys.stdout.flush() if filename.split(".")[-1]=="bam": samfile = pysam.Samfile( filename, "rb" ) elif filename.split(".")[-1]=="sam": samfile = pysam.Samfile( filename, "r" ) else: print filename, "not a readable bam file" continue refs=samfile.references lengths=samfile.lengths if len(refs)!=len(refseqs): DoError("bam has different number of reference sequences to reference fasta file") else: for ref in refs: if not ref in refseqs: DoError("bam and reference fasta file do not match")
def sam_to_allele_counts(sam_fname, paired=False, qual_min=30, max_reads=-1,
                         max_isize = 700, VERBOSE = 0,
                         fwd_primer_regions = None, rev_primer_regions = None):
    '''
    calculates the allele counts for a set of mapped reads
    parameters:
    sam_fname   --   sam or bam file with mapped reads
    paired      --   differentiates between read one or two if True
                     otherwise the two reads are lumped together
    qual_min    --   Ignore bases with quality less than qual_min
    max_reads   --   stop once the read with this 0-based index is reached
                     (the default -1 never matches, i.e. no limit)
    max_isize   --   maximal insert sizes to consider. this can be used to
                     remove artifactual mappings
    VERBOSE     --   verbosity level of the progress/diagnostic output
    fwd_primer_regions, rev_primer_regions
                --   optional mappings reference name -> iterable of
                     (begin, end) primer intervals; read bases falling into a
                     primer are masked and not counted

    returns a list with one tuple per reference sequence:
        (reference name, count array, insertion dictionary)
    The count array has shape (2, 6, length), indexed by
    [is_reverse, symbol, position], or (2, 2, 6, length), indexed by
    [is_read2, is_reverse, symbol, position], when paired is True.  Symbols
    are indexed in the order of nuc_alpha; deletions are counted at index 4.
    '''
    import pysam
    from collections import defaultdict

    alpha = nuc_alpha

    def as_bytes(text):
        # pysam hands out str on Python 3; np.frombuffer needs a bytes-like
        return text if isinstance(text, bytes) else text.encode('ascii')

    def ac_array(length, paired):
        if paired:
            return np.zeros((2,2,6,length), dtype =int)
        else:
            return np.zeros((2,6,length), dtype = int)

    # Note: the data structure for inserts is a nested dict with:
    # position --> string --> read type --> count
    #  (dict)      (dict)       (list)      (int)
    def insertion_data_structure(paired):
        if paired:
            return defaultdict(lambda: defaultdict(lambda: np.zeros((2,2), int)))
        else:
            return defaultdict(lambda: defaultdict(lambda: np.zeros(2, int)))

    # Open BAM or SAM file
    with pysam.Samfile(sam_fname) as samfile:
        ac = []
        refs = {}
        for nref in range(samfile.nreferences):
            if VERBOSE:
                print("allocating for:", samfile.getrname(nref),
                      "length:", samfile.lengths[nref])
            refs[nref] = samfile.getrname(nref)
            ac.append((samfile.getrname(nref),
                       ac_array(samfile.lengths[nref], paired),
                       insertion_data_structure(paired)))

        # Iterate over single reads
        for i, read in enumerate(samfile):
            # Max number of reads
            if i == max_reads:
                if VERBOSE >= 2:
                    print('Max reads reached:', max_reads)
                break

            if (read.is_unmapped or np.abs(read.isize) > max_isize
                    or read.is_secondary or read.is_supplementary):
                continue

            # Print output
            if (VERBOSE > 2) and (not ((i + 1) % 10000)):
                print(i + 1)

            # Read CIGARs (they should be clean by now)
            if paired:
                counts = ac[read.rname][1][int(read.is_read2), int(read.is_reverse)]
            else:
                counts = ac[read.rname][1][int(read.is_reverse)]
            insertion = ac[read.rname][2]

            # np.fromstring (binary mode) is deprecated and removed from
            # recent NumPy releases; frombuffer yields the same arrays.
            seq = np.frombuffer(as_bytes(read.seq), 'S1')
            qual = np.frombuffer(as_bytes(read.qual), np.int8) - 33
            not_primer = np.ones_like(seq, 'bool')
            pos = read.pos

            # all legit reads should be FR or RF!
            # NOTE(review): both primer tests use strict inequalities, so a
            # read starting exactly at a forward primer start (or ending
            # exactly at a reverse primer end) is not masked -- confirm that
            # this is intended.
            if rev_primer_regions:
                if read.is_reverse or np.abs(read.isize) == seq.shape[0]:
                    read_end = pos + seq.shape[0]
                    for b, e in rev_primer_regions[refs[read.rname]]:
                        p_length = e - b
                        if read_end - b > 0 and read_end - b < p_length:
                            not_primer[-(read_end - b):] = False
                            break

            if fwd_primer_regions:
                if (not read.is_reverse) or np.abs(read.isize) == seq.shape[0]:
                    for b, e in fwd_primer_regions[refs[read.rname]]:
                        p_length = e - b
                        if pos - b > 0 and pos - b < p_length:
                            not_primer[:e - pos] = False
                            break

            # Iterate over CIGARs
            n_blocks = len(read.cigar)
            for ic, (block_type, block_len) in enumerate(read.cigar):
                if block_type == 4:  # softclip
                    seq = seq[block_len:]
                    qual = qual[block_len:]
                    # note the difference here: the reported position starts
                    # after the softclip. hence the not_primer is already
                    # correct and is trimmed at its end instead
                    not_primer = not_primer[:-block_len]
                    continue
                if block_type == 5:  # hard clip
                    continue

                # Inline block
                if block_type == 0:
                    seqb = seq[:block_len]
                    qualb = qual[:block_len]
                    not_primerb = not_primer[:block_len]
                    # Increment counts
                    for j, a in enumerate(alpha):
                        posa = ((seqb == a) & (qualb >= qual_min)
                                & (not_primerb)).nonzero()[0]
                        if len(posa):
                            counts[j, pos + posa] += 1

                    # Chop off this block
                    if ic != n_blocks - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        not_primer = not_primer[block_len:]
                        pos += block_len

                # Deletion
                elif block_type == 2:
                    # Increment gap counts
                    counts[4, pos:pos + block_len] += 1
                    # Chop off pos, but not sequence
                    pos += block_len

                # Insertion
                # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                # THEN the insert, FINALLY comes seq[391:]
                elif block_type == 1:
                    seqb = seq[:block_len]
                    qualb = qual[:block_len]
                    # Accept only high-quality inserts
                    if (qualb >= qual_min).all():
                        # tobytes() replaces the removed ndarray.tostring()
                        # and produces the same dictionary key
                        if paired:
                            insertion[pos][seqb.tobytes()][int(read.is_read2),
                                                           int(read.is_reverse)] += 1
                        else:
                            insertion[pos][seqb.tobytes()][int(read.is_reverse)] += 1

                    # Chop off seq, but not pos
                    if ic != n_blocks - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        not_primer = not_primer[block_len:]

                # Other types of cigar?
                else:
                    if VERBOSE > 2:
                        print("unrecognized CIGAR type:", read.cigarstring)

    return ac
def estimateInsertSizeDistribution(bamfile,
                                   alignments=10000,
                                   n=10,
                                   method="picard",
                                   similarity_threshold=1.0,
                                   max_chunks=1000):
    '''estimate insert size from a subset of alignments in a bam file.

    Several methods are implemented.

    picard
        The method works analogous to picard by restricting the estimates
        to a core distribution. The core distribution is defined as all
        values that lie within n-times the median absolute deviation of
        the full data set.
    convergence
        The method works similar to ``picard``, but continues reading
        `alignments` until the mean and standard deviation stabilize.
        The values returned are the median mean and median standard
        deviation encountered.

    The method `convergence` is suited to RNA-seq data, as insert sizes
    fluctuate significantly depending on the current region
    being looked at.

    Only mapped, non-duplicate proper pairs are considered in the
    computation.  To avoid counting a pair twice, only alignments that are
    not flagged as read1 and that have a positive template length are used.

    Arguments
    ---------
    bamfile : string
        Filename of the paired-end bam file.
    alignments : int
        Number of alignments to read (per chunk for ``convergence``).
    n : int
        Width of the core distribution in multiples of the median
        absolute deviation.
    method : string
        Estimation method, ``picard`` or ``convergence``.
    similarity_threshold : float
        ``convergence`` stops once the running mean changes by less than
        this value between two chunks.
    max_chunks : int
        Maximum number of chunks of size `alignments` to be used
        in the convergence method.

    Returns
    -------
    mean : float
        Mean of insert sizes.
    stddev : float
        Standard deviation of insert sizes.
    npairs : int
        Number of read pairs used for the estimation

    Raises
    ------
    ValueError
        If `method` is not one of the implemented methods.
    '''

    assert isPaired(bamfile), \
        'can only estimate insert size from ' \
        'paired bam files'

    if method not in ("picard", "convergence"):
        raise ValueError("unknown method '%s'" % method)

    def get_core_distribution(inserts, n):
        # compute median absolute deviation
        raw_median = numpy.median(inserts)
        raw_median_dev = numpy.median(numpy.absolute(inserts - raw_median))

        # set thresholds
        threshold_min = max(0, raw_median - n * raw_median_dev)
        threshold_max = raw_median + n * raw_median_dev

        # define core distribution
        return inserts[numpy.logical_and(inserts >= threshold_min,
                                         inserts <= threshold_max)]

    def collect_inserts(reads):
        # returns (usable insert sizes, number of alignments looked at); the
        # latter distinguishes "end of file" from "nothing usable in chunk"
        sizes = []
        n_seen = 0
        for read in reads:
            n_seen += 1
            if (read.is_proper_pair
                    and not read.is_unmapped
                    and not read.mate_is_unmapped
                    and not read.is_read1
                    and not read.is_duplicate
                    and read.template_length > 0):
                sizes.append(read.template_length)
        return numpy.array(sizes), n_seen

    samfile = pysam.Samfile(bamfile)
    try:
        if method == "picard":
            inserts, _ = collect_inserts(samfile.head(n=alignments))
            core = get_core_distribution(inserts, n)
            return numpy.mean(core), numpy.std(core), len(inserts)

        # method == "convergence"
        means, stds, counts = [], [], []
        last_mean = 0
        # bounded loop: max_chunks is now actually enforced
        for _ in range(max_chunks):
            inserts, n_seen = collect_inserts(
                samfile.head(n=alignments, multiple_iterators=False))
            if n_seen == 0:
                # end of file reached without convergence
                break
            if len(inserts) == 0:
                # nothing usable in this chunk; a NaN here would poison
                # every later median
                continue

            core = get_core_distribution(inserts, n)
            means.append(numpy.mean(core))
            stds.append(numpy.std(core))
            counts.append(len(inserts))
            mean_core = get_core_distribution(numpy.array(means), 2)
            mm = numpy.mean(mean_core)
            if abs(mm - last_mean) < similarity_threshold:
                break
            last_mean = mm

        return numpy.median(means), numpy.median(stds), sum(counts)
    finally:
        samfile.close()
def main(argv=sys.argv):
    """Command line entry point: split reads from one BAM file into another.

    Parses the options, opens the input BAM and an output BAM that reuses
    the input header as template, and hands both to ``bam2bam_split_reads``.
    Both files are closed afterwards, also when processing fails, so that
    the output is flushed.

    Arguments
    ---------
    argv : list
        Command line arguments; defaults to ``sys.argv``.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-bam", dest="input_bam_file",
                      type="string",
                      help="input bam file [%default]")

    parser.add_option("-o", "--output-bam", dest="output_bam_file",
                      type="string",
                      help="output bam file [%default].")

    parser.add_option("-r", "--max-read-length", dest="max_read_length",
                      type="int",
                      help="maximum read length [%default].")

    parser.add_option(
        "-m", "--output-mode", dest="output_mode", type="choice",
        choices=["buffered", "direct"],
        help="output mode for files. 'buffered' will output reads in correct "
        "sort order, 'direct' will require the output BAM file to be sorted separately."
        "[%default].")

    parser.add_option(
        "--region", dest="region", type="string",
        help="genomic region, only split in BAM file within this region [%default]."
    )

    # default_quality_score has no command line option; it is only
    # configurable through this default
    parser.set_defaults(
        input_bam_file="-",
        output_bam_file="-",
        max_read_length=100,
        default_quality_score=10,
        region=None,
        output_mode="buffered",
    )

    (options, args) = E.start(parser, argv)

    pysam_in = pysam.Samfile(options.input_bam_file, "rb")
    try:
        pysam_out = pysam.Samfile(options.output_bam_file, "wb",
                                  template=pysam_in)
        try:
            bam2bam_split_reads(
                pysam_in,
                pysam_out,
                default_quality_score=options.default_quality_score,
                max_read_length=options.max_read_length,
                output_mode=options.output_mode)
        finally:
            pysam_out.close()
    finally:
        pysam_in.close()

    E.stop()
def main():
    """Print a per-transcript table of TIN scores, one column per BAM file.

    For every transcript yielded by ``genomic_positions`` and every input
    BAM file, the coverage at the picked positions is turned into a score
    with ``tin_score``.  Transcripts failing ``check_min_reads`` get 0.0.
    With ``-s`` the intronic background estimated by ``estimate_bg_noise``
    is passed to ``genebody_coverage`` as noise level.

    The table (header ``transcript<TAB>name1<TAB>name2...``) is written to
    stdout, progress and diagnostics go to stderr.  When ``--names`` is not
    given, the base names of the BAM files are used as column names.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input", action="store", type="string", dest="input_files",
        help='Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]'
    )
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="ref_gene_model",
        help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]"
    )
    parser.add_option(
        "-c", "--minCov", action="store", type="int", dest="minimum_coverage",
        default=10,
        help="Minimum number of read mapped to a transcript. default=%default")
    parser.add_option(
        "-n", "--sample-size", action="store", type="int", dest="sample_size",
        default=100,
        help="Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default"
    )
    parser.add_option(
        "--names", dest="sample_names", action="store", type="string",
        help="sample names, comma separated (no spaces allowed); number must match the number of provided bam_files"
    )
    parser.add_option(
        "-s", "--subtract-background", action="store_true", dest="subtract_bg",
        help="Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads."
    )
    (options, args) = parser.parse_args()

    log = sys.stderr.write

    if options.sample_size < 0:
        log("Number of nucleotide can't be negative\n")
        sys.exit(0)
    elif options.sample_size > 1000:
        log("Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow.\n")

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        log('\n\n' + options.ref_gene_model + " does NOT exists" + '\n\n')
        parser.print_help()
        sys.exit(0)

    # if '-s' was set; done only after the gene model has been validated
    exon_ranges = None
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    bamfiles = options.input_files.split(",")

    if len(bamfiles) <= 0:
        log("No BAM file found, exit.\n")
        sys.exit(0)
    else:
        log("Total %d BAM file(s):\n" % len(bamfiles))
        for f in bamfiles:
            log("\t" + f + "\n")

    # --names is optional: fall back to the file names instead of crashing
    if options.sample_names:
        names = options.sample_names.split(",")
    else:
        names = [os.path.basename(f) for f in bamfiles]
    if len(names) != len(bamfiles):
        log("[ERROR] Number of bam files does not match number of names\n")
        sys.exit(2)

    # print header
    sys.stdout.write("transcript")
    for i in names:
        sys.stdout.write("\t%s" % i)
    sys.stdout.write("\n")

    sample_TINS_per_transcript = {}
    for f in bamfiles:
        printlog("Processing " + f)

        samfile = pysam.Samfile(f, "rb")
        try:
            finish = 0
            for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(
                    refbed=options.ref_gene_model,
                    sample_size=options.sample_size):
                finish += 1
                if gname not in sample_TINS_per_transcript:
                    sample_TINS_per_transcript[gname] = []

                # check minimum reads coverage
                if check_min_reads(samfile, i_chr, i_tx_start, i_tx_end,
                                   options.minimum_coverage) is not True:
                    sample_TINS_per_transcript[gname].append(0.0)
                    continue

                # estimate background noise if '-s' was specified; reset for
                # every transcript so that an intron-less transcript does not
                # inherit the noise level of the previous one
                noise_level = 0.0
                if options.subtract_bg:
                    intron_signals = estimate_bg_noise(
                        i_chr, i_tx_start, i_tx_end, samfile, exon_ranges)
                    if intron_size > 0:
                        noise_level = intron_signals / intron_size

                coverage = genebody_coverage(samfile, i_chr,
                                             sorted(pick_positions),
                                             noise_level)

                tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                sample_TINS_per_transcript[gname].append(tin1)
                if finish % 500 == 0:
                    log(" %d transcripts finished\n" % (finish))
        finally:
            samfile.close()

    # print table
    for ex in sample_TINS_per_transcript:
        sys.stdout.write("%s\t%s\n" % (
            ex, "\t".join(map(str, sample_TINS_per_transcript[ex]))))
is_dupe = 0x400  # PCR or optical duplicate


def reheader(in_sam, keepstr='dmel'):
    """Return a copy of the header of *in_sam* with a filtered 'SQ' section.

    Only sequence entries whose 'SN' value contains *keepstr* are retained;
    every other header section is taken over from the copy unchanged.
    """
    header = in_sam.header.copy()
    kept = []
    for entry in header['SQ']:
        if keepstr in entry['SN']:
            kept.append(entry)
    header['SQ'] = kept
    return header


# Operate on each file independently
for fname in sys.argv[1:]:
    data = defaultdict(lambda: [None, None])
    infile = pysam.Samfile(fname)
    # The four characters in front of the extension select which reference
    # sequences are kept in the output header.
    outfile = pysam.Samfile(
        fname[:-4] + '_rescued_unsorted.bam',
        'wb',
        header=reheader(infile, fname[-8:-4]),
    )
    irefs = infile.references
    orefs = outfile.references
    maxval, start = get_bam_length(infile)
    # For progress bar goodness
    pbar = ProgressBar(
        maxval=maxval - start,
        widgets=[fname, ': ', Percentage(), ' ', Bar(), ' ', ETA(), ' '],
    )
    pbar.start()
import sys import pysam from uuid import uuid4 inbamfn = sys.argv[1] outbamfn = sys.argv[2] inbam = pysam.Samfile(inbamfn, 'rb') outbam = pysam.Samfile(outbamfn, 'wb', template=inbam) paired = {} n = 0 p = 0 u = 0 w = 0 m = 0 for read in inbam.fetch(until_eof=True): n += 1 if read.is_paired: p += 1 if read.qname in paired: uuid = paired[read.qname] del paired[read.qname] read.qname = uuid outbam.write(read) w += 1 m += 1 else: newname = str(uuid4())
def main():
    """Count junction and linear read support at every site of a splice list.

    The first positional argument is the BAM file; it is indexed with
    ``samtools index`` first when no ``<bam>.bai`` exists.  For every
    non-empty line of ``options.ref_Splice`` (chromosome in column 1,
    position in column 2) the alignments around the position are fetched
    and their CIGAR operations are walked:

    * a match block (op 0) overlapping the site adds one to the linear count;
    * a skipped region (op 3) overlapping the site with one of its first two
      or last two bases adds one to the junction count.

    Each input line is echoed to ``options.out_file`` followed by the
    junction count and the linear count (tab separated).
    """
    parser = prepare_optparser()
    (options, args) = parser.parse_args()
    try:
        raw_bam = args[0]
        ref_Splice = options.ref_Splice
        out_file = options.out_file
    except IndexError:
        parser.print_help()
        sys.exit(1)

    if not os.path.isfile("%s.bai" % (raw_bam)):
        shell_info = "samtools index %s" % (raw_bam)
        sys.stderr.write(shell_info + "\n")
        # argument list instead of a shell string: file names with spaces or
        # shell metacharacters are passed through untouched; call() blocks
        # until indexing has finished
        subprocess.call(["samtools", "index", raw_bam])

    f_sam = pysam.Samfile(raw_bam, "rb")
    try:
        with open(ref_Splice, "r") as f_refSplice, \
                open(out_file, "w") as f_out_file:
            for line in f_refSplice:
                line = line.strip('\n')
                f = line.split()
                if not f:
                    # skip blank lines instead of failing on f[0]
                    continue
                chrom = f[0]
                beg = int(f[1])
                end = int(f[1]) + 1

                cnt_junc = 0
                cnt_linear = 0

                record = f_sam.fetch(reference=chrom, start=beg - 1,
                                     end=beg + 1)
                for rec in record:
                    # idx is the offset of the current reference position
                    # relative to beg while walking the CIGAR
                    idx = rec.reference_start - beg
                    for ctag, leng in rec.cigar or ():
                        is_linear = 0
                        is_junction = 0
                        if ctag == 0:
                            for i in xrange(leng):
                                if is_intersect(beg, end, beg + idx):
                                    is_linear = 1
                                idx += 1
                        elif ctag == 1:
                            # insertions do not consume reference positions
                            continue
                        elif ctag == 2:
                            idx += leng
                        elif ctag == 3:
                            for i in xrange(leng):
                                if is_intersect(beg, end, beg + idx):
                                    if i < 2 or (leng - i) < 2:
                                        is_junction = 1
                                idx += 1
                        cnt_junc += is_junction
                        cnt_linear += is_linear

                f_out_file.write("%s\t%d\t%d\n" % (line, cnt_junc, cnt_linear))
    finally:
        f_sam.close()
ATCGmap = gzip.open(ATCGmap_fname, 'wb') CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz') CGmap = gzip.open(CGmap_fname, 'wb') # to improve the performance options_RM_CCGG = options.RM_CCGG options_read_no = options.read_no options_RM_SX = options.RM_SX options_RM_OVERLAP = options.RM_OVERLAP wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig') wiggle = open(wiggle_fname, 'w') wiggle.write('type wiggle_0\n') sorted_input = pysam.Samfile(sorted_input_filename, 'rb') chrom = None nucs = ['A', 'T', 'C', 'G', 'N'] ATCG_fwd = dict((n, 0) for n in nucs) ATCG_rev = dict((n, 0) for n in nucs) # Define the context and subcontext exchanging dictionary ContextTable={"CAA":"CHH", "CAC":"CHH", "CAG":"CHG", "CAT":"CHH", "CCA":"CHH", "CCC":"CHH", "CCG":"CHG", "CCT":"CHH", "CGA":"CG", "CGC":"CG", "CGG":"CG", "CGT":"CG", "CTA":"CHH", "CTC":"CHH", "CTG":"CHG", "CTT":"CHH"} # SubContextTable={"CAA":"CA", "CAC":"CA", "CAG":"CA", "CAT":"CA", "CCA":"CC", "CCC":"CC", "CCG":"CC", "CCT":"CC", "CGA":"CG", "CGC":"CG", "CGG":"CG", "CGT":"CG",
def bam2bed(bam_file, bed_file, track_header, interact_file, q_cut):
    '''
    Convert BAM file into standard BED file. The alignment blocks of
    paired-end reads will be merged into a single BED entry (row).
    The BED file is then converted into an Interact file.

    Parameters
    ----------
    bam_file : str
        Name of input BAM file. Every alignment must carry the tags "SR"
        (1: only supported by split read; 2: only supported by read pair;
        3: supported by both) and "FN" (fusion name).
    bed_file : str
        Name of output BED file.
    track_header : bool
        If True, add track header line to the bed file.
    interact_file : str
        Name of the output Interact file.
    q_cut : int
        Mapping quality score cutoff.
        NOTE(review): this value is currently not applied anywhere in this
        function -- confirm whether filtering is expected here.
    '''

    # key is "<fusion>_@_<read_id> <chrom>", value is list of
    # "aligned gapless blocks"
    block_list = {}
    # "SR" tag per key, so every BED row is coloured by its own support
    # type rather than by that of the last record in the BAM file
    read_types = {}
    strandness = collections.defaultdict(list)
    interChrom_IDs = set()

    samfile = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in samfile:
            read_type = aligned_read.get_tag(tag="SR")
            fusion_name = aligned_read.get_tag(tag="FN")
            chrom = aligned_read.reference_name
            s = '-' if aligned_read.is_reverse else '+'

            read_id = aligned_read.query_name
            if read_id.endswith(('/1', '/2')):
                read_id = read_id[:-2]  # remove last 2 chars

            # In the output BED file
            # for intra chrom fusions, pair reads will be merged into a
            # single bed record
            # for inter chrom fusions, pair reads will each has its own
            # separated record
            key = fusion_name + '_@_' + read_id + ' ' + chrom
            strandness[key].append(s)
            read_types[key] = read_type

            # e.g. [(46650521, 46650555), (46650631, 46650645)] for
            # chr1 46650522 66 34M76N14M
            block_list.setdefault(key, []).extend(aligned_read.get_blocks())

            # the mate chromosome is only meaningful for paired reads
            if (aligned_read.is_paired
                    and chrom != aligned_read.next_reference_name):
                interChrom_IDs.add(fusion_name + '_@_' + read_id)
    finally:
        samfile.close()
    print("Done", file=sys.stderr)

    # key as above, value is list of sorted, non-consecutive
    # "aligned gapless blocks"
    sorted_block_list = {}
    for k, v in block_list.items():
        # remove redundancy; sort coordinates (small to large)
        tmp = sorted(set(v), key=lambda tup: tup[0])
        tmp = [list(i) for i in tmp]  # tuple list to list list
        sorted_block_list[k] = list(consec(tmp))  # combine consecutive regions

    #####################################
    # convert sorted block list into BED
    #####################################
    # split read : orange, paired read : blue, both : red, otherwise black
    rgb_by_type = {1: '255,128,0', 2: '0,102,204', 3: '255,0,0'}

    with open(bed_file, 'w') as OUT:
        if track_header:
            print(
                'track name="Supporting reads of Intra-Chrom gene fusion" description="Alignment blocks from the paired reads were combined" visibility=2 itemRgb="On"',
                file=OUT)
        for key, blocks in sorted_block_list.items():
            (name, chrom) = key.split(' ')
            chromStart = blocks[0][0]
            chromEnd = blocks[-1][-1]
            score = 0

            # a strand is only reported when all alignments agree
            strands = list(set(strandness[key]))
            strand = strands[0] if len(strands) == 1 else '.'

            itemRgb = rgb_by_type.get(read_types[key], '0,0,0')

            thickStart = chromStart
            thickEnd = chromEnd
            blockCount = len(blocks)
            blockSizes = ','.join([str(i[1] - i[0]) for i in blocks])
            blockStarts = ','.join([str(i[0] - chromStart) for i in blocks])
            bed_blocks = [
                chrom, chromStart, chromEnd, name, score, strand, thickStart,
                thickEnd, itemRgb, blockCount, blockSizes, blockStarts
            ]
            print("\t".join([str(i) for i in bed_blocks]), file=OUT)

    #####################################
    # convert BED into Interact
    #####################################
    # k is read_id, value is list of block lists, one per BED row; each block
    # is [chrom, start, end, strand]
    InterChrom_list = collections.defaultdict(list)

    with open(interact_file, 'w') as OUT2:
        print(
            'track type=interact name="Supporting reads of gene fusions" description="Connection map of gene fusion reads" maxHeightPixels=200:200:50 visibility=full',
            file=OUT2)

        with open(bed_file, 'r') as BED:
            for line in BED:
                if line.startswith(('#', 'track', 'browser')):
                    continue
                f = line.strip().split()
                chrom = f[0]
                chrom_start = int(f[1])
                name = f[3]
                strand = f[5]
                blockSizes = [int(i) for i in f[10].strip(',').split(',')]
                blockStarts = [
                    chrom_start + int(i) for i in f[11].strip(',').split(',')
                ]
                exon_blocks = [[chrom, base, base + offset, strand]
                               for base, offset in zip(blockStarts, blockSizes)]

                if name in interChrom_IDs:
                    # paired up across chromosomes after the whole BED file
                    # has been read
                    InterChrom_list[name].append(exon_blocks)
                    continue

                # intra-chromosomal: connect each block with its successor
                for indx in range(0, len(exon_blocks) - 1):
                    block1 = exon_blocks[indx]
                    block2 = exon_blocks[indx + 1]
                    chromStart = min(block1[1], block2[1])
                    chromEnd = max(block1[2], block2[2])
                    score = 100
                    value = 100.0
                    exp = 'Intra_Chrom_fusion'
                    color = 'blue'
                    sourceChrom, sourceStart, sourceEnd, sourceStrand = block1
                    sourceName = sourceChrom + ':' + str(
                        sourceStart) + '-' + str(sourceEnd)
                    targetChrom, targetStart, targetEnd, targetStrand = block2
                    targetName = targetChrom + ':' + str(
                        targetStart) + '-' + str(targetEnd)
                    print("\t".join([
                        str(i)
                        for i in (chrom, chromStart, chromEnd, name, score,
                                  value, exp, color, sourceChrom, sourceStart,
                                  sourceEnd, sourceName, sourceStrand,
                                  targetChrom, targetStart, targetEnd,
                                  targetName, targetStrand)
                    ]), file=OUT2)

        for name, v in InterChrom_list.items():
            score = 100
            value = 100.0
            exp = 'Inter_Chrom_fusion'
            color = 'red'
            if len(v) != 2:  # paired reads must have two block_list
                continue
            for block1 in v[0]:
                sourceChrom, sourceStart, sourceEnd, sourceStrand = block1
                sourceName = sourceChrom + ':' + str(sourceStart) + '-' + str(
                    sourceEnd)
                for block2 in v[1]:
                    targetChrom, targetStart, targetEnd, targetStrand = block2
                    targetName = targetChrom + ':' + str(
                        targetStart) + '-' + str(targetEnd)
                    print("\t".join([
                        str(i)
                        for i in (sourceChrom, sourceStart, sourceEnd, name,
                                  score, value, exp, color, targetChrom,
                                  targetStart, targetEnd, targetName,
                                  targetStrand, sourceChrom, sourceStart,
                                  sourceEnd, sourceName, sourceStrand)
                    ]), file=OUT2)
import string
import sys

import pysam

# ------------------------------------
# constants
# ------------------------------------
# ------------------------------------
# Misc functions
# ------------------------------------
# ------------------------------------
# Classes
# ------------------------------------
# ------------------------------------
# Main
# ------------------------------------
if __name__ == "__main__":
    # Print one 6-column BED line (chrom, start, end, name, mapq, strand)
    # to stdout for every mapped alignment of the given BAM/SAM file.
    if len(sys.argv) == 1:
        sys.exit("Example:" + sys.argv[0] + " *.bam/sam >*.bed ")

    # File names ending in "bam" are opened as binary BAM, anything else
    # as text SAM.
    mode = 'rb' if sys.argv[1].endswith("bam") else 'r'
    sam = pysam.Samfile(sys.argv[1], mode)
    try:
        for read in sam:
            if read.is_unmapped:
                continue
            strand = "-" if read.is_reverse else "+"
            # print() with a single pre-formatted argument behaves the same
            # on Python 2 and Python 3.
            print("%s\t%d\t%d\t%s\t%d\t%s" % (
                sam.references[read.tid], read.pos, read.aend,
                read.qname, read.mapq, strand))
    finally:
        # Release the file handle even if a record fails to format.
        sam.close()
action='store', type=str, default=None) parser.add_argument( '--cbed', dest='cbed', help= "Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", metavar='FILE', action='store', type=str, default=None) args = parser.parse_args() if os.path.splitext(args.bam[0])[-1] == '.cram': bamFile = pysam.Samfile(args.bam[0], 'rc') else: bamFile = pysam.Samfile(args.bam[0], 'rb') cbam = None if args.cbam is not None: if os.path.splitext(args.cbam[0])[-1] == '.cram': cbam = pysam.Samfile(args.cbam, 'rc') else: cbam = pysam.Samfile(args.cbam, 'rb') cbed = args.cbed coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats") cstats = None cb = bamFile if cbam is not None: cb = cbam
def filter(cfg, bamFileIn, bamFileOut):
    """Copy read pairs that pass the pairing/alignment checks to a new BAM.

    The input is walked record by record; secondary and supplementary
    records are ignored, and each remaining record is expected to be
    immediately followed (ignoring secondary/supplementary records) by its
    mate. A pair is written to ``bamFileOut`` only when both reads are
    mapped, on the same reference, within 2000 bp of each other, in
    opposite orientation, without an ``SA`` tag, with MAPQ >= 17 and with
    at least 25 reference bases aligned. Dropped pairs are counted per
    reason and the totals are printed at the end.

    Parameters:
        cfg: object whose ``deleteLocalFiles`` attribute decides whether
            the input BAM is removed afterwards (only when ``bamFileIn``
            has no directory component).
        bamFileIn: path of the input BAM file.
        bamFileOut: path of the BAM file to write.

    Raises:
        Exception: if a read is not paired, if the mate is not the next
            primary record (including the case where the file ends right
            after an R1 record), or if R1/R2 flags are mixed up.
    """
    print("consensus filter: starting...")

    # get params
    deleteLocalFiles = cfg.deleteLocalFiles

    # constants for read pair accounting
    NUM_PRIMER_SIDE_NOT_MAPPED = 0
    NUM_RANDOM_SIDE_NOT_MAPPED = 1
    NUM_R1_R2_NOT_AT_SAME_LOCUS = 2
    NUM_R1_R2_SAME_ORIENTATION = 3
    NUM_SPLIT_ALIGNMENT = 4
    NUM_LOW_MAPQ = 5
    NUM_LT_25BP_ALIGNED = 6
    NUM_WRITTEN_OUT = 7
    NUM_METRICS_TOTAL = 8

    readPairCounts = [0] * NUM_METRICS_TOTAL

    # open BAM read alignment files; the finally clauses make sure both
    # handles are released even when one of the sanity checks raises
    bamIn = pysam.Samfile(bamFileIn, "rb")
    try:
        bamOut = pysam.Samfile(bamFileOut, "wb", template=bamIn)
        try:
            # loop over read alignments
            for read in bamIn:

                # this is dangerous, but drop these for now
                if read.is_secondary or read.is_supplementary:
                    continue

                # crash if read is not paired
                if not read.is_paired:
                    print(read.qname)
                    raise Exception("read not paired!")

                # this should be R1
                read1 = read

                # get mate, assuming mate is the next record in the BAM file
                while True:
                    try:
                        read = next(bamIn)
                    except StopIteration:
                        # the file ended before a mate was found; report it
                        # like any other out-of-order mate instead of
                        # leaking a bare StopIteration to the caller
                        print(read1.qname)
                        raise Exception(
                            "read mate is not next in BAM record order!")
                    if not read.is_secondary and not read.is_supplementary:
                        break

                # this should be R2
                read2 = read

                # debug check
                if read1.qname != read2.qname:
                    print((read1.qname, read2.qname))
                    raise Exception(
                        "read mate is not next in BAM record order!")

                # debug check
                if not read1.is_read1 or not read2.is_read2:
                    raise Exception("R1/R2 mixed up!")

                # skip but count unmapped R1 reads, even if R2 mapped.
                # Need to look at these later...
                if read1.is_unmapped:
                    readPairCounts[NUM_PRIMER_SIDE_NOT_MAPPED] += 1
                    continue

                # skip but count unmapped R2 reads
                if read2.is_unmapped:
                    readPairCounts[NUM_RANDOM_SIDE_NOT_MAPPED] += 1
                    continue

                # skip reads not mapped to same chrom
                chrom1 = bamIn.getrname(read1.tid)
                chrom2 = bamIn.getrname(read2.tid)
                if chrom1 != chrom2:
                    readPairCounts[NUM_R1_R2_NOT_AT_SAME_LOCUS] += 1
                    continue

                # skip reads not mapped to same locus; a reverse-strand
                # read is located by its last aligned base, a forward read
                # by its first
                locRead1 = int(read1.aend) - 1 if read1.is_reverse else read1.pos
                locRead2 = int(read2.aend) - 1 if read2.is_reverse else read2.pos
                if abs(locRead1 - locRead2) > 2000:
                    readPairCounts[NUM_R1_R2_NOT_AT_SAME_LOCUS] += 1
                    continue

                # skip pairs with odd alignment orientation
                if read1.is_reverse == read2.is_reverse:
                    readPairCounts[NUM_R1_R2_SAME_ORIENTATION] += 1
                    continue

                # drop read pair if either end has a supplementary split
                # alignment
                if read1.has_tag("SA") or read2.has_tag("SA"):
                    readPairCounts[NUM_SPLIT_ALIGNMENT] += 1
                    continue

                # drop read pair if R1 or R2 read has low mapq
                if read2.mapq < 17 or read1.mapq < 17:
                    readPairCounts[NUM_LOW_MAPQ] += 1
                    continue

                # require some significant alignment to genome
                if read2.aend - read2.pos < 25 or read1.aend - read1.pos < 25:
                    readPairCounts[NUM_LT_25BP_ALIGNED] += 1
                    continue

                # output
                bamOut.write(read1)
                bamOut.write(read2)
                readPairCounts[NUM_WRITTEN_OUT] += 1
        finally:
            bamOut.close()
    finally:
        bamIn.close()

    # delete input BAM file if local (only reached when filtering succeeded)
    if deleteLocalFiles and len(os.path.dirname(bamFileIn)) == 0:
        os.remove(bamFileIn)

    # report drop totals, in the same order as the counters above
    report = (
        (NUM_PRIMER_SIDE_NOT_MAPPED,
         "{} read fragments dropped, primer side read not mapped"),
        (NUM_RANDOM_SIDE_NOT_MAPPED,
         "{} read fragments dropped, random side read not mapped"),
        (NUM_R1_R2_NOT_AT_SAME_LOCUS,
         "{} read fragments dropped, R1 and R2 not mapped to same locus"),
        (NUM_R1_R2_SAME_ORIENTATION,
         "{} read fragments dropped, FF or RR mapping orientation"),
        (NUM_SPLIT_ALIGNMENT,
         "{} read fragments dropped, split alignment"),
        (NUM_LOW_MAPQ,
         "{} read fragments dropped, low mapping quality MAPQ < 17"),
        (NUM_LT_25BP_ALIGNED,
         "{} read fragments dropped, less than 25 bp aligned to genome"),
        (NUM_WRITTEN_OUT,
         "{} read fragments written"),
    )
    for metric, template in report:
        print(template.format(readPairCounts[metric]))
def getNumberOfAlignments(bamfile):
    '''return the ``mapped`` alignment count pysam reports for bamfile.

    The file is opened only to read that counter and is closed again
    before returning.
    NOTE(review): pysam derives ``mapped`` from index statistics, so this
    presumably requires an indexed BAM file - confirm against callers.
    '''
    samfile = pysam.Samfile(bamfile)
    try:
        return samfile.mapped
    finally:
        # do not leak the file handle to the caller's process
        samfile.close()
def main():
    """Command-line entry point: split two name-sorted BAMs by best species.

    Parses the command line, name-sorts both inputs into the intermediate
    directory unless ``--no-sort`` is given, then walks both files in
    lock-step by read name. Reads whose name occurs in only one file are
    written to that species' "disambiguated" BAM; reads present in both are
    handed to ``disambiguate()`` and written to the winning species' BAM or,
    on a tie, to both "ambiguous" BAMs. A one-line summary of the counts is
    written to ``<prefix>_summary.txt`` in the output directory.

    Exits with status 2 when an input file is missing, the aligner is not
    supported, or one of the inputs contains no reads.
    """
    description = """
disambiguate.py disambiguates between two organisms that have alignments
from the same source of fastq files. An example where this might be
useful is as part of an explant RNA/DNA-Seq workflow where an informatics
approach is used to distinguish between human and mouse RNA/DNA reads.

For reads that have aligned to both organisms, the functionality is based on
comparing quality scores from either Tophat of BWA. Read
name is used to collect all alignments for both mates (_1 and _2) and
compared between human and mouse alignments.

For Tophat (default, can be changed using option -a) and Hisat2, the sum of
the tags XO, NM and NH is evaluated and the lowest sum wins the paired end
reads. For equal scores (both mates, both species), the reads are assigned
as ambiguous.

The alternative algorithm (STAR, bwa) disambiguates (for aligned reads) by
tags AS (alignment score, higher better), followed by NM (edit distance,
lower better).

The output directory will contain four files:\n
...disambiguatedSpeciesA.bam: Reads that could be assigned to species A
...disambiguatedSpeciesB.bam: Reads that could be assigned to species B
...ambiguousSpeciesA.bam: Reads aligned to species A that also aligned \n\tto B but could not be uniquely assigned to either
...ambiguousSpeciesB.bam: Reads aligned to species B that also aligned \n\tto A but could not be uniquely assigned to either
..._summary.txt: A summary of unique read names assigned to species A, B \n\tand ambiguous.

Examples:
disambiguate.py test/human.bam test/mouse.bam
disambiguate.py -s mysample1 test/human.bam test/mouse.bam
"""
    parser = ArgumentParser(description=description,
                            formatter_class=RawTextHelpFormatter)
    parser.add_argument('A', help='Input BAM file for species A.')
    parser.add_argument('B', help='Input BAM file for species B.')
    parser.add_argument('-o', '--output-dir', default="disambres",
                        help='Output directory.')
    parser.add_argument('-i', '--intermediate-dir', default="intermfiles",
                        help='Location to store intermediate files')
    parser.add_argument(
        '-d', '--no-sort', action='store_true', default=False,
        help='Disable BAM file sorting. Use this option if the '
        'files have already been name sorted.')
    parser.add_argument(
        '-s', '--prefix', default='',
        help='A prefix (e.g. sample name) to use for the output '
        'BAM files. If not provided, the input BAM file prefix '
        'will be used. Do not include .bam in the prefix.')
    parser.add_argument('-a', '--aligner', default='tophat',
                        choices=('tophat', 'hisat2', 'bwa', 'star'),
                        help='The aligner used to generate these reads. Some '
                        'aligners set different tags.')
    args = parser.parse_args()

    # counters of read names assigned to species A, species B, or ambiguous
    numhum = nummou = numamb = 0

    # parse inputs
    humanfilename = args.A
    mousefilename = args.B
    samplenameprefix = args.prefix
    outputdir = args.output_dir
    intermdir = args.intermediate_dir
    disablesort = args.no_sort
    disambalgo = args.aligner
    supportedalgorithms = set(['tophat', 'hisat2', 'bwa', 'star'])

    # check existence of input BAM files
    if not (file_exists(humanfilename) and file_exists(mousefilename)):
        sys.stderr.write(
            "\nERROR in disambiguate.py: Two existing input BAM files "
            "must be specified as positional arguments\n")
        sys.exit(2)

    if len(samplenameprefix) < 1:
        humanprefix = path.basename(humanfilename.replace(".bam", ""))
        mouseprefix = path.basename(mousefilename.replace(".bam", ""))
    else:
        # stripping a trailing ".bam" is not strictly necessary for this
        # to work
        if samplenameprefix.endswith(".bam"):
            samplenameprefix = samplenameprefix[
                0:samplenameprefix.rfind(".bam")]
        humanprefix = samplenameprefix
        mouseprefix = samplenameprefix
    samplenameprefix = None  # clear variable

    if disambalgo.lower() not in supportedalgorithms:
        print(disambalgo +
              " is not a supported disambiguation scheme at the moment.")
        sys.exit(2)

    if disablesort:
        humanfilenamesorted = humanfilename  # assumed to be sorted externally...
        mousefilenamesorted = mousefilename  # assumed to be sorted externally...
    else:
        if not path.isdir(intermdir):
            makedirs(intermdir)
        humanfilenamesorted = path.join(
            intermdir, humanprefix + ".speciesA.namesorted.bam")
        mousefilenamesorted = path.join(
            intermdir, mouseprefix + ".speciesB.namesorted.bam")
        # an existing name-sorted file from an earlier run is reused as is
        if not path.isfile(humanfilenamesorted):
            pysam.sort("-n", "-m", "2000000000", "-o", humanfilenamesorted,
                       humanfilename)
        if not path.isfile(mousefilenamesorted):
            pysam.sort("-n", "-m", "2000000000", "-o", mousefilenamesorted,
                       mousefilename)

    # open the name-sorted inputs and all outputs
    myHumanFile = pysam.Samfile(humanfilenamesorted, "rb")
    myMouseFile = pysam.Samfile(mousefilenamesorted, "rb")
    if not path.isdir(outputdir):
        makedirs(outputdir)
    myHumanUniqueFile = pysam.Samfile(
        path.join(outputdir, humanprefix + ".disambiguatedSpeciesA.bam"),
        "wb", template=myHumanFile)
    myHumanAmbiguousFile = pysam.Samfile(
        path.join(outputdir, humanprefix + ".ambiguousSpeciesA.bam"),
        "wb", template=myHumanFile)
    myMouseUniqueFile = pysam.Samfile(
        path.join(outputdir, mouseprefix + ".disambiguatedSpeciesB.bam"),
        "wb", template=myMouseFile)
    myMouseAmbiguousFile = pysam.Samfile(
        path.join(outputdir, mouseprefix + ".ambiguousSpeciesB.bam"),
        "wb", template=myMouseFile)
    summaryFile = open(path.join(outputdir, humanprefix + '_summary.txt'), 'w')

    try:
        # initialise; the builtin next() works on both Python 2 and 3,
        # whereas the iterator's .next() method only exists on Python 2
        try:
            nexthumread = next(myHumanFile)
            nextmouread = next(myMouseFile)
        except StopIteration:
            print("No reads in one or either of the input files")
            sys.exit(2)

        EOFmouse = EOFhuman = False
        prevHumID = '-+=RANDOMSTRING=+-'
        prevMouID = '-+=RANDOMSTRING=+-'

        # keep going until BOTH inputs are exhausted
        while not (EOFmouse and EOFhuman):
            while not (nat_cmp(nexthumread.qname, nextmouread.qname) == 0):
                # check order between current human and mouse qname (find a
                # point where they're identical, i.e. in sync)
                while (nat_cmp(nexthumread.qname, nextmouread.qname) > 0
                       and not EOFmouse):
                    # mouse is "behind" human, output to mouse disambiguous
                    myMouseUniqueFile.write(nextmouread)
                    if not nextmouread.qname == prevMouID:
                        nummou += 1  # increment mouse counter for unique only
                    prevMouID = nextmouread.qname
                    try:
                        nextmouread = next(myMouseFile)
                    except StopIteration:
                        EOFmouse = True
                while (nat_cmp(nexthumread.qname, nextmouread.qname) < 0
                       and not EOFhuman):
                    # human is "behind" mouse, output to human disambiguous
                    myHumanUniqueFile.write(nexthumread)
                    if not nexthumread.qname == prevHumID:
                        numhum += 1  # increment human counter for unique only
                    prevHumID = nexthumread.qname
                    try:
                        nexthumread = next(myHumanFile)
                    except StopIteration:
                        EOFhuman = True
                if EOFhuman or EOFmouse:
                    break

            # at this point the read qnames are identical and/or we've
            # reached EOF
            humlist = list()
            moulist = list()
            if nat_cmp(nexthumread.qname, nextmouread.qname) == 0:
                humlist.append(nexthumread)
                # read more reads with same qname (the function modifies
                # humlist directly)
                nexthumread = read_next_reads(myHumanFile, humlist)
                if nexthumread is None:
                    EOFhuman = True
                moulist.append(nextmouread)
                # read more reads with same qname (the function modifies
                # moulist directly)
                nextmouread = read_next_reads(myMouseFile, moulist)
                if nextmouread is None:
                    EOFmouse = True

            # perform comparison to check mouse, human or ambiguous
            if len(moulist) > 0 and len(humlist) > 0:
                myAmbiguousness = disambiguate(humlist, moulist, disambalgo)
                if myAmbiguousness < 0:  # mouse
                    nummou += 1  # increment mouse counter
                    for myRead in moulist:
                        myMouseUniqueFile.write(myRead)
                elif myAmbiguousness > 0:  # human
                    numhum += 1  # increment human counter
                    for myRead in humlist:
                        myHumanUniqueFile.write(myRead)
                else:  # ambiguous
                    numamb += 1  # increment ambiguous counter
                    for myRead in moulist:
                        myMouseAmbiguousFile.write(myRead)
                    for myRead in humlist:
                        myHumanAmbiguousFile.write(myRead)

            if EOFhuman:
                # flush the rest of the mouse reads
                while not EOFmouse:
                    myMouseUniqueFile.write(nextmouread)
                    if not nextmouread.qname == prevMouID:
                        nummou += 1  # increment mouse counter for unique only
                    prevMouID = nextmouread.qname
                    try:
                        nextmouread = next(myMouseFile)
                    except StopIteration:
                        EOFmouse = True

            if EOFmouse:
                # flush the rest of the human reads
                while not EOFhuman:
                    myHumanUniqueFile.write(nexthumread)
                    if not nexthumread.qname == prevHumID:
                        numhum += 1  # increment human counter for unique only
                    prevHumID = nexthumread.qname
                    try:
                        nexthumread = next(myHumanFile)
                    except StopIteration:
                        EOFhuman = True

        summaryFile.write(
            "sample\tunique species A pairs\tunique species B pairs\tambiguous pairs\n"
        )
        summaryFile.write(humanprefix + "\t" + str(numhum) + "\t" +
                          str(nummou) + "\t" + str(numamb) + "\n")
    finally:
        # close every handle, also on the early sys.exit() above or when
        # the merge loop raises
        summaryFile.close()
        myHumanFile.close()
        myMouseFile.close()
        myHumanUniqueFile.close()
        myHumanAmbiguousFile.close()
        myMouseUniqueFile.close()
        myMouseAmbiguousFile.close()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Scans the proper-pair reads of a bwa-aligned BAM file and records, per
    read name, whether a read carries tag ``XT == "U"`` ("unique") and/or
    ``X0 == 1`` ("best"). Depending on the options it then writes a
    tab-separated report of how many read names fall in each category
    (``--output-report``) and/or a BAM file holding the proper-pair reads
    whose name is unique or best (``--outfile``).

    Raises:
        ValueError: if the aligner is not "bwa", or if neither a report
            nor an output file was requested.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])
    parser.add_option("-f", "--filename", dest="filename", type="string",
                      help="bamfile")
    parser.add_option("-a", "--aligner", dest="aligner", type="string",
                      help="aligner used to create the bamfile "
                      "(only bwa is supported)",
                      default="bwa")
    parser.add_option("-r", "--output-report", type="string", dest="report",
                      help="name of the tabular report to write",
                      default="")
    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="name of the filtered bamfile to write",
                      default="")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Check the aligner is supported
    if options.aligner != "bwa":
        raise ValueError(
            "Currently only bwa is supported as aligner specific flags are used"
        )

    # Check that either a report or outfile name has been specified
    if options.report == "" and options.outfile == "":
        raise ValueError("Nothing to do")

    # Analyse the bamfile: collect the names of proper-pair reads that are
    # unique (XT == "U"), best (X0 == 1), or either of the two
    uniq_map, best_map, uORb_map = {}, {}, {}
    properly_paired = 0
    samfile = pysam.Samfile(options.filename, "rb")
    try:
        for read in samfile.fetch():
            if not read.is_proper_pair:
                continue
            tagd = dict(read.tags)
            u, b, key = False, False, read.qname
            if tagd["XT"] == "U":
                u = True
                uniq_map[key] = 1
            if "X0" in tagd:
                if tagd["X0"] == 1:
                    b = True
                    best_map[key] = 1
            if u is True or b is True:
                uORb_map[key] = 1
            properly_paired += 1
    finally:
        samfile.close()

    # every proper pair contributes two reads
    npp = properly_paired / 2
    E.info("No proper pairs: %s" % npp)

    # Write a tabular report if report name given
    if options.report != "":
        E.info("Writing report on no. proper pairs with unique/best reads")

        def _row(x, npp=npp):
            name, d = x
            n = len(d.keys())
            # a file without proper pairs has nothing to take a percentage
            # of; report 0.00 instead of dividing by zero
            pc = float(n) / npp * 100 if npp else 0.0
            line = "%s\t%i\t%.2f" % (name, n, pc)
            return (line)

        header = "\t".join(
            ["pair_criteria", "n_proper_pairs", "percent_proper_pairs"])
        with open(options.report, "w") as report:
            report.write(header + "\n")
            for x in [("unique", uniq_map), ("best", best_map),
                      ("unique_or_best", uORb_map)]:
                report.write(_row(x) + "\n")

    # Create new bam containing uniquely mapping read pairs
    # if outfile specified
    if options.outfile != "":
        E.info("Writing proper pairs with unique or best read to %s" %
               options.outfile)
        samfile = pysam.Samfile(options.filename, "rb")
        try:
            outbam = pysam.Samfile(options.outfile, "wb", template=samfile)
            try:
                for read in samfile.fetch():
                    if read.is_proper_pair:
                        if read.qname in uORb_map:
                            outbam.write(read)
            finally:
                outbam.close()
        finally:
            samfile.close()
#parse fasta input of valid clusters with min_size = 2! cluster_ids = set() total_lines = 0 for cluster in clusters: if cluster.startswith('C'): split_line = cluster.split('\t') if int(split_line[2]) > 1: cluster_ids.add(split_line[1]) else: total_lines += 1 clusters.seek(0) print len(cluster_ids) header = create_SAM_header(clusters) outsam = pysam.Samfile(outsam, 'wh', header=header) n = 0 cluster_list = seq_in.keys() for line in clusters: #clusters contains an ordered list of sequences which should be processed #Forward reads /1 or merged fastq reads always represent crick reads. #These have number below CRICK_MAX if not n % 1000 and n: print "processed %s out of %s lines" % (n, total_lines) # make_ref(cluster_records) if line.startswith('S') or line.startswith('H'): n += 1 if line.split('\t')[1] not in cluster_ids: continue cluster_instance = Cluster_obj(line)
def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30.0, fold=3, addtag=True): ''' Description ----------- Convert genome coordinates (in BAM/SAM format) between assemblies. BAM/SAM format: http://samtools.sourceforge.net/ chrom_size is target chromosome size Parameters ---------- mapping : dict Dictionary with source chrom name as key, IntervalTree object as value. chainfile : file Input chain format file. infile : file Input BAM, SAM or CRAM foramt file. outfile_prefix : str Output prefix. chrom_size : dict Chromosome size of the *target* assembly, used to build bam header. IS_size : int Average insert size of pair-end sequencing. IS_std : float Stanadard deviation of insert size. fold : float A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then fold * stdev from the mean. addtag : bool if addtag is set to True, will add tags to each alignmnet: Q = QC (QC failed) N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly) M = multiple mapped (alignment can be liftover to multiple places) U = unique mapped (alignment can be liftover to only 1 place) tags for pair-end sequencing include: QF: QC failed NN: both read1 and read2 unmapped NU: read1 unmapped, read2 unique mapped NM: read1 unmapped, multiple mapped UN: read1 uniquely mapped, read2 unmap UU: both read1 and read2 uniquely mapped UM: read1 uniquely mapped, read2 multiple mapped MN: read1 multiple mapped, read2 unmapped MU: read1 multiple mapped, read2 unique mapped MM: both read1 and read2 multiple mapped tags for single-end sequencing include: QF: QC failed SN: unmaped SM: multiple mapped SU: uniquely mapped ''' # determine the input file format (BAM, CRAM or SAM) file_type = '' if infile.lower().endswith('.bam'): file_type = 'BAM' comments = ['ORIGINAL_BAM_FILE=' + infile] samfile = pysam.Samfile(infile, 'rb') if len(samfile.header) == 0: print("BAM 
file has no header section. Exit!", file=sys.stderr) sys.exit(1) elif infile.lower().endswith('.cram'): file_type = 'CRAM' comments = ['ORIGINAL_CRAM_FILE=' + infile] samfile = pysam.Samfile(infile, 'rc') if len(samfile.header) == 0: print("CRAM file has no header section. Exit!", file=sys.stderr) sys.exit(1) elif infile.lower().endswith('.sam'): file_type = 'SAM' comments = ['ORIGINAL_SAM_FILE=' + infile] samfile = pysam.Samfile(infile, 'r') if len(samfile.header) == 0: print("SAM file has no header section. Exit!", file=sys.stderr) sys.exit(1) else: print( "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.", file=sys.stderr) sys.exit(1) comments.append('CHAIN_FILE=' + chainfile) sam_ori_header = samfile.header.to_dict() # chromosome ID style of the original BAM file chrom_style = sam_ori_header['SQ'][0]['SN'] # either 'chr1' or '1' # update chrom_size of target genome target_chrom_sizes = {} for n, l in chrom_size.items(): target_chrom_sizes[update_chromID(chrom_style, n)] = l (new_header, name_to_id) = sam_header.bam_header_generator( orig_header=sam_ori_header, chrom_size=target_chrom_sizes, prog_name="CrossMap", prog_ver=__version__, format_ver=1.0, sort_type='coordinate', co=comments) # write to file if outfile_prefix is not None: if file_type == 'BAM': OUT_FILE = pysam.Samfile(outfile_prefix + '.bam', "wb", header=new_header) logging.info("Liftover BAM file \"%s\" to \"%s\"" % (infile, outfile_prefix + '.bam')) elif file_type == 'CRAM': OUT_FILE = pysam.Samfile(outfile_prefix + '.bam', "wb", header=new_header) logging.info("Liftover CRAM file \"%s\" to \"%s\"" % (infile, outfile_prefix + '.bam')) elif file_type == 'SAM': OUT_FILE = pysam.Samfile(outfile_prefix + '.sam', "wh", header=new_header) logging.info("Liftover SAM file \"%s\" to \"%s\"" % (infile, outfile_prefix + '.sam')) else: logging.error( "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'." 
) sys.exit(1) # write to screen else: if file_type == 'BAM': OUT_FILE = pysam.Samfile('-', "wb", header=new_header) logging.info("Liftover BAM file: %s" % infile) elif file_type == 'CRAM': OUT_FILE = pysam.Samfile('-', "wb", header=new_header) logging.info("Liftover CRAM file: %s" % infile) elif file_type == 'SAM': OUT_FILE = pysam.Samfile('-', "w", header=new_header) logging.info("Liftover SAM file: %s" % infile) else: logging.error( "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'." ) sys.exit(1) QF = 0 NN = 0 NU = 0 NM = 0 UN = 0 UU = 0 UM = 0 MN = 0 MU = 0 MM = 0 SN = 0 SM = 0 SU = 0 total_item = 0 try: while (1): total_item += 1 old_alignment = next(samfile) new_alignment = pysam.AlignedRead() # create AlignedRead object new_alignment.query_name = old_alignment.query_name # 1st column. read name. new_alignment.query_sequence = old_alignment.query_sequence # 10th column. read sequence. all bases. new_alignment.query_qualities = old_alignment.query_qualities # 11th column. read sequence quality. all bases. new_alignment.set_tags(old_alignment.get_tags()) # 12 - columns # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution. 
try: rg, rgt = old_alignment.get_tag("RG", with_value_type=True) except KeyError: pass else: new_alignment.set_tag("RG", str(rg), rgt) ## Pair-end sequencing if old_alignment.is_paired: new_alignment.flag = 0x1 #pair-end in sequencing if old_alignment.is_read1: new_alignment.flag = new_alignment.flag | 0x40 elif old_alignment.is_read2: new_alignment.flag = new_alignment.flag | 0x80 if old_alignment.is_qcfail: new_alignment.flag = new_alignment.flag | 0x200 new_alignment.reference_id = -1 #3 new_alignment.reference_start = 0 #4 new_alignment.mapping_quality = 255 #5 new_alignment.cigartuples = old_alignment.cigartuples #6 new_alignment.next_reference_id = -1 #7 new_alignment.next_reference_start = 0 #8 new_alignment.template_length = 0 #9 QF += 1 if addtag: new_alignment.set_tag(tag="QF", value=0) OUT_FILE.write(new_alignment) continue #================================== # R1 originally unmapped #================================== elif old_alignment.is_unmapped: new_alignment.flag = new_alignment.flag | 0x4 #2 new_alignment.reference_id = -1 #3 new_alignment.reference_start = 0 #4 new_alignment.mapping_quality = 255 #5 new_alignment.cigartuples = old_alignment.cigartuples #6 # R1 & R2 originally unmapped if old_alignment.mate_is_unmapped: new_alignment.next_reference_id = -1 #7 new_alignment.next_reference_start = 0 #8 new_alignment.template_length = 0 #9 NN += 1 if addtag: new_alignment.set_tag(tag="NN", value=0) OUT_FILE.write(new_alignment) continue # R1 unmap, R2 is mapped else: try: read2_chr = samfile.get_reference_name( old_alignment.next_reference_id) read2_strand = '-' if old_alignment.mate_is_reverse else '+' read2_start = old_alignment.next_reference_start read2_end = read2_start + 1 read2_maps = map_coordinates( mapping, read2_chr, read2_start, read2_end, read2_strand) except: read2_maps = None #------------------------------------ # R1 unmapped, R2 failed to liftover #------------------------------------ if read2_maps is None: 
new_alignment.next_reference_id = -1 #7 new_alignment.next_reference_start = 0 #8 new_alignment.template_length = 0 #9 NN += 1 if addtag: new_alignment.set_tag(tag="NN", value=0) OUT_FILE.write(new_alignment) continue #------------------------------------ # R1 unmapped, R2 unique #------------------------------------ elif len(read2_maps) == 2: # 2-9 if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 new_alignment.reference_id = name_to_id[ read2_maps[1] [0]] #recommend to set the RNAME of unmapped read to its mate's new_alignment.reference_start = read2_maps[1][ 1] #recommend to set the POS of unmapped read to its mate's new_alignment.mapping_quality = old_alignment.mapping_quality new_alignment.cigartuples = old_alignment.cigartuples new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = 0 NU += 1 if addtag: new_alignment.set_tag(tag="NU", value=0) OUT_FILE.write(new_alignment) continue #------------------------------------ # R1 unmapped, R2 multiple #------------------------------------ else: if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 # 2-9 new_alignment.flag = new_alignment.flag | 0x100 new_alignment.reference_id = name_to_id[ read2_maps[1][0]] new_alignment.reference_start = read2_maps[1][1] new_alignment.mapping_quality = old_alignment.mapping_quality new_alignment.cigartuples = old_alignment.cigartuples new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = 0 NM += 1 if addtag: new_alignment.set_tag(tag="NM", value=0) OUT_FILE.write(new_alignment) continue #================================== # R1 is originally mapped #================================== else: try: read1_chr = samfile.get_reference_name( old_alignment.reference_id) read1_strand = '-' if old_alignment.is_reverse else '+' read1_start = 
old_alignment.reference_start read1_end = old_alignment.reference_end read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand) except: read1_maps = None if not old_alignment.mate_is_unmapped: try: read2_chr = samfile.get_reference_name( old_alignment.next_reference_id) read2_strand = '-' if old_alignment.mate_is_reverse else '+' read2_start = old_alignment.next_reference_start read2_end = read2_start + 1 read2_maps = map_coordinates( mapping, read2_chr, read2_start, read2_end, read2_strand) except: read2_maps = None #------------------------------------ # R1 failed to liftover #------------------------------------ if read1_maps is None: # read2 is unmapped or failed to convertion if old_alignment.mate_is_unmapped or (read2_maps is None): # col2 - col9 new_alignment.flag = new_alignment.flag | 0x4 #2 new_alignment.reference_id = -1 #3 new_alignment.reference_start = 0 #4 new_alignment.mapping_quality = 255 #5 new_alignment.cigartuples = old_alignment.cigartuples #6 new_alignment.next_reference_id = -1 #7 new_alignment.next_reference_start = 0 #8 new_alignment.template_length = 0 #9 if addtag: new_alignment.set_tag(tag="NN", value=0) NN += 1 OUT_FILE.write(new_alignment) continue # read2 is unique mapped elif len(read2_maps) == 2: # col2 - col9 if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 new_alignment.reference_id = name_to_id[ read2_maps[1] [0]] #recommend to set the RNAME of unmapped read to its mate's new_alignment.reference_start = read2_maps[1][ 1] #recommend to set the POS of unmapped read to its mate's new_alignment.mapping_quality = old_alignment.mapping_quality new_alignment.cigartuples = old_alignment.cigartuples new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] new_alignment.next_reference_start = read2_maps[1][ 1] #start new_alignment.template_length = 0 NU += 1 if addtag: new_alignment.set_tag(tag="NU", value=0) OUT_FILE.write(new_alignment) continue # read2 is multiple mapped 
else: # col2 - col9 if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 new_alignment.flag = new_alignment.flag | 0x100 new_alignment.reference_id = name_to_id[ read2_maps[1][0]] new_alignment.reference_start = read2_maps[1][1] new_alignment.mapping_quality = 255 # mapq not available new_alignment.cigartuples = old_alignment.cigartuples new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] new_alignment.next_reference_start = read2_maps[1][ 1] #start new_alignment.template_length = 0 NM += 1 if addtag: new_alignment.set_tag(tag="NM", value=0) OUT_FILE.write(new_alignment) continue #------------------------------------ # R1 uniquely mapped #------------------------------------ elif len(read1_maps) == 2: # col2 - col5 if read1_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x10 new_alignment.reference_id = name_to_id[read1_maps[1] [0]] new_alignment.reference_start = read1_maps[1][1] new_alignment.mapping_quality = old_alignment.mapping_quality if read1_maps[0][3] != read1_maps[1][ 3]: # opposite strand # 6 new_alignment.cigartuples = old_alignment.cigartuples[:: -1] #reverse cigar tuple # 10 new_alignment.query_sequence = revcomp_DNA( old_alignment.query_sequence ) #reverse complement read sequence # 11 new_alignment.query_qualities = old_alignment.query_qualities[:: -1] #reverse quality string elif read1_maps[0][3] == read1_maps[1][ 3]: # same strand # 6 new_alignment.cigartuples = old_alignment.cigartuples # R2 unmapped before or after conversion if (old_alignment.mate_is_unmapped) or (read2_maps is None): #2,7-9 new_alignment.flag = new_alignment.flag | 0x8 new_alignment.next_reference_id = name_to_id[ read1_maps[1][0]] new_alignment.next_reference_start = read1_maps[1][ 1] new_alignment.template_length = 0 UN += 1 if addtag: new_alignment.set_tag(tag="UN", value=0) OUT_FILE.write(new_alignment) continue # R2 is unique mapped elif len(read2_maps) == 2: # 2,7-9 if read2_maps[1][3] == '-': new_alignment.flag = 
new_alignment.flag | 0x20 new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] #chrom new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = abs( new_alignment.reference_start - new_alignment.next_reference_start ) + old_alignment.reference_length # 2 if (read2_maps[1][3] != read1_maps[1][3]) and ( new_alignment.template_length <= IS_size + fold * IS_std) and ( new_alignment.template_length >= IS_size - fold * IS_std): new_alignment.flag = new_alignment.flag | 0x2 UU += 1 if addtag: new_alignment.set_tag(tag="UU", value=0) OUT_FILE.write(new_alignment) continue # R2 is multiple mapped else: # 2 (strand) if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 # 2 (secondary alignment) new_alignment.flag = new_alignment.flag | 0x100 #7-9 new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = 0 UM += 1 if addtag: new_alignment.set_tag(tag="UM", value=0) OUT_FILE.write(new_alignment) continue #------------------------------------ # R1 multiple mapped #----------------------------------- elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0: # 2 new_alignment.flag = new_alignment.flag | 0x100 if read1_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x10 # 3-5 new_alignment.tid = name_to_id[read1_maps[1] [0]] #chrom new_alignment.pos = read1_maps[1][1] #start new_alignment.mapq = 255 if read1_maps[0][3] != read1_maps[1][ 3]: # opposite strand # 6 new_alignment.cigartuples = old_alignment.cigartuples[:: -1] #reverse cigar tuple # 10 new_alignment.query_sequence = revcomp_DNA( old_alignment.query_sequence ) #reverse complement read sequence # 11 new_alignment.query_qualities = old_alignment.query_qualities[:: -1] #reverse quality string elif read1_maps[0][3] == read1_maps[1][ 3]: # same strand # 6 new_alignment.cigartuples = old_alignment.cigartuples # (1) R2 is unmapped if 
(old_alignment.mate_is_unmapped) or (read2_maps is None): #2,7-9 new_alignment.flag = new_alignment.flag | 0x8 new_alignment.next_reference_id = name_to_id[ read1_maps[1][0]] new_alignment.next_reference_start = read1_maps[1][ 1] new_alignment.template_length = 0 MN += 1 if addtag: new_alignment.set_tag(tag="MN", value=0) OUT_FILE.write(new_alignment) continue # (2) read2 is unique mapped elif len(read2_maps) == 2: # 2,7-9 if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] #chrom new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = 0 MU += 1 if addtag: new_alignment.set_tag(tag="MU", value=0) OUT_FILE.write(new_alignment) continue # (3) R2 is multiple mapped else: # 2,7-9 if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 # 2 (secondary alignment) new_alignment.flag = new_alignment.flag | 0x100 new_alignment.next_reference_id = name_to_id[ read2_maps[1][0]] #chrom new_alignment.next_reference_start = read2_maps[1][ 1] new_alignment.template_length = 0 MM += 1 if addtag: new_alignment.set_tag(tag="MM", value=0) OUT_FILE.write(new_alignment) continue # Singel end sequencing else: # 7-9 new_alignment.next_reference_id = -1 new_alignment.next_reference_start = 0 new_alignment.template_length = 0 # (1) originally unmapped if old_alignment.is_unmapped: # 2-6 new_alignment.flag = new_alignment.flag | 0x4 new_alignment.reference_id = -1 new_alignment.reference_start = 0 new_alignment.mapping_quality = 255 new_alignment.cigartuples = old_alignment.cigartuples SN += 1 if addtag: new_alignment.set_tag(tag="SN", value=0) OUT_FILE.write(new_alignment) continue else: new_alignment.flag = 0x0 read_chr = samfile.get_reference_name( old_alignment.reference_id) read_strand = '-' if old_alignment.is_reverse else '+' read_start = old_alignment.reference_start read_end = old_alignment.reference_end read_maps = map_coordinates(mapping, 
read_chr, read_start, read_end, read_strand) # (2) unmapped afte liftover if read_maps is None: new_alignment.flag = new_alignment.flag | 0x4 new_alignment.reference_id = -1 new_alignment.reference_start = 0 new_alignment.mapping_quality = 255 SN += 1 if addtag: new_alignment.set_tag(tag="SN", value=0) OUT_FILE.write(new_alignment) continue # (3) unique mapped if len(read_maps) == 2: if read_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x10 if read_maps[0][3] != read_maps[1][3]: # 6 new_alignment.cigartuples = old_alignment.cigartuples[:: -1] #reverse cigar tuple # 10 new_alignment.query_sequence = revcomp_DNA( old_alignment.query_sequence ) #reverse complement read sequence # 11 try: new_alignment.query_qualities = old_alignment.query_qualities[:: -1] #reverse quality string except: new_alignment.query_qualities = [] else: # 6 new_alignment.cigartuples = old_alignment.cigartuples # 3-5 new_alignment.reference_id = name_to_id[read_maps[1] [0]] new_alignment.reference_start = read_maps[1][1] new_alignment.mapping_quality = old_alignment.mapping_quality SU += 1 if addtag: new_alignment.set_tag(tag="SU", value=0) OUT_FILE.write(new_alignment) continue # (4) multiple mapped if len(read_maps) > 2 and len(read_maps) % 2 == 0: new_alignment.flag = new_alignment.flag | 0x100 if read_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x10 if read_maps[0][3] != read_maps[1][3]: # 6 new_alignment.cigartuples = old_alignment.cigartuples[:: -1] #reverse cigar tuple # 10 new_alignment.query_sequence = revcomp_DNA( old_alignment.query_sequence ) #reverse complement read sequence # 11 new_alignment.query_qualities = old_alignment.query_qualities[:: -1] #reverse quality string else: # 6 new_alignment.cigartuples = old_alignment.cigartuples # 3-5 new_alignment.tid = name_to_id[read_maps[1][0]] new_alignment.pos = read_maps[1][1] new_alignment.mapq = old_alignment.mapq SM += 1 if addtag: new_alignment.set_tag(tag="SM", value=0) OUT_FILE.write(new_alignment) 
continue except StopIteration: logging.info("Done!") OUT_FILE.close() if outfile_prefix is not None: if file_type == "BAM" or file_type == "CRAM": try: logging.info( 'Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')) pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam') except: logging.warning("output BAM file was NOT sorted") try: logging.info('Index "%s" ...' % (outfile_prefix + '.sorted.bam')) pysam.index(outfile_prefix + '.sorted.bam', outfile_prefix + '.sorted.bam.bai') except: logging.warning("output BAM file was NOT indexed.") print("\nTotal alignments:" + str(total_item - 1)) print("\tQC failed: " + str(QF)) if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0: print("\tPaired-end reads:") print("\t\tR1 unique, R2 unique (UU): " + str(UU)) print("\t\tR1 unique, R2 unmapp (UN): " + str(UN)) print("\t\tR1 unique, R2 multiple (UM): " + str(UM)) print("\t\tR1 multiple, R2 multiple (MM): " + str(MM)) print("\t\tR1 multiple, R2 unique (MU): " + str(MU)) print("\t\tR1 multiple, R2 unmapped (MN): " + str(MN)) print("\t\tR1 unmap, R2 unmap (NN): " + str(NN)) print("\t\tR1 unmap, R2 unique (NU): " + str(NU)) print("\t\tR1 unmap, R2 multiple (NM): " + str(NM)) if max(SN, SU, SM) > 0: print("\tSingle-end reads:") print("\t\tUniquley mapped (SU): " + str(SU)) print("\t\tMultiple mapped (SM): " + str(SM)) print("\t\tUnmapped (SN): " + str(SN))
import sys
import traceback
import os

if __name__ == "__main__":
    # Dump every record of one or more (unmapped) BAM files into a single
    # FASTQ file.  Failures are reported with a traceback but never abort
    # the run, so one bad input does not prevent the others from being
    # converted.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", dest="verbose", action="store_true")
    parser.add_argument("unmapped_bam", nargs="+")
    parser.add_argument("fastq", help="e.g. test.fastq")
    args = parser.parse_args()

    try:
        with open(args.fastq, "w") as fastq_file:
            for bam_file in args.unmapped_bam:
                if not os.path.exists(bam_file):
                    # Missing inputs are skipped (best effort); only say so
                    # when the user asked for verbose output.
                    if args.verbose:
                        sys.stderr.write("skipping missing file: %s\n" % bam_file)
                    continue
                try:
                    # check_header/check_sq are disabled so that the file
                    # opens even without a usable header / @SQ lines.
                    samfile = pysam.Samfile(bam_file,
                                            mode="rb",
                                            check_header=False,
                                            check_sq=False)
                    try:
                        # until_eof=True reads sequentially, so no index is
                        # required.
                        for x in samfile.fetch(until_eof=True):
                            fastq_file.write("@%s\n%s\n+\n%s\n" %
                                             (x.qname, x.seq, x.qual))
                    finally:
                        # Close the handle even if iteration or writing
                        # fails part-way through.
                        samfile.close()
                except Exception:
                    traceback.print_exc()
    except Exception:
        traceback.print_exc()
if (args.filter is not None): with open(args.filter) as filterFile: filt = {l.strip(): True for l in filterFile} printed = {} for regionStr in args.region: region = Tools.FormatRegion(regionStr) if (region is None): print "malformatted region " + ' '.join(regionStr) sys.exit(1) region = (region[0], region[1] - args.slop, region[2] + args.slop) samFile = pysam.Samfile(args.bam) nAlns = samFile.count(region[0], region[1], region[2]) if (nAlns <= args.max or args.subsample): if (nAlns > args.max and args.subsample): lengths = [] index = 0 tmpPrinted = {} for aln in samFile.fetch(region[0], region[1], region[2]): if (aln.mapq < args.minqv): continue if (args.primary and aln.flag & 256 != 0): continue if (aln.qname in tmpPrinted): continue tmpPrinted[aln.qname] = True
def add_tag_to_bam(tmp_bam, out_bam, tag_map):
    """Copy ``tmp_bam`` to ``out_bam``, setting a ``BX`` tag on every record.

    Args:
        tmp_bam: path of the input alignment file.
        out_bam: path of the BAM file to write; its header is taken from
            the input file.
        tag_map: mapping from query name (``rec.qname``) to the value that
            is stored in the record's ``BX`` tag.

    Raises:
        KeyError: if a record's query name is not present in ``tag_map``.
    """
    sam_handle = pysam.Samfile(tmp_bam)
    try:
        out_handle = pysam.Samfile(out_bam, 'wb', template=sam_handle)
        try:
            for rec in sam_handle:
                rec.set_tag('BX', tag_map[rec.qname])
                out_handle.write(rec)
        finally:
            # Closing explicitly flushes buffered records; without it the
            # output can be left truncated.
            out_handle.close()
    finally:
        sam_handle.close()
Date : March 18, 2016 Author : Heather Landry Remove reads in bam file resulting from PCR duplicates. This script records all barcodes and coordinates at a specific position. For every bam line, if the barcode and coordinate has not been seen previously, it will print; if the barcode and position has been seen previously, it will not print to a new file. use : python removePCRdupsFromBAM.py iBAM (input BAM file with only unique alignments) [1] oBAM (output BAM file containing only non duplicated reads) [2] """ import sys, pysam, os, numpy, re iBAM = pysam.Samfile(sys.argv[1], 'rb') oBAM = pysam.Samfile(sys.argv[2], 'wb', template=iBAM) MB = set() # read through starting bam file for read in iBAM: mb = read.qname.split('_MolecularBarcode:')[1] chrom = iBAM.getrname(read.tid) # selecting the 3' position for pos strand if read.is_reverse: start = read.aend std='pos' # selecting the 3' position for neg strand
def do_local_assembly(self, root_ctg, asmrootdir_path):
    """Run the local assembly seeded at ``root_ctg`` and merge its contigs.

    The seed is first checked for sufficient read and barcode support.  If
    the check fails, an empty ``local-asm-merged.fa`` is touched in
    ``asmrootdir_path`` and the method returns early.  Otherwise local
    assembly candidates are generated and assembled, and every resulting
    contig of at least 2000 bases is written to ``local-asm-merged.fa``.
    A ``pass`` marker file is touched on completion.

    Args:
        root_ctg: name of the seed contig.
        asmrootdir_path: directory receiving the merged FASTA and the
            ``pass`` marker.

    Returns:
        None.
    """
    self.logger.log(
        'assembling barcoded reads for seed {}'.format(root_ctg))

    ctg_size_map = util.get_fasta_sizes(self.options.ctgfasta_path)

    # check enough read/barcode coverage to warrant denovo assembly
    numreads = 0
    bcodes = set()
    fhandle = pysam.Samfile(self.options.reads_ctg_bam_path, 'rb')
    try:
        for read in fhandle.fetch(root_ctg):
            # only confidently mapped reads count towards coverage
            if read.is_unmapped or read.mapq < 10:
                continue
            numreads += 1
            bcode = util.get_barcode(read)
            if bcode is not None:
                bcodes.add(bcode)
    finally:
        fhandle.close()

    size = ctg_size_map[root_ctg]
    # NOTE(review): 95 is presumably the (trimmed) read length in bases,
    # making ``cov`` an approximate fold coverage -- confirm.
    cov = 95. * numreads / size
    if cov < 10. or len(bcodes) < 30.:
        self.logger.log(
            'seed {} contig does not have high enough coverage'.format(
                root_ctg))
        self.logger.log('  - {} bcodes, {}x'.format(len(bcodes), cov))
        # leave an empty merged file behind for the skipped seed
        out_path = os.path.join(asmrootdir_path, 'local-asm-merged.fa')
        util.touch(out_path)
        return

    # create a local assembler for this root contig
    asm = LocalAssembler(
        root_ctg,
        self.options.ctgfasta_path,
        self.options.reads_ctg_bam_path,
        self.options.input_fqs,
        asmrootdir_path,
        self.logger,
    )
    self.logger.log('determing local assemblies')
    local_asms = asm.gen_local_cands()
    self.logger.log('  - found {} candidates'.format(len(local_asms)))

    # do not locally assemble with other seeds that are lexicographically
    # smaller than the root.  these will get run in other bins
    # (a generator expression replaces ``filter(lambda (c): ...)``, whose
    # tuple-parameter syntax is a SyntaxError on Python 3)
    seed_ctgs = set(
        c for c in ctg_size_map
        if c > root_ctg and ctg_size_map[c] >= MIN_SEED_SIZE)

    self.logger.log('performing local assemblies')
    local_asm_results = asm.assemble(local_asms, filt_ctgs=seed_ctgs)
    self.logger.log('  - finished {}'.format(len(local_asms)))

    # merge output contigs from local assemblies
    self.logger.log('merge long output contigs from local assemblies')
    mergedasm_path = os.path.join(asmrootdir_path, 'local-asm-merged.fa')
    total_asm_contigs = 0
    total_asm_bp = 0
    with open(mergedasm_path, 'w') as fout:
        for i, (local_asm, contig_path) in enumerate(local_asm_results):
            if contig_path is None:
                self.logger.log(
                    'contig path for local asm {} not generated'.format(
                        str(local_asm)))
                continue
            fasta = pysam.FastaFile(contig_path)
            try:
                # longest contigs first, so the first short contig ends
                # the scan of this assembly
                for contig in sorted(
                        fasta.references,
                        key=fasta.get_reference_length,
                        reverse=True,
                ):
                    seq = str(fasta.fetch(contig).upper())
                    if len(seq) < 2000:
                        break
                    total_asm_contigs += 1
                    total_asm_bp += len(seq)
                    link_name = local_asm.link_ctg
                    if link_name is None:
                        link_name = 'seed'
                    fout.write('>{}.{}${}.{}\n'.format(
                        local_asm.root_ctg, link_name, contig, i))
                    fout.write(str(seq) + '\n')
            finally:
                # one handle per local assembly; release it promptly
                fasta.close()

    self.logger.log('  - {} contigs covering {} bases'.format(
        total_asm_contigs, total_asm_bp))
    pass_path = os.path.join(asmrootdir_path, 'pass')
    util.touch(pass_path)
def processReads(samfile_path, exonTrees, repeatTrees, chimericBedFile):
    """Classify read pairs by their overlap with exons and repeats.

    Only read-1 records are considered, and only if the read is paired, the
    mate is on the same reference, the mapping quality is above zero and the
    reference is chromosome 1-22 or X (after removing "chr" from the name).

    Each read of a pair is typed "D" (overlaps both an exon and a repeat),
    "E" (exon only), "R" (repeat only) or "." (neither); the two letters
    are sorted and joined into the pair type (e.g. "ER").

    Args:
        samfile_path: path to an indexed BAM file.
        exonTrees: mapping from chromosome name to an object whose
            ``findRange([start, end])`` returns the overlapping exons.
        repeatTrees: same as ``exonTrees`` but for repeats.
        chimericBedFile: writable file receiving one BED12 line per pair
            with one mate on an exon and the other on a repeat, or ``0`` to
            disable that output.

    Returns:
        dict mapping ``(exon, repeat)`` pairs to the list of pair types
        observed for them.

    Raises:
        KeyError: if an accepted chromosome is missing from ``exonTrees``
            or ``repeatTrees``.
    """
    print("Processing all reads", file=sys.stderr)
    sys.stderr.flush()

    # Exact names of the accepted chromosomes, built once.  The previous
    # test (`chr not in str(valid)`) was a substring match against the
    # repr of a list, so names such as "" or "0" slipped through.
    # chrY is deliberately not accepted.
    valid_chroms = set(str(n) for n in range(1, 23))
    valid_chroms.add('X')

    localResults = {}

    # Input Bam File
    samfile = pysam.Samfile(samfile_path, "rb")
    try:
        for read in samfile.fetch():
            # Keep read 1 of pairs mapped to the same reference with a
            # non-zero mapping quality (i.e. not multi-mapping).
            # [ add check to see read pair map quality as well ?]
            if not (read.is_read1 and read.is_paired
                    and read.tid == read.rnext and int(read.mapq) > 0):
                continue

            # 'chr3' --> '3'
            chrom = samfile.getrname(read.tid).replace("chr", "")
            if chrom not in valid_chroms:
                continue

            # Start coordinates of both mates
            start1 = read.pos
            start2 = read.mpos

            # End coordinates; the mate is assumed to have the same length
            # as this read, and splicing is ignored.
            end1 = start1 + read.rlen
            end2 = start2 + read.rlen

            # Exons / repeats intersecting each mate.  This raises KeyError
            # when a chromosome has no entry in the trees.
            exon_results1 = exonTrees[chrom].findRange([start1, end1])
            exon_results2 = exonTrees[chrom].findRange([start2, end2])
            repeat_results1 = repeatTrees[chrom].findRange([start1, end1])
            repeat_results2 = repeatTrees[chrom].findRange([start2, end2])

            e1 = (len(exon_results1) > 0)
            e2 = (len(exon_results2) > 0)
            r1 = (len(repeat_results1) > 0)
            r2 = (len(repeat_results2) > 0)

            # Classify read1 as D/E/R/.
            if e1 and r1:
                type1 = "D"
            elif e1:
                type1 = "E"
            elif r1:
                type1 = "R"
            else:
                type1 = "."

            # Classify read2 as D/E/R/.
            if e2 and r2:
                type2 = "D"
            elif e2:
                type2 = "E"
            elif r2:
                type2 = "R"
            else:
                type2 = "."

            # Sort (so "RE" becomes "ER")
            pair_type = "".join(sorted(type1 + type2))

            # Chimeric pair: one mate on an exon, the other on a repeat
            if chimericBedFile != 0 and ((e1 and r2) or (r1 and e2)):
                feature_start = min(start1, start2)
                feature_end = max(end1, end2)
                gap = abs(start1 - start2)
                bed_fields = [
                    str(chrom),
                    str(feature_start),
                    str(feature_end),
                    "chimericread",
                    "960",
                    ".",
                    str(feature_start),
                    str(feature_end),
                    "0,0,250",
                    "2",
                    str(read.rlen) + "," + str(read.rlen),
                    "0," + str(gap),
                ]
                chimericBedFile.write("\t".join(bed_fields) + "\n")

            # (exon, repeat) pairs across the two mates; zip truncates to
            # the shorter of the two result lists.
            pairs = (list(zip(exon_results1, repeat_results2)) +
                     list(zip(exon_results2, repeat_results1)))

            # Record the pair type under every (exon, repeat) key
            for p in pairs:
                localResults.setdefault(p, []).append(pair_type)
    finally:
        samfile.close()

    return localResults
def main():
    """Command-line entry point: compute per-transcript TIN scores.

    For every input BAM file two files are written to the current
    directory, named after the BAM file: ``*summary.txt`` (mean, median and
    standard deviation of the TIN scores) and ``*tin.xls`` (one row per
    transcript).  Transcripts that fail the minimum-coverage check are
    reported with a TIN of 0.0 and do not enter the summary statistics.

    This is Python 2 code (``print >>`` statements).
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input",
        action="store",
        type="string",
        dest="input_files",
        help=
        'Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]'
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="ref_gene_model",
        help=
        "Reference gene model in BED format. Must be strandard 12-column BED file. [required]"
    )
    parser.add_option(
        "-c",
        "--minCov",
        action="store",
        type="int",
        dest="minimum_coverage",
        default=10,
        help="Minimum number of read mapped to a transcript. default=%default")
    parser.add_option(
        "-n",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=100,
        help=
        "Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default"
    )
    parser.add_option(
        "-s",
        "--subtract-background",
        action="store_true",
        dest="subtract_bg",
        help=
        "Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads."
    )
    (options, args) = parser.parse_args()

    if options.sample_size < 0:
        print >> sys.stderr, "Number of nucleotide can't be negative"
        sys.exit(0)
    elif options.sample_size > 1000:
        print >> sys.stderr, "Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow."

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        print >> sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
        parser.print_help()
        sys.exit(0)

    # if '-s' was set.  The gene model is only read here, after it has been
    # validated, so that a missing or wrong '-r' yields the help text
    # instead of a crash inside union_exons().
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    bamfiles = sorted(getBamFiles.get_bam_files(options.input_files))

    if len(bamfiles) <= 0:
        print >> sys.stderr, "No BAM file found, exit."
        sys.exit(0)
    else:
        print >> sys.stderr, "Total %d BAM file(s):" % len(bamfiles)
        for f in bamfiles:
            print >> sys.stderr, "\t" + f

    for f in bamfiles:
        printlog("Processing " + f)

        # Output names are derived from the BAM file name; note that every
        # occurrence of 'bam' in the name is removed, not only the suffix.
        out_prefix = os.path.basename(f).replace('bam', '')

        with open(out_prefix + 'summary.txt', 'w') as SUM, \
                open(out_prefix + 'tin.xls', 'w') as OUT:
            print >> SUM, "\t".join(
                ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)'])
            print >> OUT, "\t".join(
                ["geneID", "chrom", "tx_start", "tx_end", "TIN"])

            samfile = pysam.Samfile(f, "rb")
            try:
                sample_TINs = []  # sample level TIN, values are from different genes
                finish = 0
                # NOTE(review): noise_level is only reassigned when
                # intron_size > 0, so a transcript without introns reuses
                # the value of the previous transcript -- confirm this is
                # intended.
                noise_level = 0.0
                for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(
                        refbed=options.ref_gene_model,
                        sample_size=options.sample_size):
                    finish += 1

                    # check minimum reads coverage
                    if check_min_reads(samfile, i_chr, i_tx_start, i_tx_end,
                                       options.minimum_coverage) is not True:
                        print >> OUT, '\t'.join([
                            str(i)
                            for i in (gname, i_chr, i_tx_start, i_tx_end, 0.0)
                        ])
                        continue

                    # estimate background noise if '-s' was specified
                    if options.subtract_bg:
                        intron_signals = estimate_bg_noise(
                            i_chr, i_tx_start, i_tx_end, samfile, exon_ranges)
                        if intron_size > 0:
                            noise_level = intron_signals / intron_size

                    coverage = genebody_coverage(samfile, i_chr,
                                                 sorted(pick_positions),
                                                 noise_level)

                    tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                    sample_TINs.append(tin1)
                    print >> OUT, '\t'.join([
                        str(i)
                        for i in (gname, i_chr, i_tx_start, i_tx_end, tin1)
                    ])
                    # progress counter, rewritten in place on stderr
                    print >> sys.stderr, " %d transcripts finished\r" % (finish),

                print >> SUM, "\t".join([
                    str(i)
                    for i in (os.path.basename(f), mean(sample_TINs),
                              median(sample_TINs), std(sample_TINs))
                ])
            finally:
                # release the BAM handle even if a transcript fails
                samfile.close()
def main(args, outs):
    """Convert one chunk of interleaved, barcoded reads to unaligned BAM or FASTQ.

    The gzip-compressed input ``args.read_chunk`` is decompressed with
    ``gunzip`` and read in clusters of nine lines (see ``read_clusters``).
    Depending on ``args.output_format`` each cluster is written either as two
    unmapped BAM records (read 1 and read 2) to ``outs.barcoded_unaligned``
    or as two FASTQ records to ``outs.barcoded``.

    On return ``outs.num_pairs`` holds the number of clusters processed and
    ``outs.correct_bc_pairs`` the number of those with a processed barcode.

    This is Python 2 code (``it.next()``).
    """

    def read_clusters(it):
        # Yield (bc, fields) for every cluster of nine consecutive lines of
        # `it`: header, read 1, qual 1, read 2, qual 2, barcode, barcode
        # qual, sample index, sample index qual.  The loop has no explicit
        # exit; it ends when `it.next()` raises StopIteration at the end of
        # the input (Python 2 generator semantics).
        while True:
            head = str(it.next().strip())
            seq1 = str(it.next().strip())
            # the first `trim_length` bases (and qualities) of read 1 are
            # split off and carried separately
            trim1 = seq1[:args.trim_length]
            seq1 = seq1[args.trim_length:]
            q1 = str(it.next().strip())
            trim_q1 = q1[:args.trim_length]
            q1 = q1[args.trim_length:]
            seq2 = str(it.next().strip())
            q2 = str(it.next().strip())
            # The barcode line holds either "first,second" or a single
            # value.  With two values the first becomes `bc` and the second
            # `rx`; with one value it becomes `rx` and `bc` is None.
            # NOTE(review): presumably "processed,raw" barcode -- confirm
            # against the producer of this file.
            bc = str(it.next().strip())
            rx = bc.split(',')
            bc = rx[0]
            if len(rx) > 1:
                rx = rx[1]
            else:
                rx = rx[0]
                bc = None
            bcq = str(it.next().strip())
            si = str(it.next().strip())
            siq = str(it.next().strip())
            lines = [
                head, seq1, q1, seq2, q2, rx, bc, bcq, si, siq, trim1, trim_q1
            ]
            yield (bc, lines)

    # Fall back to 'unknown' when the name `martian` is not defined.
    try:
        version = martian.get_pipelines_version()
    except NameError:
        version = 'unknown'

    def make_rg_header(packed_rg_string):
        '''Make the RG header, matching how it's done in Lariat.'''
        result = packed_rg_string.split(':')
        if len(result) != 5:
            raise Exception(
                "RG string must have this format - sample_id:library_id:gem_group:flowcell:lane"
            )
        # flowcell and lane are unpacked but not used in the header entry
        sample_id, library_id, gem_group, flowcell, lane = result
        return {
            'ID': packed_rg_string,
            'SM': sample_id,
            'LB': library_id,
            'PU': gem_group,
            'PL': 'ILLUMINA'
        }
        #return '@RG\\tID:{0}\\tSM:{1}\\tLB:{2}.{3}\\tPU:{0}\\tPL:ILLUMINA'.format(packed_rg_string, sample_id, library_id, gem_group)

    # BAM header: one @RG entry per read group plus @PG / @CO records
    header = {
        'HD': {
            'VN': '1.3',
            'SO': 'unknown'
        },
        'RG': [make_rg_header(rg_string) for rg_string in args.read_groups],
        'PG': [{
            'ID': 'make_unaligned_bam',
            'PN': '10X longranger/make_unaligned_bam',
            'VN': version
        }],
        'CO': [
            '10x_bam_to_fastq:R1(RX:QX,TR:TQ,SEQ:QUAL)',
            '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'
        ]
    }

    # Exactly one of out_bam / out_fastq is opened; the other is None.
    if args.output_format == "bam":
        out_bam = pysam.Samfile(outs.barcoded_unaligned,
                                mode='wb',
                                header=header)
        out_fastq = None
    elif args.output_format == "fastq":
        out_fastq = open(outs.barcoded, 'w')
        out_bam = None
    else:
        # NOTE(review): neither out_bam nor out_fastq is bound on this
        # path, so the code below relies on martian.exit() not returning.
        martian.exit("MAKE_UNALIGNED_OUPUT: invalid output format: '%s'" %
                     args.output_format)

    def wfq(head, seq, qual):
        # Write one four-line FASTQ record to out_fastq.
        out_fastq.write(head)
        out_fastq.write("\n")
        out_fastq.write(seq)
        out_fastq.write("\n+\n")
        out_fastq.write(qual)
        out_fastq.write("\n")

    # Open FASTQ input chunk
    # NOTE(review): the gunzip process is never waited on and its exit
    # status is not checked, so a failed decompression looks like an early
    # end of input.
    proc = subprocess.Popen(["gunzip", "--stdout", args.read_chunk],
                            stdout=subprocess.PIPE)
    reader = proc.stdout

    num_pairs = 0
    correct_bc_pairs = 0

    for (bc, fields) in read_clusters(reader):
        (head, seq1, q1, seq2, q2, rx, bc, bcq, si, siq, trim,
         trim_qual) = fields

        # first space-separated token is the read name, the last one the
        # read group
        head_parts = head.split(" ")
        qname = head_parts[0]
        rg = head_parts[-1]

        tags1 = [('RG', str(rg)), (SAMPLE_INDEX_TAG, si),
                 (SAMPLE_INDEX_QUAL_TAG, siq)]
        tags2 = [('RG', str(rg)), (SAMPLE_INDEX_TAG, si),
                 (SAMPLE_INDEX_QUAL_TAG, siq)]

        # trimmed bases are only recorded on read 1
        if len(trim) > 0:
            tags1.append((TRIM_TAG, str(trim)))
            tags1.append((TRIM_QUAL_TAG, str(trim_qual)))

        num_pairs += 1
        if bc:
            tags1.append((PROCESSED_BARCODE_TAG, bc))
            tags2.append((PROCESSED_BARCODE_TAG, bc))
            correct_bc_pairs += 1

        tags1.append((RAW_BARCODE_TAG, rx))
        tags1.append((RAW_BARCODE_QUAL_TAG, bcq))

        tags2.append((RAW_BARCODE_TAG, rx))
        tags2.append((RAW_BARCODE_QUAL_TAG, bcq))

        if out_bam is not None:
            # Read 1
            a = pysam.AlignedRead()
            a.qname = qname
            a.seq = seq1
            a.qual = q1

            # Unmapped R1
            a.is_unmapped = True
            a.is_read1 = True
            a.tid = -1
            a.pos = -1
            a.mapq = 0
            # CIGAR operation 4 over the full read length
            a.cigar = [(4, len(seq1))]
            a.mrnm = -1
            a.mpos = -1
            a.tlen = -1
            a.tags = tags1

            # Read 2
            b = pysam.AlignedRead()
            b.qname = qname
            b.seq = seq2
            b.qual = q2
            b.is_unmapped = True
            b.is_read2 = True
            b.tid = -1
            b.pos = -1
            b.mapq = 0
            b.cigar = [(4, len(seq2))]
            b.mrnm = -1
            b.mpos = -1
            b.tlen = -1
            b.tags = tags2

            out_bam.write(a)
            out_bam.write(b)

        if out_fastq is not None:
            # `header` is reused here for the FASTQ record name; the BAM
            # header dict is no longer needed at this point.
            header = qname
            if bc:
                bc_header = "%s:Z:%s" % (PROCESSED_BARCODE_TAG, bc)
                header = header + " " + bc_header

            # both mates get the same record name
            wfq(header, seq1, q1)
            wfq(header, seq2, q2)

    if out_bam is not None:
        out_bam.close()

    if out_fastq is not None:
        out_fastq.close()

    outs.num_pairs = num_pairs
    outs.correct_bc_pairs = correct_bc_pairs
def main():
    """Compute RPKM (or RPM) values for the regions of a BED-like file.

    Positional arguments (``sys.argv[1:6]``): region file, zero-based index
    of the chromosome column (start and end are the two columns after it),
    indexed BAM file, chrom.sizes file and output file.  The remaining
    arguments are optional flags; see the usage text below.

    Every input region line is copied to the output file with its score
    appended.  With ``-mappabilityNormalize`` the mappable fraction and the
    mappability-normalized score are appended as well.  Unless
    ``-uniqueBAM`` is given, each alignment is weighted by ``1/NH``.
    """
    # Five positional arguments are required, so argv needs at least six
    # entries (the previous `< 5` test let argv[5] raise IndexError).
    if len(sys.argv) < 6:
        print(
            'usage: python %s bedfilename chrField BAMfilename chrom.sizes outputfilename [-nomulti] [-RPM] [-stranded +|-] [-readLength min max] [-printSum] [-uniqueBAM] [-mappabilityNormalize mappability.wig readLength] [-noNH samtools]'
            % sys.argv[0])
        print('Note: the script will divide multireads by their multiplicity')
        print('\t-printSum option only working together with the RPM option')
        print(
            '\tuse the uniqueBAM option if the BAM file contains only unique alignments; this will save a lot of memory'
        )
        print(
            '\tuse the -mappabilityNormalize option to get mappability normalized RPKMs (it will not do anything to the RPMs; not that a mappability track that goes from 0 to the read length is assumed'
        )
        print(
            '\tuse the -noNH option and supply a path to samtools in order to have the file converted to one that has NH tags'
        )
        print(
            '\tthe stranded option will normalized against all reads, not just reads on the indicated strand'
        )
        sys.exit(1)

    bed = sys.argv[1]
    fieldID = int(sys.argv[2])
    SAM = sys.argv[3]
    chromSize = sys.argv[4]
    outfilename = sys.argv[5]

    # (chromosome, 0, length) for every line of the chrom.sizes file
    chromInfoList = []
    with open(chromSize) as linelist:
        for line in linelist:
            fields = line.strip().split('\t')
            chromInfoList.append((fields[0], 0, int(fields[1])))

    # Counters are initialised unconditionally so that every later use is
    # defined regardless of which flags were given.
    ORLL = 0  # alignments outside the -readLength limits
    RPMSum = 0
    TotalReads = 0

    noMulti = False
    if '-nomulti' in sys.argv:
        noMulti = True
        print('will discard multi-read alignments')

    doReadLength = False
    if '-readLength' in sys.argv:
        doReadLength = True
        minRL = int(sys.argv[sys.argv.index('-readLength') + 1])
        maxRL = int(sys.argv[sys.argv.index('-readLength') + 2])
        print('will only consider reads between', minRL, 'and', maxRL,
              'bp length')

    doPrintSum = False

    doStranded = False
    if '-stranded' in sys.argv:
        doStranded = True
        thestrand = sys.argv[sys.argv.index('-stranded') + 1]
        print('will only consider', thestrand, 'strand reads')

    doRPM = False
    if '-RPM' in sys.argv:
        doRPM = True
        print('will output RPMs')
        # -printSum is only honoured together with -RPM
        if '-printSum' in sys.argv:
            doPrintSum = True

    doUniqueBAM = False
    if '-uniqueBAM' in sys.argv:
        print('will treat all alignments as unique')
        doUniqueBAM = True

    samfile = pysam.Samfile(SAM, "rb")
    try:
        print('testing for NH tags presence')
        # Only the first alignment is inspected; opt() raises if the tag
        # is missing.
        for alignedread in samfile.fetch():
            alignedread.opt('NH')
            print('file has NH tags')
            break
    except Exception:
        if '-noNH' in sys.argv:
            print(
                'no NH: tags in BAM file, will replace with a new BAM file with NH tags'
            )
            samtools = sys.argv[sys.argv.index('-noNH') + 1]
            BAMpreporcessingScript = sys.argv[0].rpartition(
                '/')[0] + '/bamPreprocessing.py'
            # The old handle refers to the file that is about to be
            # replaced; drop it and reopen once the new file is indexed.
            samfile.close()
            # NOTE(review): these commands are assembled as shell strings,
            # so paths containing spaces or shell metacharacters break them.
            cmd = 'python ' + BAMpreporcessingScript + ' ' + SAM + ' ' + SAM + '.NH'
            os.system(cmd)
            cmd = 'rm ' + SAM
            os.system(cmd)
            cmd = 'mv ' + SAM + '.NH' + ' ' + SAM
            os.system(cmd)
            cmd = samtools + ' index ' + SAM
            os.system(cmd)
            samfile = pysam.Samfile(SAM, "rb")
        else:
            if doUniqueBAM:
                pass
            else:
                print('no NH: tags in BAM file, exiting')
                sys.exit(1)

    doMappabilityCorrection = False
    if not doRPM and '-mappabilityNormalize' in sys.argv:
        doMappabilityCorrection = True
        print('will correct for mappability')
        mappability = sys.argv[sys.argv.index('-mappabilityNormalize') + 1]
        readLength = int(sys.argv[sys.argv.index('-mappabilityNormalize') + 2])
        # WantedDict: chromosome -> {position: mappability score} for every
        # position covered by a region.
        # MappabilityRegionDict: chromosome -> {(left, right): fraction}.
        WantedDict = {}
        MappabilityRegionDict = {}
        i = 0
        print('inputting regions')
        with open(bed) as lineslist:
            for line in lineslist:
                if line[0] == '#':
                    continue
                i += 1
                if i % 1000 == 0:
                    print(i, 'regions inputted')
                fields = line.strip().split('\t')
                if len(fields) < fieldID + 2:
                    continue
                chrom = fields[fieldID]
                try:
                    left = int(fields[fieldID + 1])
                    right = int(fields[fieldID + 2])
                except (ValueError, IndexError):
                    print('problem with region, skipping:', line.strip())
                    # actually skip: left/right would otherwise be stale
                    # values from a previous line (or undefined)
                    continue
                if left >= right:
                    print('problem with region, skipping:', chrom, left, right)
                    continue
                if chrom not in MappabilityRegionDict:
                    MappabilityRegionDict[chrom] = {}
                    WantedDict[chrom] = {}
                MappabilityRegionDict[chrom][(left, right)] = 0
                for j in range(left, right):
                    WantedDict[chrom][j] = 0
        print('inputting mappability')
        i = 0
        with open(mappability) as lineslist:
            for line in lineslist:
                if line.startswith('#'):
                    continue
                i += 1
                if i % 1000000 == 0:
                    print(str(i / 1000000) + 'M lines processed')
                # tab-separated, falling back to space-separated
                fields = line.strip().split('\t')
                if len(fields) == 1:
                    fields = line.strip().split(' ')
                chrom = fields[0]
                left = int(fields[1])
                right = int(fields[2])
                score = float(fields[3])
                if chrom not in WantedDict:
                    continue
                wanted = WantedDict[chrom]
                for j in range(left, right):
                    if j in wanted:
                        wanted[j] = score
        print('calculating mappable fractions')
        for chrom in MappabilityRegionDict:
            for (left, right) in MappabilityRegionDict[chrom]:
                TotalScore = 0.0
                for j in range(left, right):
                    TotalScore += WantedDict[chrom][j]
                Score = TotalScore / (right - left)
                MappabilityRegionDict[chrom][(left, right)] = Score / readLength
        # release the per-position table
        WantedDict = {}

    if doUniqueBAM and not doReadLength:
        # The index statistics give the mapped read counts directly.
        TotalReads = 0
        idxstats = pysam.idxstats(SAM)
        # Depending on the pysam version this is a list of lines or one
        # string; accept both.
        if isinstance(idxstats, str):
            idxstats = idxstats.splitlines()
        for chrStats in idxstats:
            fields = chrStats.strip().split('\t')
            chrom = fields[0]
            reads = int(fields[2])
            if chrom != '*':
                TotalReads += reads
        UniqueReads = TotalReads
    else:
        # One pass over all alignments: count unique reads (NH == 1) and
        # collect the IDs of multi-mapping reads.
        MultiplicityDict = {}
        UniqueReads = 0
        i = 0
        samfile = pysam.Samfile(SAM, "rb")
        for (chrom, start, end) in chromInfoList:
            try:
                for alignedread in samfile.fetch(chrom, start, end):
                    i += 1
                    if i % 5000000 == 0:
                        print(
                            str(i / 1000000) + 'M alignments processed', chrom,
                            start, end)
                    fields = str(alignedread).split('\t')
                    if doReadLength:
                        if len(alignedread.seq) > maxRL or len(
                                alignedread.seq) < minRL:
                            ORLL += 1
                            continue
                    if doUniqueBAM:
                        TotalReads += 1
                        continue
                    if alignedread.opt('NH') == 1:
                        UniqueReads += 1
                        continue
                    # mates are told apart by a /1 or /2 suffix
                    ID = fields[0]
                    if alignedread.is_read1:
                        ID = ID + '/1'
                    if alignedread.is_read2:
                        ID = ID + '/2'
                    if ID in MultiplicityDict:
                        MultiplicityDict[ID] += 1
                    else:
                        MultiplicityDict[ID] = 1
            except Exception:
                print('problem with region:', chrom, start, end, 'skipping')
        if doReadLength:
            print(ORLL, 'alignments outside of read length limits')
        if doUniqueBAM:
            pass
        else:
            # every multi-mapping read counts once, whatever its multiplicity
            TotalReads = UniqueReads + len(MultiplicityDict)

    print(TotalReads, UniqueReads)

    # millions of reads
    normalizeBy = TotalReads / 1000000.

    with open(outfilename, 'w') as outfile, open(bed) as lineslist:
        i = 0
        for line in lineslist:
            i += 1
            if i % 10000 == 0:
                print(i, 'regions processed')
            if line[0] == '#':
                continue
            fields = line.strip().split('\t')
            if len(fields) < fieldID + 2:
                continue
            chrom = fields[fieldID]
            try:
                left = int(fields[fieldID + 1])
                right = int(fields[fieldID + 2])
            except (ValueError, IndexError):
                print('problem with region, skipping:', line.strip())
                continue
            if left >= right:
                print('problem with region, skipping:', chrom, left, right)
                continue
            reads = 0
            try:
                for alignedread in samfile.fetch(chrom, left, right):
                    if doReadLength:
                        if len(alignedread.seq) > maxRL or len(
                                alignedread.seq) < minRL:
                            continue
                    if doStranded:
                        if alignedread.is_reverse:
                            s = '-'
                        else:
                            s = '+'
                        if s != thestrand:
                            continue
                    if doUniqueBAM:
                        reads += 1
                    else:
                        if noMulti and alignedread.opt('NH') > 1:
                            continue
                        # multi-reads contribute 1/multiplicity
                        reads += 1. / alignedread.opt('NH')
            except Exception:
                print('problem with region:', chrom, left, right,
                      'assigning 0 value')
                reads = 0
            if doRPM:
                score = reads / normalizeBy
            else:
                try:
                    # reads per kilobase of region per million reads
                    score = reads / (((right - left) / 1000.) * normalizeBy)
                except ZeroDivisionError:
                    print('region of size 0, skipping:', line.strip())
                    continue
            if doPrintSum:
                RPMSum += score
            outline = line.strip() + '\t' + str(score)
            if doMappabilityCorrection:
                fraction = MappabilityRegionDict[chrom][(left, right)]
                outline = outline + '\t' + str(fraction)
                if fraction == 0:
                    outline = outline + '\t0'
                else:
                    outline = outline + '\t' + str(score / fraction)
            outfile.write(outline + '\n')

        if doPrintSum:
            outfile.write('#Total RPM:' + str(RPMSum) + '\n')