import os
import shutil
import tempfile
from collections import Counter
from itertools import ifilter
from random import shuffle

import pandas as pd
import pysam

# Imports assumed for the project-local helpers used below: "bam" wraps
# pysam utilities (open_samfile, estimate_read_length, downsample, count,
# reheader) and "utils" provides general helpers such as reservoir_sample.
from bcbio import bam, utils
from bcbio.utils import file_exists


def fix_insert_size(in_bam, config):
    """
    Tophat sets PI in the RG header line to the inner distance size, but the
    SAM spec states it should be the insert size. This fixes the RG entry in
    the header of the Tophat-generated alignment file to match the spec.
    """
    fixed_file = os.path.splitext(in_bam)[0] + ".pi_fixed.bam"
    if file_exists(fixed_file):
        return fixed_file
    header_file = os.path.splitext(in_bam)[0] + ".header.sam"
    read_length = bam.estimate_read_length(in_bam)
    bam_handle = bam.open_samfile(in_bam)
    header = bam_handle.header.copy()
    rg_dict = header['RG'][0]
    if 'PI' not in rg_dict:
        return in_bam
    # insert size = inner distance + the length of both reads
    PI = int(rg_dict.get('PI'))
    PI = PI + 2 * read_length
    rg_dict['PI'] = PI
    header['RG'][0] = rg_dict
    # rewrite the alignments under the corrected header
    with pysam.Samfile(header_file, "wb", header=header) as out_handle:
        with bam.open_samfile(in_bam) as in_handle:
            for record in in_handle:
                out_handle.write(record)
    shutil.move(header_file, fixed_file)
    return fixed_file
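
# Hypothetical usage sketch (illustrative only): correct the RG PI field of a
# Tophat alignment before passing it downstream. The BAM path and the empty
# config dict are placeholders, not values the pipeline actually supplies.
def _example_fix_insert_size():
    fixed_bam = fix_insert_size("tophat_out/accepted_hits.bam", {})
    return fixed_bam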
def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a DataFrame with columns "reads" (number of reads sequenced) and
    "starts" (number of unique start sites identified at that depth).
    If sample_size is smaller than the total number of reads in the file,
    the file is downsampled first.
    """
    binsize = (sample_size / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            # record a point on the saturation curve every binsize reads
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
    # flush any reads left in the buffer as the final point
    seen_starts.update(buffer)
    num_reads.append(counted)
    starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a DataFrame with columns "reads" (number of reads sequenced) and
    "starts" (number of unique start sites identified at that depth).
    If sample_size is set, that many mapped reads are drawn from the file
    by reservoir sampling before counting.
    """
    binsize = (bam.count(bam_file, config) / 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])

        # if no sample size is set, use the whole file
        if not sample_size:
            samples = map(read_parser, filtered)
        else:
            samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        shuffle(samples)
        for read in samples:
            counted += 1
            buffer.append(read)
            # record a point on the saturation curve every binsize reads
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
    # flush any reads left in the buffer as the final point
    seen_starts.update(buffer)
    num_reads.append(counted)
    starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
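
# Hypothetical usage sketch (illustrative only): build the saturation curve
# for a BAM file and report the unique-start fraction at full sampled depth.
# The BAM path, empty config dict, and sample size are placeholders.
def _example_start_site_saturation():
    df = starts_by_depth("sample.bam", {}, sample_size=1000000)
    # fraction of sampled reads that contributed a previously unseen start site
    final_saturation = float(df["starts"].iloc[-1]) / df["reads"].iloc[-1]
    return df, final_saturation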
def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a Counter keyed by start site ("tid:pos") giving the number of
    sampled reads beginning at each site, based on a reservoir sample of up
    to sample_size mapped reads.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = ifilter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            return ":".join([str(read.tid), str(read.pos)])

        samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        count.update(samples)
    return count
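
# Hypothetical usage sketch (illustrative only): summarize start-site
# duplication from the Counter, here as the fraction of sampled reads that
# share a start position with at least one other read. "sample.bam" is a
# placeholder path.
def _example_duplicate_start_rate():
    counts = count_duplicate_starts("sample.bam", sample_size=1000000)
    total = sum(counts.values())
    duplicated = sum(c for c in counts.values() if c > 1)
    return float(duplicated) / total if total else 0.0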
def _fix_sam_header(in_file, config):
    """
    STAR outputs a duplicate cl: line in the header which breaks some
    downstream tools like FastQC
    https://groups.google.com/d/msg/rna-star/xxE4cUnafJQ/EUsgYId-dB8J
    This can be safely removed whenever that bug gets fixed.
    """
    with bam.open_samfile(in_file) as in_handle:
        header = in_handle.header
    with tempfile.NamedTemporaryFile(delete=False) as header_handle:
        # rebuild each header line as tab-separated tag:value fields,
        # dropping the problematic "cl" tag
        for key, line in header.items():
            line_key = "@" + str(key)
            for line_item in line:
                out_line = [line_key]
                out_line += [":".join([str(k), str(v)])
                             for k, v in line_item.items() if k != "cl"]
                header_handle.write("\t".join(out_line) + "\n")
        header_name = header_handle.name
    header_handle.close()
    return bam.reheader(header_name, in_file, config)
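
# Hypothetical usage sketch (illustrative only): strip the duplicate "cl" tag
# from a STAR alignment's header before running FastQC on it. The BAM path
# and empty config dict are placeholders for real pipeline values.
def _example_fix_star_header():
    cleaned_bam = _fix_sam_header("star_out/Aligned.out.bam", {})
    return cleaned_bam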