def test_bedcov_split_lines():
    """Check the field count of ``pysam.bedcov`` output in list-of-lines mode.

    ``split_lines=True`` requests the pysam 0.8.X style output, which returns
    a list of lines. Every line must split on tabs into 4 or 5 fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    rows = pysam.bedcov(bed_path, bam_path, split_lines=True)
    for row in rows:
        columns = row.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, (
            "bedcov should give tab delimited output with 4 or 5 fields. "
            "Split line (%s) gives %d fields." % (columns, n_columns))
def bedcov(bed_fname, bam_fname):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region. Reads with MAPQ 0 are excluded
    by passing ``-Q 1`` to bedcov.

    This is a generator: bedcov is not run, and no exception is raised, until
    the result is iterated.

    Parameters
    ----------
    bed_fname : str
        Path of the BED file listing the regions.
    bam_fname : str
        Path of the BAM file to measure.

    Yields
    ------
    tuple
        ``(chrom, start, end, name, count, mean_depth)`` per region, where
        ``count`` is the base count divided by the module-level ``READ_LEN``
        and ``mean_depth`` is the base count divided by the region length.
        ``name`` is ``"-"`` when the BED file has no name column. Both
        numbers are 0 for regions whose length is not positive.

    Raises
    ------
    ValueError
        If pysam reports a samtools error, if bedcov produces no output, or
        if a coordinate/count field is not an integer.
    RuntimeError
        If an output line does not have 4 or 5 tab-separated fields.
    """
    # Count bases in each region; exclude 0-MAPQ reads
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, '-Q', '1')
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r" % (bed_fname, bam_fname))
    # Depending on the pysam version the result is a list of lines or a single
    # string; iterating a string directly would walk it character by character.
    if hasattr(lines, 'splitlines'):
        lines = lines.splitlines()
    for line in lines:
        fields = line.split('\t')
        if len(fields) == 5:
            chrom, start_s, end_s, name, basecount_s = fields
        elif len(fields) == 4:
            # 3-column BED input: there is no name column to carry through
            chrom, start_s, end_s, basecount_s = fields
            name = "-"
        else:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = [
            int(value) for value in (start_s, end_s, basecount_s.strip())]
        span = end - start
        if span > 0:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    This is a generator: bedcov is not run, and no exception is raised, until
    the result is iterated.

    Parameters
    ----------
    bed_fname : str
        Path of the BED file listing the regions.
    bam_fname : str
        Path of the BAM file to measure.
    min_mapq : int or None
        Minimum mapping quality, passed to bedcov as ``-Q`` when positive.
        ``None`` or a non-positive value applies no filter.

    Yields
    ------
    tuple
        ``(chrom, start, end, name, count, mean_depth)`` per region, where
        ``count`` is the base count divided by the module-level ``READ_LEN``
        and ``mean_depth`` is the base count divided by the region length.
        ``name`` is ``"-"`` when the BED file has no name column. Both
        numbers are 0 for regions whose length is not positive.

    Raises
    ------
    ValueError
        If pysam reports a samtools error, if bedcov produces no output, or
        if a coordinate/count field is not an integer.
    RuntimeError
        If an output line does not have 4 or 5 tab-separated fields.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq and min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r" % (bed_fname, bam_fname))
    # Depending on the pysam version the result is a list of lines or a single
    # string; iterating a string directly would walk it character by character.
    if hasattr(lines, 'splitlines'):
        lines = lines.splitlines()
    for line in lines:
        fields = line.split('\t')
        if len(fields) == 5:
            chrom, start_s, end_s, name, basecount_s = fields
        elif len(fields) == 4:
            # 3-column BED input: there is no name column to carry through
            chrom, start_s, end_s, basecount_s = fields
            name = "-"
        else:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = [
            int(value) for value in (start_s, end_s, basecount_s.strip())]
        span = end - start
        if span > 0:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
def parallelCov(cov_args):
    """Compute the mean coverage of every BED interval for one BAM file.

    Parameters
    ----------
    cov_args : tuple
        ``(bam, bed)``: path of the BAM file and path of the BED file. A
        single tuple is taken so the function can be handed to a pool-style
        ``map`` (presumably how it is called; confirm against the caller).

    Returns
    -------
    dict
        Two keys. ``"<sample>_mean_cvg"`` maps to a list of mean coverages
        (as strings), where ``<sample>`` is the BAM base name up to the first
        underscore. ``"Target"`` maps to the matching ``chrom:start-end``
        labels, in bedcov output order.
    """
    (bam, bed) = cov_args
    mq = 0  # can param min mapping quality if desired
    sample_id = os.path.basename(bam).split('_')[0]
    print("Generating coverage metrics for: " + sample_id)
    sys.stdout.flush()
    column_key = sample_id + "_mean_cvg"

    # str(), not bytes(): on Python 3 bytes(n) builds n NUL bytes (b'' for 0)
    # rather than the decimal text samtools expects after -Q.
    cmd = [bed, bam, '-Q', str(mq)]
    bstring = pysam.bedcov(*cmd, split_lines=False)

    targets = []
    sampleCov = []
    for line in bstring.splitlines():
        fields = line.split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        # bedcov appends the base count after the input BED columns, so it is
        # always the last field regardless of how many columns the BED has.
        coverage = int(fields[-1])
        intlen = float(end - start)
        # A zero-length interval has no defined mean; report 0.0 instead of
        # aborting the whole sample with ZeroDivisionError.
        meancov = coverage / intlen if intlen else 0.0
        targets.append(chrom + ":" + str(start) + "-" + str(end))
        sampleCov.append(str(meancov))

    return {column_key: sampleCov, 'Target': targets}
def test_bedcov():
    """Check the field count of ``pysam.bedcov`` output in single-string mode.

    ``split_lines=False`` requests the pysam 0.9.X style output, which returns
    a string that needs to be split by newlines. Every resulting line must
    split on tabs into 4 or 5 fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    raw_output = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for row in raw_output.splitlines():
        columns = row.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, (
            "bedcov should give tab delimited output with 4 or 5 fields. "
            "Split line (%s) gives %d fields." % (columns, n_columns))
def compute_coverage_with_samtools(args, input_bam, mean_coverage):
    """Generate GC and chromosome normalized coverage for the entire BAM file.

    Based on https://www.biostars.org/p/92744/

    Steps, as performed below:

    1. Write fixed-width windows to a temporary ``.bed`` file created in
       ``args.tempdir`` (``window_maker`` with ``g=args.genome`` and
       ``w=args.gc_window_size``).
    2. Run ``pysam.bedcov`` on those windows and ``input_bam`` (passing
       ``--reference args.reference``), annotate the result with
       ``nucleotide_content(fi=args.reference)`` and load selected columns
       into a data frame.
    3. Drop windows with no alignable sequence and derive the two
       normalized-coverage summaries.

    Args:
        args: Object providing ``tempdir``, ``genome``, ``gc_window_size`` and
            ``reference`` attributes (presumably the parsed command-line
            arguments; confirm against the caller).
        input_bam: BAM file handed to ``pysam.bedcov``.
        mean_coverage: Divisor used to normalize per-window coverage; also
            forwarded to ``samtools_norm_coverage_group``.

    Returns:
        Tuple ``(norm_coverage_by_chrom, norm_coverage_by_gc)``. The first is
        the ``to_dict()`` of applying ``samtools_norm_coverage_group`` (defined
        elsewhere) to each chromosome group. The second is the ``to_dict()``
        of the ``count`` and ``mean`` of normalized coverage grouped by GC
        fraction rounded to two decimals.
    """
    logging.info(
        "Computing chromosome and GC normalized coverage for %s with samtools and bedtools",
        input_bam,
    )
    with tempfile.NamedTemporaryFile(mode="w", suffix=".bed", dir=args.tempdir) as window_bed_file:
        # The return value is unused; the call matters because it writes the
        # windows to window_bed_file.name, which bedcov reads just below.
        # pylint: disable=unexpected-keyword-arg
        windows_bed = bed.BedTool().window_maker(g=args.genome, w=args.gc_window_size, output=window_bed_file.name)
        # Columns 0-3 are named chrom/start/end/bases and the remaining selected
        # columns are nucleotide counts and sequence length.
        # NOTE(review): "bases" looks like the bedcov base count and header=0
        # suggests nucleotide_content emits a header row - confirm.
        windows_table = (
            # pylint: disable=no-member,unexpected-keyword-arg
            bed.BedTool(pysam.bedcov(window_bed_file.name, input_bam, "--reference", args.reference), from_string=True).nucleotide_content(
                fi=args.reference).to_dataframe(
                    index_col=False,
                    header=0,
                    usecols=[0, 1, 2, 3, 7, 8, 10, 11, 12],
                    names=[
                        "chrom",
                        "start",
                        "end",
                        "bases",
                        "num_C",
                        "num_G",
                        "num_N",
                        "num_oth",
                        "seq_len",
                    ],
                    dtype={"chrom": str},
                ))
        # Remove windows with no alignable data: alignable length is the
        # window length minus N and "other" bases, and zero-length windows
        # would otherwise divide by zero below.
        windows_table["align_len"] = (windows_table.seq_len - windows_table.num_N - windows_table.num_oth)
        windows_table = windows_table[windows_table.align_len != 0]
        # Compute normalized coverage by chromosome
        norm_coverage_by_chrom = (windows_table.groupby("chrom").apply(
            samtools_norm_coverage_group, mean_coverage).to_dict())
        # Compute normalized coverage by GC bin; rounding to 2 decimals is
        # what defines the bins used as group keys.
        gc_fraction = np.round(
            (windows_table.num_G + windows_table.num_C) / windows_table.align_len, 2)
        # Per-window depth (bases / alignable length) relative to mean_coverage
        norm_coverage = (windows_table.bases / windows_table.align_len) / mean_coverage
        norm_coverage_by_gc = (norm_coverage.groupby(gc_fraction).agg(
            ["count", "mean"]).to_dict())
        return norm_coverage_by_chrom, norm_coverage_by_gc
def test_bedcov_split_lines():
    """Validate ``pysam.bedcov`` rows when asking for a list of lines.

    Uses ``ex1.bam`` / ``ex1.bed`` from ``BAM_DATADIR``. ``split_lines=True``
    is the pysam 0.8.X style output, which returns a list of lines. Each line
    must contain 4 or 5 tab-separated fields.
    """
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")

    for record in pysam.bedcov(bed_path, bam_path, split_lines=True):
        parts = record.split('\t')
        field_total = len(parts)
        assert field_total in (4, 5), \
            ("bedcov should give tab delimited output with 4 or 5 fields. "
             "Split line (%s) gives %d fields." % (parts, field_total))
def test_bedcov():
    """Validate ``pysam.bedcov`` rows when asking for one output string.

    Uses ``ex1.bam`` / ``ex1.bed`` from ``BAM_DATADIR``. ``split_lines=False``
    is the pysam 0.9.X style output, which returns a string that needs to be
    split by newlines. Each line must contain 4 or 5 tab-separated fields.
    """
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")

    output_text = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for record in output_text.splitlines():
        parts = record.split('\t')
        field_total = len(parts)
        assert field_total in (4, 5), \
            ("bedcov should give tab delimited output with 4 or 5 fields. "
             "Split line (%s) gives %d fields." % (parts, field_total))
def test_bedcov_split_lines():
    """Ensure list-mode ``pysam.bedcov`` output is tab delimited as expected.

    ``split_lines=True`` gives the pysam 0.8.X style output, which returns a
    list of lines. A line is accepted when it has 4 or 5 fields.
    """
    allowed_widths = (4, 5)
    bam_filename = "./pysam_data/ex1.bam"
    bed_filename = "./pysam_data/ex1.bed"
    output_lines = pysam.bedcov(bed_filename, bam_filename, split_lines=True)
    for entry in output_lines:
        cells = entry.split('\t')
        width = len(cells)
        assert width in allowed_widths, (
            "bedcov should give tab delimited output with 4 or 5 fields. "
            "Split line (%s) gives %d fields." % (cells, width))
def test_bedcov():
    """Ensure string-mode ``pysam.bedcov`` output is tab delimited as expected.

    ``split_lines=False`` gives the pysam 0.9.X style output, which returns a
    string that needs to be split by newlines. A line is accepted when it has
    4 or 5 fields.
    """
    allowed_widths = (4, 5)
    bam_filename = "./pysam_data/ex1.bam"
    bed_filename = "./pysam_data/ex1.bed"
    output_lines = pysam.bedcov(
        bed_filename, bam_filename, split_lines=False).splitlines()
    for entry in output_lines:
        cells = entry.split('\t')
        width = len(cells)
        assert width in allowed_widths, (
            "bedcov should give tab delimited output with 4 or 5 fields. "
            "Split line (%s) gives %d fields." % (cells, width))
def pysam_depth(bam, bed):
    """Get the total number of bases in each BED region.

    Parameters
    ----------
    bam : str
        Path of the BAM file; ``<bam>.bai`` must exist next to it.
    bed : str
        Path of the BED file listing the regions.

    Returns
    -------
    list of int
        The last bedcov column (the base count) for every region, in bedcov
        output order. A real list is returned, not a lazy ``map`` object, so
        the result can be indexed, measured and iterated more than once on
        Python 3 as it could on Python 2.

    Raises
    ------
    Exception
        If the ``.bai`` index file is missing.
    ValueError
        If pysam reports a samtools error, if bedcov produces no output, or
        if a count field is not an integer.
    """
    if not os.path.isfile(bam + '.bai'):
        raise Exception("index for BAM file %s isn't found" % (bam))
    try:
        raw = pysam.bedcov(bed, bam, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam, bed, exc))
    stripped = raw.rstrip()
    if not stripped:
        # Without this guard the parse below fails on int('') with a message
        # that says nothing about the actual cause.
        raise ValueError("bedcov produced no output for BAM file %r and "
                         "BED file %r" % (bam, bed))
    return [int(line.split('\t')[-1]) for line in stripped.split('\n')]
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    This is a generator: bedcov is not run, and no exception is raised, until
    the result is iterated.

    Parameters
    ----------
    bed_fname : str
        Path of the BED file listing the regions.
    bam_fname : str
        Path of the BAM file to measure.
    min_mapq : int or None
        Minimum mapping quality, passed to bedcov as ``-Q`` when positive.
        ``None`` or a non-positive value applies no filter.

    Yields
    ------
    tuple
        ``(count, row)`` per region. ``count`` is the base count divided by
        the module-level ``READ_LEN``. ``row`` is
        ``(chrom, start, end, gene, log2_depth, mean_depth)``, where
        ``mean_depth`` is the base count divided by the region length and
        ``log2_depth`` is its base-2 logarithm, or ``NULL_LOG2_COVERAGE`` when
        the mean depth is 0. ``gene`` is ``"-"`` when the BED file has no
        name column.

    Raises
    ------
    ValueError
        If pysam reports a samtools error, if bedcov produces no output, or
        if a coordinate/count field is not an integer.
    RuntimeError
        If an output line does not have 4 or 5 tab-separated fields.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq and min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r" % (bed_fname, bam_fname))
    # Depending on the pysam version the result is a list of lines or a single
    # string. Duck-typing on splitlines() avoids `basestring`, which does not
    # exist on Python 3.
    if hasattr(lines, 'splitlines'):
        lines = lines.splitlines()
    for line in lines:
        # maxsplit=5 keeps over-long lines at 6 fields so they are rejected
        fields = line.split('\t', 5)
        if len(fields) == 5:
            chrom, start_s, end_s, gene, basecount_s = fields
        elif len(fields) == 4:
            # 3-column BED input: there is no name column to carry through
            chrom, start_s, end_s, basecount_s = fields
            gene = "-"
        else:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = [
            int(value) for value in (start_s, end_s, basecount_s.strip())]
        span = end - start
        if span > 0:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        row = (chrom, start, end, gene,
               math.log(mean_depth, 2) if mean_depth else NULL_LOG2_COVERAGE,
               mean_depth)
        yield count, row
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    Parameters
    ----------
    bed_fname : str
        Path of the BED file listing the regions.
    bam_fname : str
        Path of the BAM file to measure.
    min_mapq : int or None
        Minimum mapping quality, passed to bedcov as ``-Q`` when positive.
        ``None`` or a non-positive value applies no filter.

    Returns
    -------
    pandas.DataFrame
        The bedcov output parsed as a tab-separated table, restricted to and
        named by the columns that ``detect_bedcov_columns`` selects.

    Raises
    ------
    ValueError
        If pysam reports a samtools error or bedcov produces no output.
    """
    # Count bases in each region; exclude low-MAPQ reads
    cmd = [bed_fname, bam_fname]
    if min_mapq and min_mapq > 0:
        # str(), not bytes(): on Python 3 bytes(n) builds n NUL bytes rather
        # than the decimal text samtools expects after -Q.
        cmd.extend(['-Q', str(min_mapq)])
    try:
        raw = pysam.bedcov(*cmd, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not raw:
        raise ValueError("BED file %r chromosome names don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    columns = detect_bedcov_columns(raw)
    table = pd.read_table(StringIO(raw), names=columns, usecols=columns)
    return table
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    This is a generator: bedcov is not run, and no exception is raised, until
    the result is iterated.

    Parameters
    ----------
    bed_fname : str
        Path of the BED file listing the regions.
    bam_fname : str
        Path of the BAM file to measure.
    min_mapq : int or None
        Minimum mapping quality, passed to bedcov as ``-Q`` when positive.
        ``None`` or a non-positive value applies no filter.

    Yields
    ------
    tuple
        ``(chrom, start, end, name, count, mean_depth)`` per region, where
        ``count`` is the base count divided by the module-level ``READ_LEN``
        and ``mean_depth`` is the base count divided by the region length.
        ``name`` is ``"-"`` when the BED file has no name column. Both
        numbers are 0 for regions whose length is not positive.

    Raises
    ------
    ValueError
        If pysam reports a samtools error, if bedcov produces no output, or
        if a coordinate/count field is not an integer.
    RuntimeError
        If an output line does not have 4 or 5 tab-separated fields.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq and min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r" % (bed_fname, bam_fname))
    # Depending on the pysam version the result is a list of lines or a single
    # string. Duck-typing on splitlines() avoids `basestring`, which does not
    # exist on Python 3.
    if hasattr(lines, 'splitlines'):
        lines = lines.splitlines()
    for line in lines:
        fields = line.split('\t')
        if len(fields) == 5:
            chrom, start_s, end_s, name, basecount_s = fields
        elif len(fields) == 4:
            # 3-column BED input: there is no name column to carry through
            chrom, start_s, end_s, basecount_s = fields
            name = "-"
        else:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = [
            int(value) for value in (start_s, end_s, basecount_s.strip())]
        span = end - start
        if span > 0:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
outfile = os.fdopen(fd, "w") outfile.close() if k % chunk_size: outfile.close() yield name def bedcov(bam_fname, bed_fname): ## pysam.bedcov ===> 'chr1\t200\t300\t2050\n' """Calculate depth of all regions in a BED file via samtools (pysam) bedcov. i.e. mean pileup depth across each region. """ # Count bases in each region; exclude low-MAPQ reads cmd = [bed_fname, bam_fname] if min_mapq and min_mapq > 0: cmd.extend(['-Q', bytes(min_mapq)]) try: raw = pysam.bedcov(*cmd, split_lines=False) except pysam.SamtoolsError as exc: raise ValueError("Failed processing %r coverages in %r regions. " "PySAM error: %s" % (bam_fname, bed_fname, exc)) if not raw: raise ValueError("BED file %r chromosome names don't match any in " "BAM file %r" % (bed_fname, bam_fname)) columns = detect_bedcov_columns(raw) table = pd.read_csv(StringIO(raw), sep='\t', names=columns, usecols=columns) #****************** return table def detect_bedcov_columns(text): """Determine which 'bedcov' output columns to keep. Format is the input BED plus a final appended column with the count of basepairs mapped within each row's region. The input BED might have 3