示例#1
0
def get_median_read_coverage(output_path, num_threads, overwrite_files):
    """Return an approximate median read coverage derived from ``samtools stats``.

    The stats report for ``<output_path>c_reads_against_query.s.bam`` is written
    to ``<output_path>c_reads_against_query.s.bam.stats``. An existing report is
    reused unless ``overwrite_files`` is true.

    :param output_path: prefix that is concatenated directly with the file
        names (no path separator is added here).
    :param num_threads: value passed to ``samtools stats`` via ``-@``.
    :param overwrite_files: regenerate the stats report even if it exists.
    :return: zero-based index of the first coverage histogram bin at which the
        cumulative count reaches half of the total count (the last parsed bin
        is excluded from both the total and the scan).
    :raises ValueError: if no usable ``COV`` bins remain after dropping the
        last one.
    """
    log("Calculating global read coverage")
    bam_path = output_path + "c_reads_against_query.s.bam"
    stats_path = bam_path + ".stats"

    stats_exists = os.path.isfile(stats_path)
    if stats_exists and not overwrite_files:
        log("retaining pre-existing file: " + stats_path)
    else:
        if stats_exists:
            log("overwriting pre-existing file: " + stats_path)
        st = pysam.stats("-@", str(num_threads), bam_path)
        with open(stats_path, "w") as f:
            f.write(st)

    # Collect the count column (4th tab-separated field) of every COV line.
    # Plain Python ints are used on purpose: per-bin base counts on large
    # genomes can exceed the int32 range, and Python ints never overflow.
    with open(stats_path) as f:
        counts = [int(line.split("\t")[3]) for line in f if line.startswith("COV")]

    # Drop the last bin, which the original author treats as the catch-all
    # for coverages > 1k.
    counts = counts[:-1]

    # Walk the histogram until the cumulative count reaches half of the total.
    # NOTE(review): the returned value is the position of the bin in the parsed
    # list, not the coverage value from the COV line itself; if the report can
    # omit bins or start at a coverage other than 0, the two differ - confirm.
    half_total = sum(counts) // 2
    running = 0
    for idx, count in enumerate(counts):
        running += count
        if running >= half_total:
            return idx
    raise ValueError("Unable to calculate read coverage. Check SAM/BAM files and stats file.")
示例#2
0
 def insert_size(this, bamFile, threads=1):
     """Return the average insert size and its standard deviation for a BAM file.

     Runs ``samtools stats`` through pysam and parses the two ``SN`` summary
     lines ``insert size average`` and ``insert size standard deviation``.

     :param bamFile: path of the BAM file handed to ``samtools stats``.
     :param threads: value passed to ``samtools stats`` via ``-@``.
     :return: ``(average, standard_deviation)`` as a tuple of floats.
     :raises AttributeError: if either summary line is absent from the
         report (``re.search`` returns ``None``).
     :raises ValueError: if a matched value cannot be converted to float.
     """
     result_str = pysam.stats("-@", str(threads), bamFile)
     # Raw strings keep ``\s`` / ``\S`` as regex classes instead of relying on
     # invalid string escape sequences (a SyntaxWarning on recent Pythons).
     mean_match = re.search(r'SN\s+insert size average:\s+(\S+)', result_str)
     mean_size = float(mean_match.group(1))
     std_match = re.search(
         r'SN\s+insert size standard deviation:\s+(\S+)', result_str)
     std_size = float(std_match.group(1))
     return (mean_size, std_size)
示例#3
0
def samtools_stats(x, threads=8):
    """Return the "Summary Numbers" (SN) section of ``samtools stats`` as a dict.

    Roughly equivalent to::

        samtools stats {in.bam} | grep ^SN | cut -f 2-

    For example, the insert size of a paired-end BAM file is available as
    ``samtools_stats(bam)['insert size average']``.

    :param x: path of the BAM file handed to ``samtools stats``.
    :param threads: value passed to ``samtools stats`` via ``-@``
        (defaults to 8, the previously hard-coded value).
    :return: dict mapping each SN label (surrounding ``:`` stripped) to its
        value; values are kept as the strings found in the report.
    :raises ValueError: if an SN line has fewer than three tab-separated
        fields.
    """
    stat = pysam.stats('-@', str(threads), x)
    d = {}
    for line in stat.split('\n'):
        # Only the summary-number records are of interest.
        if not line.startswith('SN'):
            continue
        # Fields: record tag, label, value; anything after the value
        # (e.g. a trailing comment column) is ignored.
        _, group, num = line.strip().split('\t')[0:3]
        d[group.strip(':')] = num
    return d