예제 #1
0
def _small_file_innerdist(start,
                          fastq_file,
                          pair_file,
                          ref_file,
                          out_base,
                          out_dir,
                          config,
                          remove_workdir=False):
    """Estimate the inner distance of paired reads from a trial alignment.

    A fresh ``innerdist_estimate`` directory is created under ``out_dir``
    (any existing one is deleted first), the selected bowtie runner aligns
    the reads there with the extra arguments ``-s <start> -u 250000``, and
    the resulting SAM file is scanned for properly paired first-of-pair
    reads.

    Returns a ``(mean, standard_deviation)`` tuple of rounded integers, or
    ``(None, None)`` when no proper pairs were found.

    NOTE(review): ``remove_workdir`` is accepted but never read in this
    function; the work directory is always wiped before the alignment.
    """
    work_dir = os.path.join(out_dir, "innerdist_estimate")
    # Always start from an empty working directory.
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    safe_makedir(work_dir)

    aligner = _select_bowtie_version(config)
    out_sam = aligner.align(fastq_file, pair_file, ref_file, out_base,
                            work_dir, config,
                            ["-s", str(start), "-u", "250000"])

    # Inner distance = absolute insert size minus twice the read length,
    # counted once per pair (read1 only).
    with closing(pysam.Samfile(out_sam)) as work_sam:
        inner_dists = [abs(aln.isize) - 2 * aln.rlen
                       for aln in work_sam
                       if aln.is_proper_pair and aln.is_read1]

    if not inner_dists:
        return None, None
    summary = Stats(inner_dists)
    mean_dist = int(round(summary.mean()))
    std_dist = int(round(summary.standard_deviation()))
    return mean_dist, std_dist
예제 #2
0
def calc_paired_insert_stats(in_bam):
    """Retrieve statistics for paired end read insert distances.

    Reads the BAM file ``in_bam`` and collects the absolute insert size of
    every properly paired first-of-pair read. Values that are not below
    ``median + 10 * median`` of the raw distances are dropped as outliers
    before the statistics are computed.

    Returns a dictionary with the keys ``"mean"``, ``"std"``, ``"median"``
    and ``"mad"``, all calculated on the outlier-filtered distances.

    MAD is the Median Absolute Deviation: http://en.wikipedia.org/wiki/Median_absolute_deviation
    """
    dists = []
    with closing(pysam.Samfile(in_bam, "rb")) as in_pysam:
        for read in in_pysam:
            if read.is_proper_pair and read.is_read1:
                dists.append(abs(read.isize))
    # remove outliers
    med = Stats(dists).median()
    # Build a real list: on Python 3 ``filter`` returns a one-shot iterator,
    # which would be exhausted by the first statistic and leave the later
    # ones operating on no data.
    # NOTE(review): when ``med`` is 0 nothing passes this filter, because
    # the distances are absolute values and the comparison is strict.
    filter_dists = [x for x in dists if x < med + 10 * med]
    filter_stats = Stats(filter_dists)
    median = filter_stats.median()
    return {"mean": filter_stats.mean(),
            "std": filter_stats.standard_deviation(),
            "median": median,
            "mad": Stats([abs(x - median) for x in filter_dists]).median()}
예제 #3
0
def _estimate_paired_innerdist(fastq_file, pair_file, ref_file, out_base,
                               out_dir, config):
    """Estimate the inner distance of paired reads with Bowtie.

    Distances are first collected with a start value of "1000000"; if that
    yields nothing, the alignment is repeated with a start value of "1".
    Returns a ``(mean, standard_deviation)`` tuple of rounded integers.
    """
    # Large files: skip the initial reads.
    distances = _bowtie_for_innerdist("1000000", fastq_file, pair_file,
                                      ref_file, out_base, out_dir, config)
    if len(distances) == 0:
        # Smaller files: nothing was found after skipping, so begin at the
        # first read instead.
        # NOTE(review): the trailing True is passed positionally; its
        # meaning is defined by _bowtie_for_innerdist - confirm there.
        distances = _bowtie_for_innerdist("1", fastq_file, pair_file,
                                          ref_file, out_base, out_dir,
                                          config, True)
    summary = Stats(distances)
    mean_dist = int(round(summary.mean()))
    std_dist = int(round(summary.standard_deviation()))
    return mean_dist, std_dist
예제 #4
0
 def descriptive_stats(xs):
     """Format a multi-line summary of the distribution of ``xs``.

     Inputs with fewer than two values are returned unchanged. Otherwise the
     result is a newline-joined string whose lines, each indented by two
     spaces, report the minimum, the 5th/25th/50th (median)/75th/95th/99th
     percentiles and the maximum.
     """
     if len(xs) < 2:
         return xs
     calc = Stats(xs)
     # (format string, percentile) pairs, in output order.
     percentile_rows = (
         ("5%%: %s", 5),
         ("25%%: %s", 25),
         ("median: %s", 50),
         ("75%%: %s", 75),
         ("95%%: %s", 95),
         ("99%%: %s", 99),
     )
     rows = ["min: %s" % min(xs)]
     for template, pct in percentile_rows:
         rows.append(template % calc.percentile(pct))
     rows.append("max: %s" % max(xs))
     return "\n".join("  " + row for row in rows)