def _small_file_innerdist(start, fastq_file, pair_file, ref_file, out_base,
                          out_dir, config, remove_workdir=False):
    """Estimate the paired-end inner distance from a subset alignment.

    Aligns the reads with the configured bowtie runner inside a fresh
    ``innerdist_estimate`` directory under ``out_dir`` and collects, for
    every properly paired first-in-pair read, ``abs(isize) - 2 * rlen``.

    ``start`` is stringified and passed to the aligner after ``-s``; the
    aligner is also given ``-u 250000``.
    NOTE(review): presumably these are bowtie's "skip reads" and "stop
    after N reads" options -- confirm against the runner in use.

    ``remove_workdir`` is accepted but not used by this function.

    Returns a ``(mean, standard_deviation)`` tuple rounded to ints, or
    ``(None, None)`` when no qualifying reads were found.
    """
    estimate_dir = os.path.join(out_dir, "innerdist_estimate")
    # Always start from an empty working directory.
    if os.path.exists(estimate_dir):
        shutil.rmtree(estimate_dir)
    safe_makedir(estimate_dir)

    subset_args = ["-s", str(start), "-u", "250000"]
    aligner = _select_bowtie_version(config)
    sam_path = aligner.align(fastq_file, pair_file, ref_file, out_base,
                             estimate_dir, config, subset_args)

    with closing(pysam.Samfile(sam_path)) as sam_handle:
        inner_dists = [abs(aln.isize) - 2 * aln.rlen
                       for aln in sam_handle
                       if aln.is_proper_pair and aln.is_read1]

    if not inner_dists:
        return None, None
    summary = Stats(inner_dists)
    mean_dist = int(round(summary.mean()))
    std_dist = int(round(summary.standard_deviation()))
    return mean_dist, std_dist
def calc_paired_insert_stats(in_bam):
    """Retrieve statistics for paired end read insert distances.

    Reads ``in_bam`` and collects ``abs(isize)`` for every properly paired
    first-in-pair read. Values that are not below ``median + 10 * median``
    of the raw distances are dropped as outliers before summarising.

    MAD is the Median Absolute Deviation:
    http://en.wikipedia.org/wiki/Median_absolute_deviation

    Returns a dict with the keys ``mean``, ``std``, ``median`` and ``mad``
    computed over the outlier-filtered distances.
    """
    with closing(pysam.Samfile(in_bam, "rb")) as in_pysam:
        dists = [abs(read.isize) for read in in_pysam
                 if read.is_proper_pair and read.is_read1]
    # remove outliers
    med = Stats(dists).median()
    # Build a real list: the filtered values are consumed several times
    # below, and a lazy ``filter`` object (Python 3) could only be
    # iterated once, leaving later statistics with no data.
    filter_dists = [x for x in dists if x < med + 10 * med]
    filter_stats = Stats(filter_dists)
    median = filter_stats.median()
    return {"mean": filter_stats.mean(),
            "std": filter_stats.standard_deviation(),
            "median": median,
            "mad": Stats([abs(x - median) for x in filter_dists]).median()}
def _estimate_paired_innerdist(fastq_file, pair_file, ref_file, out_base,
                               out_dir, config):
    """Use Bowtie to estimate the inner distance of paired reads.

    A first attempt starts at read 1000000; when that yields no distances
    a second attempt is made starting at read 1.

    Returns a ``(mean, standard_deviation)`` tuple rounded to ints.
    """
    # Large inputs: skip the initial reads before sampling.
    dists = _bowtie_for_innerdist("1000000", fastq_file, pair_file,
                                  ref_file, out_base, out_dir, config)
    if len(dists) == 0:
        # Nothing came back, so retry from the first read (smaller inputs).
        dists = _bowtie_for_innerdist("1", fastq_file, pair_file,
                                      ref_file, out_base, out_dir, config,
                                      True)
    summary = Stats(dists)
    mean_dist = int(round(summary.mean()))
    std_dist = int(round(summary.standard_deviation()))
    return mean_dist, std_dist
def descriptive_stats(xs):
    """Format a min/percentile/max summary of ``xs`` as a multi-line string.

    Inputs with fewer than two items are returned unchanged. Otherwise the
    result has one line each for min, the 5th, 25th, 50th (median), 75th,
    95th and 99th percentiles, and max; each line is prefixed with a space
    and the lines are joined with newlines.
    """
    if len(xs) < 2:
        return xs
    calc = Stats(xs)
    # (format string, percentile) pairs, in output order.
    percentile_rows = (
        ("5%%: %s", 5),
        ("25%%: %s", 25),
        ("median: %s", 50),
        ("75%%: %s", 75),
        ("95%%: %s", 95),
        ("99%%: %s", 99),
    )
    rows = ["min: %s" % min(xs)]
    for fmt, pct in percentile_rows:
        rows.append(fmt % calc.percentile(pct))
    rows.append("max: %s" % max(xs))
    return "\n".join(" " + row for row in rows)