def compress(samples, run_parallel):
    """Perform compression of output files for long term storage.

    The first item of every entry in `samples` is inspected. Entries whose
    archive setting contains `cram` or `cram-lossless` are handed to the
    `archive_to_cram` parallel step; the rest are passed through untouched.

    Returns the untouched entries followed by the output of the parallel run,
    each entry wrapped in its own single-item list.
    """
    needs_cram = []
    passthrough = []
    first_items = [sample[0] for sample in samples]
    for data in first_items:
        wants_cram = ("cram" in dd.get_archive(data)
                      or "cram-lossless" in dd.get_archive(data))
        target = needs_cram if wants_cram else passthrough
        target.append([data])
    crammed = run_parallel("archive_to_cram", needs_cram)
    return passthrough + crammed
def compress(samples, run_parallel):
    """Perform compression of output files for long term storage.

    Splits the incoming samples into those configured for CRAM archiving
    (`cram` or `cram-lossless` in the archive setting) and those that are
    already finished, runs `archive_to_cram` on the former through
    `run_parallel`, and returns finished samples followed by the archived ones.
    """
    # Keyed by "should this sample be converted to CRAM?"
    buckets = {True: [], False: []}
    for data in [entry[0] for entry in samples]:
        to_archive = ("cram" in dd.get_archive(data)
                      or "cram-lossless" in dd.get_archive(data))
        buckets[to_archive].append([data])
    crammed = run_parallel("archive_to_cram", buckets[True])
    return buckets[False] + crammed
def compress(in_bam, data):
    """Compress a BAM file to CRAM, providing indexed CRAM file.

    Does 8 bin compression of quality score and read name removal
    using Staden io_lib if `cram` specified:

    https://github.com/jkbonfield/io_lib

    Otherwise does `cram-lossless` which only converts to CRAM. If the
    quality-binning run fails, the conversion is retried without binning.

    in_bam -- path to the BAM file to convert
    data -- sample dictionary; must carry a `config` entry and the settings
            read through `dd` (work directory, cores, reference, archive)

    Returns the path to the CRAM file, written to the `archive` directory
    under the sample's work directory. An existing output file is reused.
    """
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "archive"))
    out_file = os.path.join(out_dir, "%s.cram" % os.path.splitext(os.path.basename(in_bam))[0])
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            compress_type = dd.get_archive(data)
            scramble = config_utils.get_program("scramble", data["config"])
            options = [scramble, "-I", "bam", "-O", "cram", "-9", "-X", "archive",
                       "-r", ref_file, "-V", "3.0", "-t", cores]
            io_args = [in_bam, tx_out_file]
            compressed = False
            if "cram" in compress_type:
                try:
                    # Build a separate command for the lossy run so the binning
                    # flags never leak into the lossless fallback below. Flags
                    # go ahead of the positional input/output arguments.
                    do.run(options + ["-B", "-n"] + io_args,
                           "Compress BAM to CRAM: quality score binning")
                    compressed = True
                # Deliberate best-effort: fall through to a lossless retry
                except subprocess.CalledProcessError:
                    pass
            if not compressed:
                do.run(options + io_args, "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns a dictionary mapping each checkpoint name (`vc`, `sv`, `jointvc`,
    `hla`, `align`, `align_split`, `archive`, `umi`, `ensemble`, `cancer`) to
    a boolean that is True when at least one sample triggers that step.
    """
    def enabled(check):
        # Evaluate the check for every sample before reducing to a boolean.
        return any([check(d) for d in samples])

    def splits_alignment(d):
        # Splitting applies when a split size is not disabled and an aligner is set.
        return dd.get_align_split_size(d) is not False and dd.get_aligner(d)

    return {
        "vc": enabled(lambda d: dd.get_variantcaller(d) or d.get("vrn_file")),
        "sv": enabled(lambda d: dd.get_svcaller(d)),
        "jointvc": enabled(lambda d: dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d)),
        "hla": enabled(lambda d: dd.get_hlacaller(d)),
        "align": enabled(lambda d: dd.get_aligner(d) or dd.get_bam_clean(d)),
        "align_split": enabled(splits_alignment),
        "archive": enabled(lambda d: dd.get_archive(d)),
        "umi": enabled(lambda d: dd.get_umi_consensus(d)),
        "ensemble": enabled(lambda d: dd.get_ensemble(d)),
        "cancer": any(dd.get_phenotype(d) in ("tumor",) for d in samples),
    }
def compress(in_bam, data):
    """Compress a BAM file to CRAM, providing indexed CRAM file.

    When `cram` is in the archive setting and the bamUtil `bam` program is
    available, quality scores are binned with `bam squeeze` before conversion:

    http://genome.sph.umich.edu/wiki/BamUtil:_squeeze

    Otherwise (`cram-lossless`, no `bam` program, or a failed squeeze run) the
    BAM is converted straight to CRAM with samtools. Both paths pass `-x BD`
    and `-x BI` to `samtools view`.

    Returns the CRAM path, which sits next to `in_bam` with a `.cram`
    extension. An existing output file is not regenerated.
    """
    out_file = os.path.splitext(in_bam)[0] + ".cram"
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            compress_type = dd.get_archive(data)
            samtools = config_utils.get_program("samtools", data["config"])
            try:
                bam_cmd = config_utils.get_program("bam", data["config"])
            except config_utils.CmdNotFound:
                bam_cmd = None
            to_cram = ("{samtools} view -T {ref_file} -@ {cores} "
                       "-C -x BD -x BI -o {tx_out_file}")
            # Explicit values for every placeholder used by the templates.
            params = {"samtools": samtools, "ref_file": ref_file, "cores": cores,
                      "tx_out_file": tx_out_file, "bam_cmd": bam_cmd, "in_bam": in_bam}
            use_squeeze = "cram" in compress_type and bam_cmd
            if use_squeeze:
                squeeze_tmpl = ("{bam_cmd} squeeze --in {in_bam} --out -.ubam --keepDups "
                                "--binQualS=2,10,20,25,30,35,70 --binMid | " + to_cram)
                try:
                    do.run(squeeze_tmpl.format(**params),
                           "Compress BAM to CRAM: quality score binning")
                # Retry failures avoiding using bam squeeze which can cause issues
                except subprocess.CalledProcessError:
                    use_squeeze = False
            if not use_squeeze:
                lossless_tmpl = to_cram + " {in_bam}"
                do.run(lossless_tmpl.format(**params), "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file
def compress(in_bam, data):
    """Compress a BAM file to CRAM, providing indexed CRAM file.

    With `cram` in the archive setting and the bamUtil `bam` program
    available, quality scores are binned through `bam squeeze` and piped into
    samtools for CRAM conversion:

    http://genome.sph.umich.edu/wiki/BamUtil:_squeeze

    In every other case, including a failed squeeze run, samtools converts the
    BAM to CRAM directly (`cram-lossless`).

    The CRAM file is written to the `archive` directory under the sample's
    work directory and its path is returned. An existing file is reused.
    """
    archive_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "archive"))
    base_name = os.path.splitext(os.path.basename(in_bam))[0]
    out_file = os.path.join(archive_dir, base_name + ".cram")
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)

    # Nothing to convert when the CRAM is already in place; just index it.
    if utils.file_exists(out_file):
        index(out_file, data["config"])
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        compress_type = dd.get_archive(data)
        samtools = config_utils.get_program("samtools", data["config"])
        try:
            bam_cmd = config_utils.get_program("bam", data["config"])
        except config_utils.CmdNotFound:
            bam_cmd = None
        to_cram = ("{samtools} view -T {ref_file} -@ {cores} "
                   "-C -x BD -x BI -o {tx_out_file}")
        fill = dict(samtools=samtools, ref_file=ref_file, cores=cores,
                    tx_out_file=tx_out_file, bam_cmd=bam_cmd, in_bam=in_bam)
        binned = False
        if "cram" in compress_type and bam_cmd:
            try:
                squeeze = ("{bam_cmd} squeeze --in {in_bam} --out -.ubam --keepDups "
                           "--binQualS=2,10,20,25,30,35,70 --binMid | " + to_cram)
                do.run(squeeze.format(**fill), "Compress BAM to CRAM: quality score binning")
            # Retry failures avoiding using bam squeeze which can cause issues
            except subprocess.CalledProcessError:
                pass
            else:
                binned = True
        if not binned:
            do.run((to_cram + " {in_bam}").format(**fill), "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file