def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align WGBS reads with Bismark (bowtie2 mode) and register the BAM.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq for paired-end input, or a falsy value.
        ref_file: Bismark index location; when falsy an error is logged and
            the process exits with status 1.
        names: not used by this function.
        align_dir: directory receiving ``<sample>.bam`` and the
            ``<lane>_bismark`` output directory.
        data: sample dictionary; must carry ``analysis`` and ``config``.

    Returns:
        ``data`` with the work BAM set to ``<align_dir>/<sample>.bam`` and
        ``data["bam_report"]`` pointing at the first ``*report.txt`` found
        in the Bismark output directory.

    Raises:
        AssertionError: if ``data["analysis"]`` does not start with
            ``wgbs-seq`` (case-insensitive).
        IndexError: if no Bismark BAM or report file can be found.
    """
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        # Final BAM already present: only re-attach outputs to the sample.
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    by_cores = max(int(max_cores / 5), 1)
    by_memory = max(int(max_mem / config_utils.convert_to_bytes("12G")), 1)
    n = min(by_cores, by_memory)

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""
    other_opts = " ".join([str(x) for x in resources.get("options", [])]).strip()

    # Plain space-separated inputs for post-processing; the bismark command
    # line itself uses the -1/-2 form for pairs (built below).
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}")
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)

    bam_glob = out_dir + "/*bismark*bt2*bam"
    raw_bam = glob.glob(bam_glob)
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(bismark=bismark, other_opts=other_opts,
                              directional=directional, tx_out_dir=tx_out_dir,
                              n=n, ref_file=ref_file, fastq_file=fastq_file),
                   run_message, None)
            # Move the whole temporary output directory into place.
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(bam_glob)

    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    Args:
        cram_file: input CRAM, passed through ``objectstore.cl_input``.
        work_dir: directory that receives the fastq outputs.
        base_name: prefix for the output file names.
        region: region string (``:`` and ``-`` become ``_`` in file names),
            or a falsy value to convert the whole file.
        data: sample dictionary supplying the reference fasta, the
            ``bamtofastq`` resources and ``num_cores``.

    Returns:
        A one-element list holding ``[pair1, pair2, singles]`` output paths.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    # Parenthesized for clarity: the suffix is "-<region>" or the literal "full".
    rext = ("-%s" % region.replace(":", "_").replace("-", "_")) if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [
        os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
        for fext in ["s1", "p1", "p2", "o1", "o2"]
    ]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
                (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # The fallback must be parenthesized: "%" binds tighter than the
            # conditional expression, which previously produced an empty
            # description whenever no region was given.
            descr = ("CRAM to fastq %s" % (region if region else "")).strip()
            do.run(cmd.format(cram_file=cram_file, sortprefix=sortprefix,
                              max_mem=max_mem, tx_out_p1=tx_out_p1,
                              tx_out_p2=tx_out_p2, tx_out_s=tx_out_s,
                              tx_out_o1=tx_out_o1, tx_out_o2=tx_out_o2,
                              ref_file=ref_file, region=region),
                   descr)
    return [[out_p1, out_p2, out_s]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align WGBS reads with Bismark (bowtie2 mode) and register the raw BAM.

    The Bismark BAM is symlinked to ``<align_dir>/<sample>.bam`` without
    further processing, and the Bismark report is recorded both under
    ``data["bam_report"]`` and in the sample's summary QC.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq for paired-end input, or a falsy value.
        ref_file: Bismark index location; when falsy an error is logged and
            the process exits with status 1.
        names: not used by this function.
        align_dir: directory receiving the final BAM and the
            ``<lane>_bismark`` output directory.
        data: sample dictionary; must carry ``analysis`` and ``config``.

    Returns:
        The updated ``data`` dictionary.

    Raises:
        AssertionError: if ``data["analysis"]`` does not start with
            ``wgbs-seq`` (case-insensitive).
        IndexError: if no Bismark BAM or report file can be found.
    """
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))

    def _register_outputs(sample_data):
        # Shared by the cached and freshly-aligned paths.
        sample_data = dd.set_work_bam(sample_data, final_out)
        sample_data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return dd.update_summary_qc(sample_data, "bismark", base=sample_data["bam_report"])

    if file_exists(final_out):
        return _register_outputs(data)

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    # Configured memory converted from bytes by dividing by 1024 * 1024.
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""
    other_opts = " ".join([str(x) for x in resources.get("options", [])]).strip()

    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} "
           "--unmapped {ref_file} {fastq_file} ")
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)

    bam_glob = out_dir + "/*bismark*bt2*bam"
    raw_bam = glob.glob(bam_glob)
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(bismark=bismark, other_opts=other_opts,
                              directional=directional, tx_out_dir=tx_out_dir,
                              instances=instances, bowtie_threads=bowtie_threads,
                              ref_file=ref_file, fastq_file=fastq_file),
                   run_message, None)
            # Move the whole temporary output directory into place.
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(bam_glob)

    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    return _register_outputs(data)
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Args:
        bam_file: input BAM; its basename (minus extension) names the outputs.
        dirs: directory mapping; outputs go under ``dirs["work"]/align_prep``.
        config: configuration dictionary (programs, resources, algorithm).
        is_retry: True on the second attempt after a "deflate failed" error;
            forces regeneration and is forwarded to ``tools.get_bgzip_cmd``.
        output_infix: text inserted between the BAM base name and the
            ``-1.fq.gz`` / ``-2.fq.gz`` suffix.

    Returns:
        List of the fastq outputs that exist (one entry for single-end input,
        two for paired-end).

    Raises:
        subprocess.CalledProcessError: if bamtofastq fails for any reason
            other than a first-attempt "deflate failed" error.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    bam_base = os.path.splitext(os.path.basename(bam_file))[0]
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (bam_base, output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        is_paired = bam.is_paired(bam_file)
        if not is_paired:
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from any earlier (possibly failed) attempt.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            fq2_bgzip_cmd = None
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if is_paired:
                # The second read file is written directly to its final path,
                # outside the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            # NOTE(review): bam_file is rebound to the command-line form here and
            # that value is also what a retry receives -- confirm cl_input is
            # safe to apply twice.
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(bamtofastq=bamtofastq, bam_file=bam_file,
                                  sortprefix=sortprefix, fq1_bgzip_cmd=fq1_bgzip_cmd,
                                  fq2_bgzip_cmd=fq2_bgzip_cmd, max_mem=max_mem),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        # Forward output_infix so the retry writes (and returns) the same file
        # names the caller requested; it was previously dropped.
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True,
                               output_infix=output_infix)
    return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Each read stream is piped through the command returned by
    ``_seqtk_fastq_prep_cl`` (when it returns one) before compression.

    Args:
        bam_file: input BAM; its basename (minus extension) names the outputs.
        dirs: directory mapping; outputs go under ``dirs["work"]/align_prep``.
        data: sample dictionary; ``data["config"]`` supplies programs,
            resources and algorithm settings.
        is_retry: True on the second attempt after a "deflate failed" error;
            forces regeneration and is forwarded to ``tools.get_bgzip_cmd``.
        output_infix: text inserted between the BAM base name and the
            ``-1.fq.gz`` / ``-2.fq.gz`` suffix.

    Returns:
        List of the fastq outputs that exist (one entry for single-end input,
        two for paired-end).

    Raises:
        subprocess.CalledProcessError: if bamtofastq fails for any reason
            other than a first-attempt "deflate failed" error.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    bam_base = os.path.splitext(os.path.basename(bam_file))[0]
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (bam_base, output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        is_paired = bam.is_paired(bam_file)
        if not is_paired:
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from any earlier (possibly failed) attempt.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            fq2_bgzip_cmd = None
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if is_paired:
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                # The second read file is written directly to its final path,
                # outside the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            # NOTE(review): bam_file is rebound to the command-line form here and
            # that value is also what a retry receives -- confirm cl_input is
            # safe to apply twice.
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(bamtofastq=bamtofastq, bam_file=bam_file,
                                  sortprefix=sortprefix, extra_opts=extra_opts,
                                  fq1_bgzip_cmd=fq1_bgzip_cmd,
                                  fq2_bgzip_cmd=fq2_bgzip_cmd, max_mem=max_mem),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        # Forward output_infix so the retry writes (and returns) the same file
        # names the caller requested; it was previously dropped.
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True,
                               output_infix=output_infix)
    return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    Args:
        cram_file: input CRAM, passed through ``objectstore.cl_input``.
        work_dir: directory that receives the fastq outputs.
        base_name: prefix for the output file names.
        region: region string (``:`` and ``-`` become ``_`` in file names),
            or a falsy value to convert the whole file.
        data: sample dictionary supplying the reference fasta, the
            ``bamtofastq`` resources and ``num_cores``.

    Returns:
        A one-element list holding ``[pair1, pair2, singles]`` output paths.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    # File-name suffix: "-<region>" with separators replaced, or "full".
    if region:
        rext = "-%s" % region.replace(":", "_").replace("-", "_")
    else:
        rext = "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [
        os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
        for fext in ["s1", "p1", "p2", "o1", "o2"]
    ]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
                (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # Previously `"..." % region if region else ""` evaluated to an
            # empty description for whole-file runs, because "%" binds tighter
            # than the conditional expression.
            if region:
                descr = "CRAM to fastq %s" % region
            else:
                descr = "CRAM to fastq"
            do.run(cmd.format(cram_file=cram_file, sortprefix=sortprefix,
                              max_mem=max_mem, tx_out_p1=tx_out_p1,
                              tx_out_p2=tx_out_p2, tx_out_s=tx_out_s,
                              tx_out_o1=tx_out_o1, tx_out_o2=tx_out_o2,
                              ref_file=ref_file, region=region),
                   descr)
    return [[out_p1, out_p2, out_s]]