def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign an input BAM with bwa mem through a single shell pipeline.

    Stages are connected with pipes instead of intermediate files:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - BAM preparation using the command supplied by postalign.tobam_cl

    The pipeline only runs when the output file is not already present.
    Returns the path ``<align_dir>/<lane>-sort.bam``.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since it is used for both input and output
    max_mem = config_utils.adjust_memory(sam_resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    stages = [
        "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {sort_prefix}",
        "{bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout",
        "{bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} -",
    ]
    with utils.curdir_tmpdir():
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            # temporary prefix for the queryname sort, placed next to the transactional output
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            upstream = " | ".join(stages).format(
                samtools=samtools, bedtools=bedtools, bwa=bwa,
                num_cores=num_cores, max_mem=max_mem, in_bam=in_bam,
                sort_prefix=sort_prefix, rg_info=rg_info, ref_file=ref_file)
            # tobam_cl is appended verbatim; it is not run through format()
            cmd = upstream + " | " + tobam_cl
            checks = [do.file_nonempty(tx_out_file),
                      do.file_reasonable_size(tx_out_file, in_bam)]
            do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None, checks)
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run a non-piped BWA alignment (aln followed by samse/sampe) to a SAM file.

    One ``.sai`` file is produced per input read file (skipping any that already
    exist), then ``bwa samse`` (single end) or ``bwa sampe`` (paired) writes
    ``<align_dir>/<lane>.sam``. Nothing is run when the SAM file already exists.

    Split alignments are not handled here; an assertion fails when
    ``data["align_split"]`` is set. Returns the SAM file path.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    config = data["config"]
    lane = names["lane"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % lane)
    sai2_file = None
    if pair_file:
        sai2_file = os.path.join(align_dir, "%s_2.sai" % lane)
    sam_file = os.path.join(align_dir, "%s.sam" % lane)
    if utils.file_exists(sam_file):
        return sam_file

    # (reads, sai output) pairs that need a bwa aln pass
    aln_jobs = [(fastq_file, sai1_file)]
    if sai2_file:
        aln_jobs.append((pair_file, sai2_file))
    for reads, sai_file in aln_jobs:
        if not utils.file_exists(sai_file):
            with file_transaction(sai_file) as tx_sai_file:
                _run_bwa_align(reads, ref_file, tx_sai_file, config)

    if sai2_file:
        align_type = "sampe"
        sai_args = [sai1_file, sai2_file]
        read_args = [fastq_file, pair_file]
    else:
        align_type = "samse"
        sai_args = [sai1_file]
        read_args = [fastq_file]
    rg_info = novoalign.get_rg_info(names)
    bwa = config_utils.get_program("bwa", config)
    sam_cl = [bwa, align_type, "-r", "'%s'" % rg_info, ref_file] + sai_args + read_args
    with file_transaction(sam_file) as tx_sam_file:
        cmd = "%s > %s" % (" ".join(sam_cl), tx_sam_file)
        do.run(cmd, "bwa %s" % align_type, None)
    return sam_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP, piping its SAM output into BAM preparation.

    SNAP writes SAM to stdout, which is piped into the command supplied by
    postalign.tobam_cl. Input is treated as paired when a BAM input reports
    paired reads (bam.is_paired) or, for other inputs, when a pair file is
    given. Split alignments are rejected with an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.

    Returns ``data`` with ``data["work_bam"]`` set to
    ``<align_dir>/<lane>-sort.bam``.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap = config_utils.get_program("snap", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # looked up as in the original flow; the value is not used by the command
    resources = config_utils.get_resources("snap", config)
    rg_info = novoalign.get_rg_info(names)
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        # for non-BAM input the (possibly empty) pair file path doubles as the flag
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                cmd_name = "paired"
            else:
                cmd_name = "single"
            snap_cl = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-R '{rg_info}' -t {num_cores} -M -o -sam - | ").format(
                           snap=snap, cmd_name=cmd_name, index_dir=index_dir,
                           fastq_file=fastq_file, pair_file=pair_file,
                           rg_info=rg_info, num_cores=num_cores)
            do.run(snap_cl + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP, writing the output file directly.

    SNAP is invoked with ``-o`` pointing at a transactional output path, using
    the ``-so`` and ``-sm`` options plus the configured memory value. The
    ``paired``/``single`` subcommand is picked from bam.is_paired for BAM
    input, otherwise from whether a pair file was supplied. Split alignments
    are rejected with an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.

    Returns ``data`` with ``data["work_bam"]`` set to
    ``<align_dir>/<lane>-sort.bam``.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap = config_utils.get_program("snap", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    snap_resources = config_utils.get_resources("snap", config)
    max_mem = snap_resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data):
                if fastq_file.endswith(".bam"):
                    is_paired = bam.is_paired(fastq_file)
                else:
                    is_paired = bool(pair_file)
                cmd_name = "paired" if is_paired else "single"
                template = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                            "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                snap_cl = template.format(
                    snap=snap, cmd_name=cmd_name, index_dir=index_dir,
                    fastq_file=fastq_file, pair_file=pair_file, rg_info=rg_info,
                    num_cores=num_cores, max_mem=max_mem, tx_out_file=tx_out_file)
                do.run(snap_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with minimap2 (``sr`` preset), piping into BAM preparation.

    The output is named after the sample, with a ``-cumi`` suffix when
    ``"umi_bam"`` is present in ``data``. For split alignments the output and
    inputs are replaced by the values from alignprep.setup_combine and
    alignprep.split_namedpipe_cls. When ``index_dir`` points at a file, a
    ``<build>-sr.mmi`` index in the same directory is used if it exists;
    otherwise the reference file is passed to minimap2.

    Returns ``data`` with ``data["work_bam"]`` set to the output path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(
        align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # If a single index is present, index_dir points to that file
            index_file = None
            if index_dir and os.path.isfile(index_dir):
                mmi_name = "%s-%s.mmi" % (dd.get_genome_build(data), preset)
                index_file = os.path.join(os.path.dirname(index_dir), mmi_name)
            if not index_file or not os.path.exists(index_file):
                # fall back to aligning against the reference file itself
                index_file = dd.get_ref_file(data)
            minimap_cl = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                          "{fastq_file} {pair_file} | ").format(
                              preset=preset, rg_info=rg_info, num_cores=num_cores,
                              index_file=index_file, fastq_file=fastq_file,
                              pair_file=pair_file)
            do.run(minimap_cl + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def fixrg(in_bam, names, ref_file, dirs, data):
    """Replace the read group of a BAM file using samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. To work around this, a header without ``@RG`` lines is written
    first and applied with ``samtools reheader`` before the new read group is
    added.

    Output goes to ``<work>/bamclean/<sample>/<input base>-fixrg.bam`` and is
    only regenerated when utils.file_uptodate reports it is stale relative to
    ``in_bam``. ``ref_file`` is accepted but not used. Returns the output path.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        rg_info = novoalign.get_rg_info(names)
        # header file lives next to the final output, not the transactional one
        new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
        header_cmd = "samtools view -H %s | grep -v ^@RG > %s" % (in_bam, new_header)
        do.run(header_cmd, "Create empty RG header: %s" % dd.get_sample_name(data))
        fix_cmd = ("samtools reheader {header} {bam_in} | "
                   "samtools addreplacerg -r '{rg}' -m overwrite_all -O bam -o {bam_out} -"
                   ).format(header=new_header, bam_in=in_bam, rg=rg_info, bam_out=tx_out_file)
        do.run(fix_cmd, "Fix read groups: %s" % dd.get_sample_name(data))
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa, producing a sorted output BAM.

    For split alignments, the output and inputs are replaced by the values from
    alignprep.setup_combine and alignprep.split_namedpipe_cl. Otherwise, inputs
    with ``quality_format`` set to ``illumina`` are wrapped with
    alignprep.fastq_convert_pipe_cl. Alignment uses ``_align_mem`` when
    ``_can_use_mem`` allows it and ``_align_backtrack`` otherwise, and is
    skipped when the output (or the final combined file) already exists.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        if _can_use_mem(fastq_file, data):
            aligner = _align_mem
        else:
            aligner = _align_backtrack
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file
    return data
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.

    The input BAM is linked into the transaction directory and indexed there
    before running the samtools pipeline. The output is named after the input
    BAM when that file already exists, and after the sample name otherwise.
    Returns the output path.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    stages = [
        "samtools view -@ {cores} -h {local_bam} {str_chroms}",
        """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
        """cleanbam.fix_header("{ref_file}")'""",
        "samtools view -@ {cores} -u -",
        # trailing space is part of the original command text
        "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all "
        "-O bam -o {tx_out_file} - ",
    ]
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = _target_chroms_and_header(in_bam, data)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        ref_file = dd.get_ref_file(data)
        local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
        cores = dd.get_cores(data)
        utils.symlink_plus(in_bam, local_bam)
        bam.index(local_bam, data["config"])
        cmd = " | ".join(stages).format(
            cores=cores, local_bam=local_bam, str_chroms=" ".join(target_chroms),
            bcbio_py=sys.executable, ref_file=ref_file, rg_info=rg_info,
            tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem through a single shell pipeline.

    Stages are connected with pipes instead of intermediate files:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - samtools conversion to BAM
    - samtools sort to coordinate order

    ``_check_samtools_version`` is called before running. Nothing is run when
    the output already exists. Returns ``<align_dir>/<lane>-sort.bam``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = sam_resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    _check_samtools_version()
    stages = [
        "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {in_prefix}",
        "{bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout",
        "{bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} -",
        "{samtools} view -b -S -u -",
        "{samtools} sort -@ {num_cores} -m {max_mem} - {out_prefix}",
    ]
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            # the final sort is handed the transactional path without its extension
            out_prefix = os.path.splitext(tx_out_file)[0]
            cmd = " | ".join(stages).format(
                samtools=samtools, bedtools=bedtools, bwa=bwa,
                num_cores=num_cores, max_mem=max_mem, in_bam=in_bam,
                in_prefix="%s-in1" % out_prefix, rg_info=rg_info,
                ref_file=ref_file, out_prefix=out_prefix)
            do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Run SNAP on fastq or BAM input, writing the output file directly.

    The SNAP subcommand is ``paired`` or ``single``: decided by bam.is_paired
    when the input ends in ``.bam``, otherwise by whether a pair file was
    supplied. Output is written through a file transaction to
    ``<align_dir>/<lane>-sort.bam``; the run is skipped if that file exists.
    Split alignments are rejected with an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.

    Returns ``data`` with ``data["work_bam"]`` pointing at the output file.
    """
    if not pair_file:
        pair_file = ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = config_utils.get_resources("snap", data["config"]).get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        data["work_bam"] = out_file
        return data

    with file_transaction(out_file) as tx_out_file:
        with utils.curdir_tmpdir(data):
            if fastq_file.endswith(".bam"):
                paired = bam.is_paired(fastq_file)
            else:
                paired = bool(pair_file)
            snap_args = {
                "snap": snap,
                "cmd_name": "paired" if paired else "single",
                "index_dir": index_dir,
                "fastq_file": fastq_file,
                "pair_file": pair_file,
                "rg_info": rg_info,
                "num_cores": num_cores,
                "max_mem": max_mem,
                "tx_out_file": tx_out_file,
            }
            cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                   "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
            do.run(cmd.format(**snap_args), "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa, producing a sorted output BAM.

    For split alignments, the output and inputs are replaced by the values from
    alignprep.setup_combine and alignprep.split_namedpipe_cl. Otherwise, inputs
    with ``quality_format`` set to ``illumina`` are wrapped with
    alignprep.fastq_convert_pipe_cl. ``_align_backtrack`` is used when
    ``bwa-mem`` is listed in the ``tools_off`` configuration or
    ``_can_use_mem`` rejects the input; ``_align_mem`` is used otherwise.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        tools_off = tz.get_in(["config", "algorithm", "tools_off"], data, [])
        use_backtrack = "bwa-mem" in tools_off or not _can_use_mem(fastq_file, data)
        aligner = _align_backtrack if use_backtrack else _align_mem
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file
    return data
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem through a single shell pipeline.

    Stages are connected with pipes instead of intermediate files:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - samtools conversion to BAM
    - samtools sort to coordinate order

    novoalign.check_samtools_version is called before running. Nothing is run
    when the output already exists. Returns ``<align_dir>/<lane>-sort.bam``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    programs = {}
    for prog in ("samtools", "bedtools", "bwa"):
        programs[prog] = config_utils.get_program(prog, config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since it is used for both input and output
    max_mem = config_utils.adjust_memory(sam_resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    novoalign.check_samtools_version(config)
    template = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                "| {samtools} view -b -S -u - "
                "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            tx_out_prefix = os.path.splitext(tx_out_file)[0]
            cmd = template.format(
                num_cores=num_cores, max_mem=max_mem, in_bam=in_bam,
                prefix1="%s-in1" % tx_out_prefix, rg_info=rg_info,
                ref_file=ref_file, tx_out_prefix=tx_out_prefix, **programs)
            do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Align fastq inputs with bwa mem, piping into samtools view and sort.

    The pipeline writes ``<align_dir>/<lane>-sort.bam`` through a file
    transaction and is skipped when that file already exists.
    novoalign.check_samtools_version is called before running.
    Returns the output BAM path.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since it is used alongside alignment
    max_mem = config_utils.adjust_memory(sam_resources.get("memory", "2G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    novoalign.check_samtools_version(config)
    stages = [
        "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} {fastq_file} {pair_file}",
        "{samtools} view -b -S -u -",
        "{samtools} sort -@ {num_cores} -m {max_mem} - {out_prefix}",
    ]
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            cmd = " | ".join(stages).format(
                bwa=bwa, samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                rg_info=rg_info, ref_file=ref_file, fastq_file=fastq_file,
                pair_file=pair_file, out_prefix=os.path.splitext(tx_out_file)[0])
            do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa, producing a sorted output BAM.

    The output is named after the lane when such a file already exists
    (older naming) and after the sample name otherwise. Split alignments and
    ``.sdf`` inputs go through alignprep.setup_combine and
    alignprep.split_namedpipe_cls; for ``.sdf`` input the minimum read size
    comes from rtg.min_read_size. Other inputs with ``quality_format`` set to
    ``illumina`` are wrapped with alignprep.fastq_convert_pipe_cl.

    ``_align_backtrack`` is used only when ``bwa-mem`` is not in tools_on and
    either ``bwa-mem`` is in tools_off or ``_can_use_mem`` rejects the input;
    ``_align_mem`` is used otherwise.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM.
    """
    pair_file = pair_file or ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    is_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data)
                         and ("bwa-mem" in dd.get_tools_off(data)
                              or not _can_use_mem(fastq_file, data, min_size)))
        aligner = _align_backtrack if use_backtrack else _align_mem
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file
    return data
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the piped bwa mem command line, with optional alt-aware post-processing.

    When ``<ref_file>.alt`` exists (alternative alleles in GRCh38 + HLA typing),
    the bwa output is additionally piped through bwakit's ``bwa-postalt.js``,
    writing HLA files under an ``hla`` directory next to ``out_file``.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz

    Returns the command as a single string.
    """
    alt_file = ref_file + ".alt"
    alt_part = ""
    if utils.file_exists(alt_file):
        # k8 and bwa-postalt.js are expected alongside the resolved run-bwamem script
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        alt_part = " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}".format(
            bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    bwa = config_utils.get_program("bwa", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # -p is passed only when no second fastq is supplied
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_part = ("{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
                "{ref_file} {fastq1} {fastq2} ").format(
                    bwa=bwa, pairing=pairing, mem_usage=mem_usage, num_cores=num_cores,
                    bwa_params=bwa_params, rg_info=rg_info, ref_file=ref_file,
                    fastq1=fastq1, fastq2=fastq2)
    return bwa_part + alt_part
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    Target contigs are the reference contigs accepted by
    chromhacks.is_autosomal_or_sex. The output is written to
    ``<work>/bamclean/<sample>/<input base>-noextras.bam`` and is only rebuilt
    when utils.file_uptodate reports it stale. Returns the output path.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    template = ("samtools view -h {in_bam} {str_chroms} | "
                """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                """cleanbam.fix_header("{comma_chroms}")' | """
                "samtools view -u - | "
                "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                target_chroms.append(contig.name)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        cmd = template.format(
            in_bam=in_bam,
            # samtools view takes regions space separated; fix_header gets them comma separated
            str_chroms=" ".join(target_chroms),
            comma_chroms=",".join(target_chroms),
            bcbio_py=sys.executable, rg_info=rg_info, tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with minimap2 (``sr`` preset), piping into BAM preparation.

    The output is named after the sample, with a ``-cumi`` suffix when
    ``"umi_bam"`` is present in ``data``. For split alignments the output and
    inputs are replaced by the values from alignprep.setup_combine and
    alignprep.split_namedpipe_cls. minimap2 is always given the reference
    file; ``index_dir`` is accepted but not used.

    Returns ``data`` with ``data["work_bam"]`` set to the output path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(
        align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built .mmi indices are deliberately not used: they provide only
            # slight speed-ups and give inconsistent outputs in BAM headers, so
            # the reference file is always passed to minimap2.
            index_file = dd.get_ref_file(data)
            minimap_cl = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                          "{fastq_file} {pair_file} | ").format(
                              preset=preset, rg_info=rg_info, num_cores=num_cores,
                              index_file=index_file, fastq_file=fastq_file,
                              pair_file=pair_file)
            do.run(minimap_cl + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.

    The output is written to ``<work>/bamclean/<sample>/<input base>-noextras.bam``
    and is only rebuilt when utils.file_uptodate reports it stale.
    Returns the output path.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    stages = [
        "samtools view -h {in_bam} {str_chroms}",
        """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
        """cleanbam.fix_header("{ref_file}")'""",
        "samtools view -u -",
        # trailing space is part of the original command text
        "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ",
    ]
    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = _target_chroms_and_header(in_bam, data)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        ref_file = dd.get_ref_file(data)
        cmd = " | ".join(stages).format(
            in_bam=in_bam, str_chroms=" ".join(target_chroms),
            bcbio_py=sys.executable, ref_file=ref_file,
            rg_info=rg_info, tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem through a single shell pipeline.

    Stages are connected with pipes instead of intermediate files:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - samtools conversion to BAM
    - samtools sort to coordinate order

    The command is logged and then executed through the shell with
    subprocess.check_call, which raises on a non-zero exit status. Nothing is
    run when the output already exists. Returns ``<align_dir>/<lane>-sort.bam``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = sam_resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    _check_samtools_version()
    stages = [
        "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {in_prefix}",
        "{bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout",
        "{bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} -",
        "{samtools} view -b -S -u -",
        "{samtools} sort -@ {num_cores} -m {max_mem} - {out_prefix}",
    ]
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            out_prefix = os.path.splitext(tx_out_file)[0]
            # build the command text once and reuse it for logging and execution
            shell_cmd = " | ".join(stages).format(
                samtools=samtools, bedtools=bedtools, bwa=bwa,
                num_cores=num_cores, max_mem=max_mem, in_bam=in_bam,
                in_prefix="%s-in1" % out_prefix, rg_info=rg_info,
                ref_file=ref_file, out_prefix=out_prefix)
            logger.info(shell_cmd)
            subprocess.check_call(shell_cmd, shell=True)
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Do standard or paired end alignment with bowtie.

    Bowtie's SAM output is piped through ``samtools addreplacerg`` (read group
    built from ``names``) and then into the command supplied by
    postalign.tobam_cl. Up to 1000 hits per read are reported for analyses
    whose name starts with ``smallrna-seq``, otherwise 1.

    For split alignments the output and inputs come from
    alignprep.setup_combine and alignprep.split_namedpipe_cls; otherwise
    gzipped inputs are fed through ``gunzip -c`` process substitution.

    ``extra_args`` is an optional list of additional bowtie arguments.
    Returns the output BAM path; alignment is skipped when the output (or the
    final combined file) already exists.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # Pairedness must agree with the command built below, which uses the
        # truthiness of pair_file; an empty string is single end, not paired.
        is_paired = bool(pair_file)
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if is_paired:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(names)
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa, producing a sorted output BAM.

    Output naming: when the lane name differs from the sample name and a
    lane-named file already exists (older naming), it is reused; otherwise the
    sample name is used, with a ``-cumi`` suffix when ``"umi_bam"`` is in
    ``data``. Split alignments and ``.sdf`` inputs go through
    alignprep.setup_combine and alignprep.split_namedpipe_cls; other inputs
    with ``quality_format`` set to ``illumina`` are wrapped with
    alignprep.fastq_convert_pipe_cl.

    Aligner choice: ``_align_backtrack`` when ``bwa-mem`` is not in tools_on
    and is either in tools_off or rejected by ``_can_use_mem``. Otherwise
    ``_align_mem`` for precollapsed BAMs, when HLA is off, or when a separate
    HLA run is needed, and ``_align_mem_hla`` in the remaining case.

    Returns ``data`` with ``data["work_bam"]`` set, plus ``data["hla_bam"]``
    when ``needs_separate_hla`` is true.
    """
    pair_file = pair_file or ""
    # back compatible -- older files were named with lane information, use sample name now
    legacy_file = None
    if names["lane"] != dd.get_sample_name(data):
        legacy_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if legacy_file and utils.file_exists(legacy_file):
        out_file = legacy_file
    else:
        umi_ext = "-cumi" if "umi_bam" in data else ""
        out_file = os.path.join(
            align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    is_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data)
                         and ("bwa-mem" in dd.get_tools_off(data)
                              or not _can_use_mem(fastq_file, data, min_size)))
        if use_backtrack:
            aligner = _align_backtrack
        elif is_precollapsed_bam(data) or not hla_on(data) or needs_separate_hla(data):
            aligner = _align_mem
        else:
            aligner = _align_mem_hla
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file

    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069)
    if needs_separate_hla(data):
        hla_name = "HLA-" + os.path.basename(out_file)
        hla_target = os.path.join(os.path.dirname(out_file), hla_name)
        data["hla_bam"] = _align_mem_hla(fastq_file, pair_file, ref_file, hla_target,
                                         names, rg_info, data)
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa mem, piping into samtools view and sort.

    For split alignments, the output and inputs are replaced by the values from
    alignprep.setup_combine and alignprep.split_namedpipe_cl. Otherwise, inputs
    with ``quality_format`` set to ``illumina`` are wrapped with
    alignprep.fastq_convert_pipe_cl.

    When ``can_pipe`` rejects the input, the result of ``align`` is returned
    directly. Otherwise the piped command is run (unless the output or final
    combined file already exists) and ``data`` is returned with
    ``data["work_bam"]`` set to the output BAM.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    config = data["config"]
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    sam_resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since it is used alongside alignment
    max_mem = config_utils.adjust_memory(sam_resources.get("memory", "2G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
        stages = [
            "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} {fastq_file} {pair_file}",
            "{samtools} view -b -S -u -",
            "{samtools} sort -@ {num_cores} -m {max_mem} - {out_prefix}",
        ]
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                cmd = " | ".join(stages).format(
                    bwa=bwa, samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                    rg_info=rg_info, ref_file=ref_file, fastq_file=fastq_file,
                    pair_file=pair_file, out_prefix=os.path.splitext(tx_out_file)[0])
                checks = [do.file_nonempty(tx_out_file),
                          do.file_reasonable_size(tx_out_file, fastq_file)]
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"],
                       None, checks)
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Do standard or paired end alignment with bowtie.

    Bowtie's SAM output is piped through ``samtools addreplacerg`` (read group
    built from ``data["rgnames"]``) and then into the command supplied by
    postalign.tobam_cl. Up to 1000 hits per read are reported for analyses
    whose name starts with ``smallrna-seq``, otherwise 1.

    For split alignments the output and inputs come from
    alignprep.setup_combine and alignprep.split_namedpipe_cls; otherwise
    gzipped inputs are fed through ``gunzip -c`` process substitution.

    ``extra_args`` is an optional list of additional bowtie arguments.
    Returns the output BAM path; alignment is skipped when the output (or the
    final combined file) already exists.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # Pairedness must agree with the command built below, which uses the
        # truthiness of pair_file; an empty string is single end, not paired.
        is_paired = bool(pair_file)
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if is_paired:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group fix is always applied: the command string can never
            # be empty, so no alternative pipeline without it is needed.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    fastq_file -- first (or only) fastq input; must end in ``.gz`` unless a
        split alignment is requested
    pair_file -- second fastq of a pair; any falsy value means single end
    index_dir -- SNAP index directory passed to ``snap-aligner``
    names -- dictionary providing ``sample`` (log message) and whatever
        ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<sample>-sort.bam``
    data -- sample dictionary (``config``, ``align_split``)

    Returns ``data`` with ``work_bam`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # NOTE(review): the looked-up resources are not used below.
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the first two and the last character of each command;
        # presumably a "<(" ... ")" process substitution wrapper -- verify
        # against alignprep.split_namedpipe_cls.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            # fastq_file was already unwrapped above; slicing it a second
            # time would cut characters out of the command itself.
            stream_input = "{fastq_file}"
    else:
        # Without this assignment the existence check below raised a
        # NameError for every non-split run.
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = "zcat {fastq_file}"
    # Decide pairing from truthiness: pair_file is normalised to "" next, so
    # an "is not None" test would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(stream_input=stream_input, sub_cmd=sub_cmd,
                             index_dir=index_dir, input_cmd=input_cmd,
                             rg_info=rg_info, num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def fixrg(in_bam, names, ref_file, dirs, data):
    """Rewrite the read group of a BAM file with samtools addreplacerg.

    addreplacerg leaves the previous read group in the header, which confuses
    later checks. To avoid that, a header without any @RG lines is written
    first and applied with ``samtools reheader`` before the new read group is
    added.

    in_bam -- input BAM file
    names -- read group details handed to ``novoalign.get_rg_info``
    ref_file -- not used by this function
    dirs -- dictionary whose ``work`` entry is the base working directory
    data -- sample dictionary, used for the sample name and the transaction

    Returns the path of ``<input basename>-fixrg.bam`` inside
    ``<work>/bamclean/<sample>``; it is regenerated only when not up to date
    relative to ``in_bam``.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        rg_info = novoalign.get_rg_info(names)
        # The stripped header is written next to the final output file.
        new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
        header_cmd = "samtools view -H {in_bam} | grep -v ^@RG > {new_header}"
        do.run(header_cmd.format(in_bam=in_bam, new_header=new_header),
               "Create empty RG header: %s" % sample_name)
        fix_cmd = ("samtools reheader {new_header} {in_bam} | "
                   "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -")
        do.run(fix_cmd.format(new_header=new_header, in_bam=in_bam,
                              rg_info=rg_info, tx_out_file=tx_out_file),
               "Fix read groups: %s" % sample_name)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign an input BAM file with bwa mem using a single shell pipeline.

    Stages of the pipeline, connected through pipes:
      - samtools sort of the input BAM by query name
      - bedtools bamtofastq conversion, both mates written to stdout
      - bwa mem alignment of the interleaved stream (``-p``)
      - the conversion/sort command supplied by ``postalign.tobam_cl``

    in_bam -- input BAM file; passed through ``utils.remote_cl_input``
        before being placed on the command line
    ref_file -- reference passed to ``bwa mem``
    names -- dictionary providing ``lane`` (output name), ``sample`` (log
        message) and whatever ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<lane>-sort.bam``
    data -- sample dictionary holding ``config``

    Returns the path of the output BAM file.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    # Extra bwa options from the resources section; empty when none are set.
    bwa_params = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    with tx_tmpdir(data):
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            cl_input = utils.remote_cl_input(in_bam)
            template = (
                "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa} mem -p -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 {ref_file} - | ")
            cmd = template.format(
                samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                in_bam=cl_input, prefix1=sort_prefix, bedtools=bedtools,
                bwa=bwa, bwa_params=bwa_params, rg_info=rg_info,
                ref_file=ref_file) + tobam_cl
            output_checks = [
                do.file_nonempty(tx_out_file),
                do.file_reasonable_size(tx_out_file, cl_input),
            ]
            do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"],
                   None, output_checks)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run bwa mem on fastq input through pipes and produce a sorted BAM.

    Command line stages: ``bwa mem``, ``samtools view`` and ``samtools
    sort``, with the sorted result written inside a file transaction.

    fastq_file -- first (or only) fastq input
    pair_file -- second fastq of a pair; any falsy value means single end
    ref_file -- reference passed to ``bwa mem``
    names -- dictionary providing ``lane`` (output name), ``sample`` (log
        message) and whatever ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<lane>-sort.bam``
    data -- sample dictionary; ``config`` and ``align_split`` are read here

    Returns ``data`` with ``work_bam`` pointing at the output BAM, except
    when ``can_pipe`` refuses the input: then the result of ``align`` is
    returned as is.
    """
    if not pair_file:
        pair_file = ""
    bam_name = "{0}-sort.bam".format(names["lane"])
    out_file = os.path.join(align_dir, bam_name)
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    sort_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                          3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    needs_alignment = not utils.file_exists(out_file) and (
        final_file is None or not utils.file_exists(final_file))
    if needs_alignment:
        if can_pipe(fastq_file, data):
            with utils.curdir_tmpdir():
                with file_transaction(out_file) as tx_out_file:
                    sort_prefix = os.path.splitext(tx_out_file)[0]
                    cmd = (
                        "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                        "{fastq_file} {pair_file} "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    ).format(bwa=bwa, num_cores=num_cores, rg_info=rg_info,
                             ref_file=ref_file, fastq_file=fastq_file,
                             pair_file=pair_file, samtools=samtools,
                             max_mem=sort_mem, tx_out_prefix=sort_prefix)
                    description = "bwa mem alignment from fastq: %s" % names["sample"]
                    do.run(cmd, description, None,
                           [do.file_nonempty(tx_out_file),
                            do.file_reasonable_size(tx_out_file, fastq_file)])
        else:
            # If we cannot do piping, use older bwa aln approach
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align with SNAP and pipe the SAM output into a sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.

    fastq_file -- fastq input, or a ``.bam`` file whose pairing is detected
        with ``bam.is_paired``
    pair_file -- second fastq of a pair; any falsy value means single end
    index_dir -- SNAP index directory
    names -- dictionary providing ``lane`` (output name), ``sample`` (log
        message) and whatever ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<lane>-sort.bam``
    data -- sample dictionary; split alignments trigger an assertion failure

    Returns ``data`` with ``work_bam`` set to the output BAM path.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap = config_utils.get_program("snap", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", config)
    rg_info = novoalign.get_rg_info(names)
    # BAM input carries its own pairing; for fastq input the presence of a
    # second file decides.
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                cmd_name = "paired"
            else:
                cmd_name = "single"
            template = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                        "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            snap_cmd = template.format(
                snap=snap, cmd_name=cmd_name, index_dir=index_dir,
                fastq_file=fastq_file, pair_file=pair_file,
                rg_info=rg_info, num_cores=num_cores)
            do.run(snap_cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Run bwa mem on fastq input through pipes and produce a sorted BAM.

    The command ``bwa mem | samtools view | samtools sort`` is logged and
    then executed through the shell.

    fastq_file -- first (or only) fastq input
    pair_file -- second fastq of a pair; any falsy value means single end
    ref_file -- reference passed to ``bwa mem``
    names -- dictionary providing ``lane`` (output name) and whatever
        ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<lane>-sort.bam``
    config -- configuration dictionary (programs, resources, ``algorithm``)

    Returns the path of the output BAM file. ``subprocess.check_call``
    raises ``CalledProcessError`` when the pipeline exits non-zero.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    _check_samtools_version()
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            template = (
                "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                "{fastq_file} {pair_file} "
                "| {samtools} view -b -S -u - "
                "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
            # Build the command once and use it for both logging and running.
            shell_cmd = template.format(
                bwa=bwa, num_cores=num_cores, rg_info=rg_info,
                ref_file=ref_file, fastq_file=fastq_file,
                pair_file=pair_file, samtools=samtools, max_mem=max_mem,
                tx_out_prefix=os.path.splitext(tx_out_file)[0])
            logger.info(shell_cmd)
            subprocess.check_call(shell_cmd, shell=True)
    return out_file
def remove_extracontigs(in_bam, data):
    """Restrict a BAM file to the contigs accepted by
    ``chromhacks.is_autosomal_or_sex``.

    Extra contigs can be arranged in different ways, causing incompatibility
    issues with GATK and other tools. The read group header is replaced in
    the same pass, as in fixrg.

    in_bam -- input BAM file
    data -- sample dictionary; supplies the work directory, sample name,
        reference file and ``rgnames``

    Returns the path of ``<input basename>-noextras.bam`` inside
    ``<work>/bamclean/<sample>``; it is regenerated only when not up to date
    relative to ``in_bam``.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        keep_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                keep_chroms.append(contig.name)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        # The header is rewritten by a one-line Python program run with the
        # current interpreter.
        template = ("samtools view -h {in_bam} {str_chroms} | "
                    """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                    """cleanbam.fix_header("{comma_chroms}")' | """
                    "samtools view -u - | "
                    "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        cmd = template.format(in_bam=in_bam,
                              str_chroms=" ".join(keep_chroms),
                              bcbio_py=sys.executable,
                              comma_chroms=",".join(keep_chroms),
                              rg_info=rg_info,
                              tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % sample_name)
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.
    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    fastq_file -- first (or only) fastq input; must end in ``.gz`` unless a
        split alignment is requested
    pair_file -- second fastq of a pair; any falsy value means single end
    index_dir -- SNAP index directory passed to ``snap-aligner``
    names -- dictionary providing ``sample`` (log message) and whatever
        ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<sample>-sort.bam``
    data -- sample dictionary (``config``, ``align_split``)

    Returns ``data`` with ``work_bam`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # NOTE(review): the looked-up resources are not used below.
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    # Shared tail of the paired pipeline: awk turns the pasted, tab delimited
    # records back into interleaved fastq, then sponge collects the stream.
    # Braces are doubled because the text goes through str.format below.
    interleave_tail = (
        r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
        r"""{{ """
        r"""split($1, P1, " "); split($5, P5, " "); """
        r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
        r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
        r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the first two and the last character of each command;
        # presumably a "<(" ... ")" process substitution wrapper -- verify
        # against alignprep.split_namedpipe_cls.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | "
                            + interleave_tail)
        else:
            # fastq_file was already unwrapped above; slicing it a second
            # time would cut characters out of the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | "
                            + interleave_tail)
        else:
            stream_input = "zcat {fastq_file}"
    # Decide pairing from truthiness: pair_file is normalised to "" next, so
    # an "is not None" test would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            # TMPDIR is pointed at the transactional output directory for the
            # commands in the pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.
    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    fastq_file -- first (or only) fastq input; must end in ``.gz`` unless a
        split alignment is requested
    pair_file -- second fastq of a pair; any falsy value means single end
    index_dir -- SNAP index directory passed to ``snap-aligner``
    names -- dictionary providing ``sample`` (log message) and whatever
        ``novoalign.get_rg_info`` reads
    align_dir -- directory that receives ``<sample>-sort.bam``
    data -- sample dictionary (``config``, ``align_split``)

    Returns ``data`` with ``work_bam`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # NOTE(review): the looked-up resources are not used below.
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    # Shared awk stage of the paired pipeline: turns the pasted, tab
    # delimited records back into interleaved fastq. Braces are doubled
    # because the text goes through str.format below.
    awk_interleave = (
        r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
        r"""{{ """
        r"""split($1, P1, " "); split($5, P5, " "); """
        r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
        r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
        r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        # Drop the first two and the last character of each command;
        # presumably a "<(" ... ")" process substitution wrapper -- verify
        # against alignprep.split_namedpipe_cls.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (
                r"paste <({fastq_file} | paste - - - -) "
                r"<({pair_file} | paste - - - -) | "
                + awk_interleave)
        else:
            # fastq_file was already unwrapped above; slicing it a second
            # time would cut characters out of the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (
                r"paste <(zcat {fastq_file} | paste - - - -) "
                r"<(zcat {pair_file} | paste - - - -) | "
                + awk_interleave)
        else:
            stream_input = "zcat {fastq_file}"
    # Decide pairing from truthiness: pair_file is normalised to "" next, so
    # an "is not None" test would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            # TMPDIR is pointed at the transactional output directory and
            # JAVA_HOME is cleared for the commands in the pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(
                cmd + tobam_cl,
                "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data