def _set_quality_flag(options, data):
    """Set the aligner option matching the configured fastq quality encoding.

    Mutates and returns ``options``; unrecognized formats leave it unchanged.
    """
    fmt = dd.get_quality_format(data).lower()
    # Map each known legacy encoding to the flag the aligner expects.
    flag_by_format = {"illumina": "solexa1.3-quals",
                      "solexa": "solexa-quals"}
    flag = flag_by_format.get(fmt)
    if flag is not None:
        options[flag] = True
    return options
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference'].keys()
    prepared = []
    for in_file in data["files"]:
        if in_file.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                prepared = convert_bam_to_fastq(in_file, data["dirs"]["work"],
                                                data, data["dirs"], data["config"])
            else:
                prepared = [in_file]
        elif objectstore.is_remote(in_file):
            prepared.append(in_file)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            convert_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            prepared.append(fastq.groom(in_file, data, out_dir=convert_dir))
        else:
            prepared.append(in_file)
    prepared = [f for f in prepared if f is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        prepared = [_gzip_fastq(f, gzip_dir) for f in prepared]
    # Sanity check: everything local we hand back must exist on disk.
    for ready_file in prepared:
        if not objectstore.is_remote(ready_file):
            assert os.path.exists(ready_file), "%s does not exist." % ready_file
    return prepared
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    :param fastq_file: first (or only) fastq input
    :param pair_file: second fastq for paired-end input, may be None/empty
    :param ref_file: reference FASTA used to build the transcriptome index
    :param data: sample data dictionary; updated with the transcriptome BAM
    :returns: data with the transcriptome BAM path set
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Bug fix: the message has two %s placeholders but no arguments were
        # supplied, so the raw template was logged; fill in the file names.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    # -a: report all alignments, needed for downstream transcript quantitation
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    if len(files) == 1 and files[0].endswith(".bam"):
        # BAM input: no quality flag, format depends on pairing
        qual_args = []
        fmt_args = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        input_args = [files[0]]
    else:
        # fastq input, single or paired
        is_illumina = dd.get_quality_format(data).lower() == "illumina"
        qual_args = ["-q", "illumina" if is_illumina else "sanger"]
        fmt_args = ["-f", "fastq"]
        if len(files) == 2:
            input_args = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            input_args = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    base_name = utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0]
    out_file = os.path.join(work_dir, "%s.sdf" % base_name)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = _rtg_cmd(["rtg", "format", "-o", tx_out_file] + fmt_args + qual_args + input_args)
            do.run(cmd, "Format inputs to indexed SDF")
    return out_file
def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns

    :param data: sample data dictionary; updated with the split BAM path
    :returns: data with the split BAM set
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    # Bug fix: compare the quality format case-insensitively, matching the
    # .lower() convention used everywhere else this setting is read; a
    # capitalized "Illumina" previously skipped the misencoded-quality fix.
    if dd.get_quality_format(data).lower() == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    # NOTE(review): other functions in this file call file_transaction(data,
    # out_file); confirm whether this single-argument form is intended.
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim; empty disables adapter trimming
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            # str() for consistency with the other stringified numeric arguments
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--trim_poly_g", "--cut_by_quality3", "--cut_mean_quality", "5",
                    "--disable_quality_filtering",
                    "--length_required", str(dd.get_min_read_length(data))]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Bug fix: write the JSON report to the transactional path
            # (tx_report, previously assigned but unused) instead of the final
            # report_file, so the report participates in the transaction.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    # NOTE(review): "peddy" below references peddy.run_qc but peddy is not in
    # the import list above -- presumably imported at module level; confirm.
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    # Start from any QC results already recorded on the sample.
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline phenotype samples only run variant-level QC.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # NOTE: basestring is Python 2 only; this code predates a py3 port.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to scanning the tool's output directory for files.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                # Drop reads with mismatched base/quality lengths, missing
                # stored bases, or Ns in the CIGAR string.
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                cl = [config_utils.get_program("gatk-framework", data["config"])] + jvm_opts + params
                do.run(cl, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    is_remote = objectstore.is_remote(in_file)
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # Decide whether the input must be re-bgzipped and/or decompressed first.
    if in_file.endswith(".gz") and not is_remote:
        if needs_convert:
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif is_remote and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or is_remote:
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert)
    else:
        # Already usable as-is: just expose it in the work directory.
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                     os.path.basename(in_file)))
        utils.symlink_plus(in_file, out_file)
    return out_file
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim; empty disables adapter trimming
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            # str() for consistency with the other stringified numeric arguments
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            # polyX/polyG tail trimming, driven by the configured adapter set
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Bug fix: write the JSON report to the transactional path
            # (tx_report, previously assigned but unused) instead of the final
            # report_file, so the report participates in the transaction.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    :param fastq_file: first (or only) fastq input
    :param pair_file: second fastq for paired-end input, may be None/empty
    :param ref_file: reference FASTA used to build the transcriptome index
    :param data: sample data dictionary; updated with the transcriptome BAM
    :returns: data with the transcriptome BAM path set
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Bug fix: the message has two %s placeholders but no arguments were
        # supplied, so the raw template was logged; fill in the file names.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # -a: report all alignments, needed for downstream transcript quantitation
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | samtools view -bhS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Returns a path in the align_prep work directory: either a freshly written
    bgzip file or a symlink/copy of the input when no conversion is needed.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        # Quality conversion or end trimming requires decompressing first.
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Remote file with no splitting requested: use it directly.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir, needs_bgzip, needs_gunzip,
                               needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    bam_input = len(files) == 1 and files[0].endswith(".bam")
    if bam_input:
        # BAM carries its own qualities; only the sam-pe/sam-se format matters.
        qual_args = []
        fmt_args = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        file_args = [files[0]]
    else:
        fmt_args = ["-f", "fastq"]
        qual_args = ["-q", "illumina" if dd.get_quality_format(data).lower() == "illumina"
                     else "sanger"]
        if len(files) == 2:
            file_args = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            file_args = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    stem = utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0]
    out_file = os.path.join(work_dir, "%s.sdf" % stem)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cl = ["rtg", "format", "-o", tx_out_file] + fmt_args + qual_args + file_args
            do.run(_rtg_cmd(cl), "Format inputs to indexed SDF")
    return out_file
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    ``data["in_file"]`` may be a single path or a list/tuple of paths; multiple
    inputs are always merged through _bgzip_file.
    """
    in_file = data["in_file"]
    if isinstance(in_file, (list, tuple)):
        # Use the first file for the format/compression checks below.
        in_file = in_file[0]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        # Quality conversion or end trimming requires decompressing first.
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Remote file with no splitting requested: use it directly.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
          or objectstore.is_remote(in_file)
          or (isinstance(data["in_file"], (tuple, list)) and len(data["in_file"]) > 1)):
        out_file = _bgzip_file(data["in_file"], data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                     os.path.basename(in_file)))
        out_file = _symlink_or_copy_grabix(in_file, out_file, data)
    return out_file
def _get_quality_flag(data):
    """Return the quality encoding flag matching the configured fastq format.

    Defaults to phred+33 for anything other than the two legacy encodings.
    """
    fmt = dd.get_quality_format(data).lower()
    if fmt == "illumina":
        return "--phred64"
    if fmt == "solexa":
        return "--solexa-quals"
    return "--phred33"
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # Bug fix: the original used "--threads %s" inside a
                # str.format call, so the %s placeholder was never filled and
                # bgzip received the literal string "%s"; substitute the core
                # count explicitly.
                output_args = "-o >(bgzip --threads {threads} -c > {tx_out1})".format(
                    threads=dd.get_num_cores(data), tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add default trimming options unless overridden in resources.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline phenotype samples only run variant-level QC.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # NOTE: basestring is Python 2 only; this code predates a py3 port.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to scanning the tool's output directory for files.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _get_quality_format(config): SUPPORTED_FORMATS = ["illumina", "standard"] quality_format = dd.get_quality_format(data).lower() if quality_format not in SUPPORTED_FORMATS: logger.error("quality_format is set to an unsupported format. " "Supported formats are %s." % (", ".join(SUPPORTED_FORMATS))) exit(1) return quality_format
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.
    """
    all_gzipped = all(not f or f.endswith(".gz") for f in in_files)
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    # Any required preprocessing step means the inputs are not ready as-is.
    if not all_gzipped or needs_convert or do_splitting or needs_trim:
        return False
    return not objectstore.is_remote(in_files[0])
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral)
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run}
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # dict with "base" holds output files (plus optional "metrics");
            # a plain dict is treated as metrics only.
            if "base" in out:
                if "metrics" in out:
                    metrics.update(out.pop("metrics"))
                qc_files = out
            else:
                metrics.update(out)
        # NOTE: basestring is Python 2 only; this code predates a py3 port.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to scanning the tool's output directory for files.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # Bug fix: the original used "--threads %s" inside a
                # str.format call, so the %s placeholder was never filled and
                # bgzip received the literal string "%s"; substitute the core
                # count explicitly.
                output_args = "-o >(bgzip --threads {threads} -c > {tx_out1})".format(
                    threads=dd.get_num_cores(data), tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                # The faster insert aligner only handles up to one adapter per read.
                if adapters and len(adapters) <= 2:
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {aligner_args} {input_args} {output_args} {report_args}")
            cmd += " --quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(data)
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _ready_bgzip_fastq(in_files, data):
    """Check if we have bgzipped fastq and don't need format conversion or splitting.
    """
    all_gzipped = all(not f or f.endswith(".gz") for f in in_files)
    # Only probe the compression type when everything is at least gzipped;
    # _check_gzipped_input's first element is True when re-bgzipping is needed.
    all_bgzipped = (all_gzipped and
                    all(not f or not _check_gzipped_input(f, data)[0] for f in in_files))
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    if not all_bgzipped or needs_convert or do_splitting or needs_trim:
        return False
    return not objectstore.is_remote(in_files[0])
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files.
    """
    ready = all(not f or f.endswith(".gz") for f in in_files)
    if require_bgzip and ready:
        # _check_gzipped_input's first element is True when re-bgzipping is needed.
        ready = all(not f or not _check_gzipped_input(f, data)[0] for f in in_files)
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False
    if not ready or needs_convert or do_splitting or needs_trim:
        return False
    return not objectstore.is_remote(in_files[0]) and not get_downsample_params(data)
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                # Bug fix: use concatenation instead of += so the caller's
                # adapters list is not mutated in place on every invocation.
                adapters = adapters + ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Split cores between atropos and the two bgzip writers.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add default trimming options unless overridden in resources.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25",
                                        ("polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) fastq inputs
    :param adapters: adapter sequences to trim
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample data dictionary
    :returns: (list of trimmed fastq files, JSON report path)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir,
                              "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                # Bug fix: use concatenation instead of += so the caller's
                # adapters list is not mutated in place on every invocation.
                adapters = adapters + ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Split cores between atropos and the two bgzip writers.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add default trimming options unless overridden in resources.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25",
                                        ("polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import fastqc, gemini, kraken, qsignature, qualimap, samtools, picard, srna, umi
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "gemini": gemini.run,
             "qsignature": qsignature.run,
             "coverage": _run_coverage_qc,
             "variants": _run_variants_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run}
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in tz.get_in(["config", "algorithm", "qc"], data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # a dict return is a flat set of summary metrics
            metrics.update(out)
        # bugfix: `basestring` is Python 2 only and raises NameError on
        # Python 3; `str` is the single string type there
        elif out and isinstance(out, str) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    # clean up any downsampled BAM created for QC
    bam.remove("%s-downsample%s" % os.path.splitext(bam_file))
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

    :param bam_file: alignments in bam format
    :param data: dict with all configuration information

    :returns: dict with output of different tools
    """
    from bcbio.qc import fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": _run_coverage_qc,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run}
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # tools return either pre-organized file info ("base"/"secondary")
            # or a flat dict of summary metrics
            if "base" in out:
                qc_files = out
            else:
                metrics.update(out)
        # bugfix: `basestring` is Python 2 only and raises NameError on
        # Python 3; `str` is the single string type there
        elif out and isinstance(out, str) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files
    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.

    Produces a `<input>-gatkfilter.bam` next to the input and returns its path,
    reusing the file if it already exists.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                # for phred+64 (illumina) inputs use the quality-fixing tool,
                # otherwise plain PrintReads applies just the read filters
                if dd.get_quality_format(data, "").lower() == "illumina":
                    tool = "FixMisencodedBaseQualityReads"
                else:
                    tool = "PrintReads"
                params = [tool, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
                for read_filter in ["MatchingBasesAndQualsReadFilter",
                                    "SeqIsStoredReadFilter",
                                    "CigarContainsNoNOperator"]:
                    params += ["-RF", read_filter]
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params), "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.

    Writes a `<input>-gatkfilter.bam` alongside the input BAM and returns its
    path; skips the GATK run when the filtered output already exists.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                # For phred+64 (illumina) inputs, select the quality-fixing tool;
                # otherwise PrintReads simply copies reads through the filters.
                # NOTE(review): argument style ("-RF", "-O") looks GATK4-era --
                # confirm against the GATK version bundled with the pipeline.
                params = [("FixMisencodedBaseQualityReads" if dd.get_quality_format(data, "").lower() == "illumina"
                           else "PrintReads"),
                          "-R", ref_file,
                          "-I", in_bam,
                          "-O", tx_out_file,
                          "-RF", "MatchingBasesAndQualsReadFilter",
                          "-RF", "SeqIsStoredReadFilter",
                          "-RF", "CigarContainsNoNOperator"]
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params), "Filter problem reads")
    # Re-index so downstream steps can use the filtered BAM immediately.
    bam.index(out_file, data["config"])
    return out_file
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.

    Returns the path to a `<input>-gatkfilter.bam`, reusing it when present.
    """
    bam.index(in_bam, data["config"])
    filtered_bam = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(filtered_bam):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, filtered_bam) as tx_out_file:
                args = ["-T", "PrintReads",
                        "-R", ref_file,
                        "-I", in_bam,
                        "--out", tx_out_file,
                        "--filter_mismatching_base_and_quals",
                        "--filter_bases_not_stored",
                        "--filter_reads_with_N_cigar"]
                # rescale phred+64 (illumina) qualities while filtering
                if dd.get_quality_format(data, "").lower() == "illumina":
                    args += ["--fix_misencoded_quality_scores"]
                framework_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk-framework", framework_opts, args),
                       "Filter problem reads")
    bam.index(filtered_bam, data["config"])
    return filtered_bam
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming. Returns an
    empty string when neither conversion nor trimming is needed.
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    convert_quals = dd.get_quality_format(data).lower() == "illumina"
    src = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    commands = []
    if convert_quals:
        # -Q64 -V: convert phred+64 input to standard phred+33
        commands.append("%s seq -Q64 -V %s" % (seqtk, src))
    ends = dd.get_trim_ends(data)
    if ends:
        # first pair of values applies to read 1, second pair to read 2
        which_read = data.get("read_num", read_num)
        begin_trim, end_trim = ends[0:2] if which_read == 0 else ends[2:4]
        if begin_trim or end_trim:
            # read from the conversion pipe when it precedes trimming
            trim_src = "/dev/stdin" if convert_quals else src
            commands.append("%s trimfq -b %s -e %s %s" % (seqtk, begin_trim, end_trim, trim_src))
    return " | ".join(commands)
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) input fastq paths
    :param adapters: adapter sequences to trim
    :param out_dir: output directory for trimmed fastqs and the JSON report
    :param data: bcbio sample dictionary
    :returns: ([trimmed fastq paths], JSON report path)
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            num_cores = dd.get_num_cores(data)
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # bugfix: the original string had a bare `%s` after --threads that
                # was never substituted (only str.format was applied), leaking a
                # literal "%s" into the bgzip command; substitute the core count
                output_args = "-o >(bgzip --threads {num_cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % \
                (tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # default options, applied only when not overridden in resource options
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % num_cores if num_cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Decides whether the input needs gunzip/bgzip/quality-conversion work and
    either runs the conversion (_bgzip_file) or symlinks/copies the input into
    the align_prep work directory. Returns the prepared file path.
    """
    orig_input = data["in_file"]
    in_file = orig_input[0] if isinstance(orig_input, (list, tuple)) else orig_input
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    trim_ends = dd.get_trim_ends(data)
    if not remote and os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        do_bgzip, do_gunzip = False, False
    elif in_file.endswith(".gz") and not remote:
        if needs_convert or trim_ends:
            # must rewrite contents, so unpack and repack
            do_bgzip, do_gunzip = True, True
        else:
            do_bgzip, do_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        do_bgzip, do_gunzip = True, True
    elif remote and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        do_bgzip, do_gunzip = False, False
    else:
        do_bgzip, do_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    multiple_inputs = isinstance(orig_input, (tuple, list)) and len(orig_input) > 1
    if (do_bgzip or do_gunzip or needs_convert or trim_ends or remote or multiple_inputs):
        out_file = _bgzip_file(orig_input, data["config"], work_dir,
                               do_bgzip, do_gunzip, needs_convert, data)
    else:
        # nothing to change: just place a link/copy in the work directory
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                     os.path.basename(in_file)))
        out_file = _symlink_or_copy_grabix(in_file, out_file, data)
    return out_file