def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    return to_run
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("atac")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if dd.has_variantcalls(data):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["contamination", "peddy"]
        if vcfutils.get_paired_phenotype(data):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
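# Illustrative usage note (not part of bcbio): the get_qc_tools variants above
# all share one gating pattern -- a tool is on by default and removed via
# tools_off, or off by default and added via tools_on. A minimal, self-contained
# sketch of that pattern, with plain dict lookups standing in for the real
# dd.get_tools_on/dd.get_tools_off accessors:
def _sketch_qc_tool_gating(algorithm):
    tools_on = algorithm.get("tools_on", [])
    tools_off = algorithm.get("tools_off", [])
    to_run = []
    if "fastqc" not in tools_off:  # on by default, opt out via tools_off
        to_run.append("fastqc")
    if any(t in tools_on for t in ["qualimap", "qualimap_full"]):  # opt in
        to_run.append("qualimap")
    return to_run

assert _sketch_qc_tool_gating({}) == ["fastqc"]
assert _sketch_qc_tool_gating({"tools_off": ["fastqc"], "tools_on": ["qualimap"]}) == ["qualimap"]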
def run_cluster(*data):
    """Run seqcluster cluster to detect smallRNA clusters.
    """
    sample = data[0][0]
    tools_off = dd.get_tools_off(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    if "seqcluster" not in tools_off:
        cluster_dir = _cluster(bam_file, prepare_dir, out_dir,
                               dd.get_ref_file(sample),
                               dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
        sample["seqcluster"] = out_dir
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel",
                                    op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def gatk_filter_rnaseq(vrn_file, data):
    """Apply the RNA-seq filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth:
    https://software.broadinstitute.org/gatk/guide/article?id=3891

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration", "-R", ref_file, "-V", vrn_file,
                      "--cluster-window-size", "35", "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'", "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'", "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering; tools_off: [gatk4] is only for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config),
                   "Filter RNA-seq variants.")
    return out_file
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    elif _ready_gzip_fastq(in_files, data):
        out = in_files
    else:
        inputs = [{"in_file": x, "dirs": dirs, "config": data["config"],
                   "rgnames": data["rgnames"]}
                  for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel,
                                [[d] for d in inputs], data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}]
             for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None,
                   include_gatk=True, parallel_gc=False):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.
    """
    if include_gatk and "gatk4" in dd.get_tools_off({"config": config}):
        opts = ["-U", "LENIENT_VCF_PROCESSING", "--read_filter", "BadCigar",
                "--read_filter", "NotPrimaryAlignment"]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts,
                                            {"algorithm": {"memory_adjust": memscale}})
    jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    return jvm_opts + opts
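# Illustrative only: _get_gatk_opts walks a list of program names and takes the
# first jvm_opts resource it finds, falling back to conservative defaults. A
# self-contained sketch, with plain dict lookups standing in for
# config_utils.get_resources:
def _sketch_first_jvm_opts(config, names, default=("-Xms750m", "-Xmx2g")):
    for n in names:
        resources = config.get("resources", {}).get(n, {})
        if resources.get("jvm_opts"):
            return list(resources["jvm_opts"])  # first match wins
    return list(default)

_cfg = {"resources": {"gatk": {"jvm_opts": ["-Xms1g", "-Xmx4g"]}}}
assert _sketch_first_jvm_opts(_cfg, ["mutect", "gatk"]) == ["-Xms1g", "-Xmx4g"]
assert _sketch_first_jvm_opts({}, ["gatk"]) == ["-Xms750m", "-Xmx2g"]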
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifactMetrics to collect pre-adapter ligation artifact metrics:
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                           dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = ["-T", "CollectSequencingArtifactMetrics",
                  "--VALIDATION_STRINGENCY", "SILENT",
                  "-R", ref_file, "-I", bam_file, "-O", out_base]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
    return out_files
def get_gatk_version(self):
    """Retrieve GATK version, handling locally and config cached versions.

    Calling version can be expensive due to all the startup and shutdown
    of JVMs, so we prefer cached version information.
    """
    if self._gatk_version is None:
        self._set_default_versions(self._config)
    if "gatk4" not in dd.get_tools_off({"config": self._config}):
        # In cases where we don't have manifest versions. Not possible to get
        # version from commandline with GATK4 alpha version
        if self._gatk4_version is None:
            self._gatk4_version = "4.0"
        return self._gatk4_version
    elif self._gatk_version is not None:
        return self._gatk_version
    else:
        if self._has_gatk_conda_wrapper():
            gatk_jar = None
        else:
            gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"],
                                     allow_missing=True)
        self._gatk_version = get_gatk_version(gatk_jar, config=self._config)
        return self._gatk_version
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifactMetrics to collect pre-adapter ligation artifact metrics:
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-

    Uses the picard wrapper rather than gatk, which works for both gatk4 and
    gatk3 projects.
    refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                           dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir,
                                "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data) and
              ("bwa-mem" in dd.get_tools_off(data) or
               not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    if names["lane"] != dd.get_sample_name(data):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    else:
        out_file = None
    if not out_file or not utils.file_exists(out_file):
        umi_ext = "-cumi" if "umi_bam" in data else ""
        out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(
            dd.get_sample_name(data), umi_ext))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data) and
              ("bwa-mem" in dd.get_tools_off(data) or
               not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            if is_precollapsed_bam(data) or not hla_on(data) or needs_separate_hla(data):
                out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                      names, rg_info, data)
            else:
                out_file = _align_mem_hla(fastq_file, pair_file, ref_file, out_file,
                                          names, rg_info, data)
    data["work_bam"] = out_file
    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069)
    if needs_separate_hla(data):
        hla_file = os.path.join(os.path.dirname(out_file),
                                "HLA-" + os.path.basename(out_file))
        hla_file = _align_mem_hla(fastq_file, pair_file, ref_file, hla_file,
                                  names, rg_info, data)
        data["hla_bam"] = hla_file
    return data
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         previous_evidence, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params, os.path.dirname(tx_out_file),
                          memscale=memscale, ld_preload=ld_preload)
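# Illustrative only: _run_with_memory_scaling, gatk_filter_rnaseq and the
# GatherVcfs helpers all copy the configuration and strip "gatk4" from
# tools_off, so GATK4 is still used for utility steps even when it is disabled
# for variant calling, without mutating the shared config. A sketch of that
# pattern, with copy.deepcopy standing in for utils.deepish_copy:
import copy

def _sketch_config_with_gatk4(config):
    config = copy.deepcopy(config)
    if "gatk4" in config.get("algorithm", {}).get("tools_off", []):
        config["algorithm"]["tools_off"].remove("gatk4")
    return config

_orig = {"algorithm": {"tools_off": ["gatk4", "lumpy-genotype"]}}
_patched = _sketch_config_with_gatk4(_orig)
assert "gatk4" not in _patched["algorithm"]["tools_off"]
assert "gatk4" in _orig["algorithm"]["tools_off"]  # shared config untouched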
def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    if self._has_gatk_conda_wrapper():
        gatk_jar = None
    else:
        gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"],
                                 allow_missing=True)
        if not gatk_jar:
            raise ValueError("GATK processing requested but gatk or older jar install not found: "
                             "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                             "installation.html#gatk-and-mutect-mutect2")
    is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    atype_index = params.index("-T") if params.count("-T") > 0 \
                  else params.index("--analysis_type")
    prog = params[atype_index + 1]
    # For GATK4 specify command first, so swap params to accomplish
    if is_gatk4:
        params = params[:]
        del params[atype_index + 1]
        del params[atype_index]
        params = [prog] + params
    if cores and int(cores) > 1:
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            memscale = config["algorithm"]["memory_adjust"] = {
                "direction": "increase", "magnitude": max(1, int(cores) // 2)}
    # Filters and unsafe specifications not in GATK4
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
        if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar",
                       "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale,
                                 include_gatk=False, parallel_gc=parallel_gc)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        jvm_opts = config_utils.adjust_opts(
            self._jvm_opts,
            {"algorithm": {"memory_adjust": {"magnitude": 1.1, "direction": "decrease"}}})
        jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    if gatk_jar:
        return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] +
                        [str(x) for x in params])
    else:
        cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
        if cmd:
            return cmd
        else:
            raise ValueError("GATK processing requested but gatk or older jar install not found: "
                             "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                             "installation.html#gatk-and-mutect-mutect2")
def run_cufflinks(data):
    """Quantitate transcript expression with Cufflinks.
    """
    if "cufflinks" in dd.get_tools_off(data):
        return [[data]]
    work_bam = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    out_dir, fpkm_file, fpkm_isoform_file = cufflinks.run(work_bam, ref_file, data)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, fpkm_file)
    data = dd.set_fpkm_isoform(data, fpkm_isoform_file)
    return [[data]]
def _has_gatk_conda_wrapper(self):
    cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
    if cmd:
        if "gatk4" not in dd.get_tools_off({"config": self._config}):
            return True
        else:
            try:
                stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                                 shell=True)
                return stdout.find("GATK jar file not found") == -1
            except subprocess.CalledProcessError:
                return False
def _has_gatk_conda_wrapper(self):
    cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
    if cmd:
        if "gatk4" not in dd.get_tools_off({"config": self._config}):
            return True
        else:
            try:
                stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                                 shell=True, encoding="UTF-8")
                return stdout.find("GATK jar file not found") == -1
            except subprocess.CalledProcessError:
                return False
def get_bgzip_cmd(config, is_retry=False):
    """Retrieve command to use for bgzip, trying to use bgzip parallel threads.

    By default, parallel bgzip is enabled in bcbio. If it causes problems
    please report them. You can turn parallel bgzip off with
    `tools_off: [pbgzip]`.
    """
    num_cores = tz.get_in(["algorithm", "num_cores"], config, 1)
    cmd = config_utils.get_program("bgzip", config)
    if (not is_retry and num_cores > 1 and
            "pbgzip" not in dd.get_tools_off({"config": config})):
        cmd += " --threads %s" % num_cores
    return cmd
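# Illustrative only: how the threaded bgzip command line above is assembled,
# with plain dict lookups standing in for tz.get_in and config_utils.get_program:
def _sketch_bgzip_cmd(config, is_retry=False, bgzip="bgzip"):
    num_cores = config.get("algorithm", {}).get("num_cores", 1)
    tools_off = config.get("algorithm", {}).get("tools_off", [])
    cmd = bgzip
    if not is_retry and num_cores > 1 and "pbgzip" not in tools_off:
        cmd += " --threads %s" % num_cores  # parallel bgzip unless opted out
    return cmd

assert _sketch_bgzip_cmd({"algorithm": {"num_cores": 8}}) == "bgzip --threads 8"
assert _sketch_bgzip_cmd({"algorithm": {"num_cores": 8, "tools_off": ["pbgzip"]}}) == "bgzip"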
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.
    """
    args = ["PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    # GATK3 back compatibility, need to specify analysis type
    if "gatk4" in dd.get_tools_off(data):
        args = ["--analysis_type"] + args
    runner = broad.runner_from_config(data["config"])
    return runner.cl_gatk(args, tmp_dir)
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging; tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
def do_db_build(samples, need_bam=True):
    """Confirm we should build a gemini database: need gemini and not in tools_off.
    """
    genomes = set()
    for data in samples:
        if not need_bam or data.get("align_bam") or _has_precalled(data):
            genomes.add(data["genome_build"])
        if "gemini" in dd.get_tools_off(data):
            return False
    if len(genomes) == 1:
        return _has_gemini(samples[0])
    else:
        return False
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Uses bcftools concat --naive, which only combines samples and does no
    parsing work, allowing scaling to large file sizes.
    """
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        if "gatk4" not in dd.get_tools_off({"config": config}):
            _run_concat_variant_files_gatk4(input_file_list, out_file, config)
        else:
            out_file = _run_concat_variant_files_bcftools(input_file_list, out_file,
                                                          config, naive=True)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
def _do_prioritize(items):
    """Determine if we should perform prioritization.

    Currently done on tumor-only input samples.
    """
    if not any("tumoronly-prioritization" in dd.get_tools_off(d) for d in items):
        if vcfutils.get_paired_phenotype(items[0]):
            has_tumor = False
            has_normal = False
            for sub_data in items:
                if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                    has_tumor = True
                elif vcfutils.get_paired_phenotype(sub_data) == "normal":
                    has_normal = True
            return has_tumor and not has_normal
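# Illustrative only: the tumor-only decision in _do_prioritize, reduced to a
# list of phenotype strings instead of vcfutils.get_paired_phenotype lookups --
# prioritization runs when there is a tumor sample but no matched normal:
def _sketch_do_prioritize(phenotypes):
    has_tumor = "tumor" in phenotypes
    has_normal = "normal" in phenotypes
    return has_tumor and not has_normal

assert _sketch_do_prioritize(["tumor"]) is True
assert _sketch_do_prioritize(["tumor", "normal"]) is False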
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        inputs = [{"in_file": x, "dirs": dirs, "config": data["config"],
                   "rgnames": data["rgnames"]}
                  for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel,
                                [[d] for d in inputs], data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}]
             for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
def get_bgzip_cmd(config, is_retry=False):
    """Retrieve command to use for bgzip, trying to use parallel pbgzip if available.

    By default, pbgzip is enabled in bcbio. If it causes problems please
    report them. You can turn pbgzip off with `tools_off: [pbgzip]`.
    """
    num_cores = tz.get_in(["algorithm", "num_cores"], config, 1)
    if (not is_retry and num_cores > 1 and
            "pbgzip" not in dd.get_tools_off({"config": config})):
        try:
            pbgzip = config_utils.get_program("pbgzip", config)
            return "%s -n %s " % (pbgzip, num_cores)
        except config_utils.CmdNotFound:
            pass
    return config_utils.get_program("bgzip", config)
def do_db_build(samples, need_bam=True, gresources=None):
    """Confirm we should build a gemini database: need gemini + human samples +
    hg19/GRCh37 + not in tools_off.
    """
    genomes = set()
    for data in samples:
        if not need_bam or data.get("align_bam") or _has_precalled(data):
            genomes.add(data["genome_build"])
        if "gemini" in dd.get_tools_off(data):
            return False
    if len(genomes) == 1:
        if not gresources:
            gresources = samples[0]["genome_resources"]
        return (tz.get_in(["aliases", "human"], gresources, False)
                and genomes.issubset(("hg19", "GRCh37"))
                and _has_gemini(samples[0]))
    else:
        return False
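# Illustrative only: the gating in do_db_build reduced to plain inputs -- a
# gemini database is built only for a single shared genome build, a human
# hg19/GRCh37 reference, and no sample with gemini in tools_off:
def _sketch_do_db_build(genome_builds, tools_off_per_sample, is_human):
    if any("gemini" in off for off in tools_off_per_sample):
        return False
    genomes = set(genome_builds)
    return (len(genomes) == 1 and is_human
            and genomes.issubset(("hg19", "GRCh37")))

assert _sketch_do_db_build(["GRCh37", "GRCh37"], [[], []], True) is True
assert _sketch_do_db_build(["GRCh38"], [[]], True) is False
assert _sketch_do_db_build(["hg19"], [["gemini"]], True) is False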
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging; tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            # Allow specification of verbosity in the unique style this tool uses
            resources = config_utils.get_resources("gatk", config)
            opts = [str(x) for x in resources.get("options", [])]
            if "--verbosity" in opts:
                params += ["--VERBOSITY:%s" % opts[opts.index("--verbosity") + 1]]
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
def gatk_cmd(name, jvm_opts, params, config=None):
    """Retrieve PATH to gatk using locally installed java.
    """
    if name == "gatk":
        if isinstance(config, dict) and "config" not in config:
            data = {"config": config}
        else:
            data = config
        if not data or "gatk4" not in dd.get_tools_off(data):
            return _gatk4_cmd(jvm_opts, params, data)
    gatk_cmd = utils.which(os.path.join(
        os.path.dirname(os.path.realpath(sys.executable)), name))
    # if we can't find via the local executable, fallback to being in the path
    if not gatk_cmd:
        gatk_cmd = utils.which(name)
    if gatk_cmd:
        return "%s && export PATH=%s:$PATH && %s %s %s" % \
            (utils.clear_java_home(), utils.get_java_binpath(gatk_cmd),
             gatk_cmd, " ".join(jvm_opts), " ".join([str(x) for x in params]))
def gatk_cmd(name, jvm_opts, params, config=None):
    """Retrieve PATH to gatk using locally installed java.
    """
    if name == "gatk":
        if isinstance(config, dict) and "config" not in config:
            data = {"config": config}
        else:
            data = config
        if not data or "gatk4" not in dd.get_tools_off(data):
            return _gatk4_cmd(jvm_opts, params, data)
        else:
            name = "gatk3"
    gatk_cmd = utils.which(os.path.join(
        os.path.dirname(os.path.realpath(sys.executable)), name))
    # if we can't find via the local executable, fallback to being in the path
    if not gatk_cmd:
        gatk_cmd = utils.which(name)
    if gatk_cmd:
        return "%s && export PATH=%s:\"$PATH\" && %s %s %s" % \
            (utils.clear_java_home(), utils.get_java_binpath(gatk_cmd),
             gatk_cmd, " ".join(jvm_opts), " ".join([str(x) for x in params]))
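# Illustrative only: both gatk_cmd variants accept either a bare config dict or
# a full data dict and normalize to the {"config": ...} shape that the
# dd.get_tools_off accessor expects. A sketch of that normalization:
def _sketch_ensure_data_shape(config):
    if isinstance(config, dict) and "config" not in config:
        return {"config": config}
    return config

assert _sketch_ensure_data_shape({"algorithm": {}}) == {"config": {"algorithm": {}}}
_data = {"config": {"algorithm": {}}}
assert _sketch_ensure_data_shape(_data) is _data  # already wrapped, pass through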
def process_intervals(data):
    """Prepare intervals file.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("base")
    interval_file_r = utils.R_package_script("PureCN", "extdata/IntervalFile.R",
                                             env="base")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    tools_off = dd.get_tools_off(data)
    if tools_off and "purecn_offtarget" in tools_off:
        offtarget_flag = ""
    else:
        offtarget_flag = "--off-target"
    cmd = [rscript, interval_file_r,
           "--in-file", bed_file,
           "--fasta", ref_file,
           "--out-file", ready_file,
           offtarget_flag,
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="base"), utils.get_R_exports(env="base"),
            " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def _has_alignment_file(algorithm, sample):
    return (((algorithm.get("aligner") or
              algorithm.get("realign") or
              algorithm.get("recalibrate") or
              algorithm.get("bam_clean") or
              algorithm.get("mark_duplicates", algorithm.get("aligner")))) and
            sample.get("work_bam") is not None and
            "upload_alignment" not in dd.get_tools_off(sample))
def _has_alignment_file(algorithm, sample):
    return (((algorithm.get("aligner") or
              algorithm.get("realign") or
              algorithm.get("recalibrate") or
              algorithm.get("bam_clean") or
              algorithm.get("mark_duplicates"))) and
            sample.get("work_bam") is not None and
            "upload_alignment" not in dd.get_tools_off(sample))
def _run(in_file, work_dir, data):
    if "lumpy-genotype" in dd.get_tools_off(data):
        return in_file
    else:
        # `call` is presumably provided by the enclosing scope in the original
        # source, where this helper is defined as a closure over an SV call dict;
        # as a standalone function it would be unresolved.
        return _run_svtyper(in_file, [dd.get_align_bam(data)],
                            call.get("exclude_file"), data)
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         previous_evidence, work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]
    for batch_items in batches:
        data = batch_items[0]
        if len(batch_items) == 1:
            sample = dd.get_sample_name(data)
            sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                data["config"])
        else:
            sample_vcf = lumpy_vcf
        align_bams = [dd.get_align_bam(x) for x in batch_items]
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, align_bams, exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, align_bams, exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "exclude_file": exclude_file})
        out.append(data)
    return out
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc",
                                              batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) \
        if config_utils.program_installed("peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirect stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot "
                   "--prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and
                             l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and
                             l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and
                             l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or \
                   all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
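# Illustrative only: the error-triage pattern at the end of run_peddy -- keep a
# bounded tail of the stderr log with collections.deque(maxlen=100), then decide
# whether the failure is an expected, skippable one. The predicate below is a
# simplified stand-in for the allowed_errors/all_line_errors checks:
import collections
import io

def _sketch_tail_is_skippable(handle, allowed=("no intervals found for",)):
    tail = collections.deque(handle, maxlen=100)  # last 100 lines, bounded memory
    return bool(tail) and all(any(a in line for a in allowed) for line in tail)

_log = io.StringIO("no intervals found for chr1\nno intervals found for chr2\n")
assert _sketch_tail_is_skippable(_log) is True
assert _sketch_tail_is_skippable(io.StringIO("Traceback (most recent call last)\n")) is False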