def summary(samples, run_parallel):
    """Annotate coverage of each sample's BED regions with chanjo.

    Writes ``<sample>-coverage.bed`` under each sample's upload directory and
    attaches the path to the sample as ``coverage.summary``.

    :param samples: list of one-item lists of sample dictionaries (standard
                    bcbio parallel structure).
    :param run_parallel: parallel runner (kept for interface compatibility;
                         not used here).
    :returns: samples re-wrapped in one-item lists with ``coverage`` attached.
    """
    for data in samples:
        # per-sample inputs
        bam = tz.get_in(["work_bam"], data[0], None)
        sample_name = tz.get_in(['rgnames', 'sample'], data[0], None)
        bed_file = tz.get_in(["config", "algorithm", "coverage"], data[0], None)
        output_dir = os.path.abspath(tz.get_in(['upload', 'dir'], data[0]))
        if not os.path.exists(output_dir):
            safe_makedir(output_dir)
        output = os.path.join(output_dir, sample_name, '{0}-coverage.bed'.format(sample_name))
        if not utils.file_exists(output):
            with file_transaction(data, output) as tx_out_file:
                # Write to the transactional file, not the final path, so an
                # interrupted run cannot leave a truncated output behind.
                # (Previously this wrote directly to `output`.)
                with codecs.open(bed_file, encoding='utf-8') as bed_stream:
                    with codecs.open(tx_out_file, "w", encoding='utf-8') as coverage_stream:
                        for line in chanjo.annotate_bed_stream(bed_stream, bam):
                            coverage_stream.write(chanjo.serialize_interval(line))
                            coverage_stream.write('\n')
    out = []
    # Recompute the output path per sample. Previously this iterated
    # `samples[0]` and reused the loop variables from above, so every sample
    # was assigned the last sample's coverage file.
    for data in samples:
        x = data[0]
        sample_name = tz.get_in(['rgnames', 'sample'], x, None)
        output_dir = os.path.abspath(tz.get_in(['upload', 'dir'], x))
        output = os.path.join(output_dir, sample_name, '{0}-coverage.bed'.format(sample_name))
        x["coverage"] = {"summary": output}
        out.append([x])
    return out
def split_variants_by_sample(data): """Split a multi-sample call file into inputs for individual samples. For tumor/normal paired analyses, do not split the final file and attach it to the tumor input. """ # not split, do nothing if "group_orig" not in data: return [[data]] # cancer tumor/normal elif (vcfutils.get_paired_phenotype(data) and "tumor" in [vcfutils.get_paired_phenotype(d) for d in get_orig_items(data)]): out = [] for i, sub_data in enumerate(get_orig_items(data)): if vcfutils.get_paired_phenotype(sub_data) == "tumor": cur_batch = tz.get_in(["metadata", "batch"], data) if cur_batch: sub_data["metadata"]["batch"] = cur_batch sub_data["vrn_file"] = data["vrn_file"] else: sub_data.pop("vrn_file", None) out.append([sub_data]) return out # joint calling or population runs, do not split back up and keep in batches else: out = [] for sub_data in get_orig_items(data): cur_batch = tz.get_in(["metadata", "batch"], data) if cur_batch: sub_data["metadata"]["batch"] = cur_batch sub_data["vrn_file_batch"] = data["vrn_file"] sub_data["vrn_file"] = data["vrn_file"] out.append([sub_data]) return out
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Uses bgzip and grabix to prepare an indexed fastq file.

    :param data: sample dictionary; ``files`` and config are mutated in place.
    :returns: list of one-item lists, one per alignment split (or a single
              item when no splitting is needed).
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and aligner and (_is_cram_input(data["files"])
                                             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        # skip if we're not BAM and not doing alignment splitting
        if ("files" not in data or data["files"][0] is None or not aligner
                or _no_index_needed(data)):
            return [[data]]
    # bgzip and grabix-index inputs so chunks can be read concurrently
    ready_files = _prep_grabix_indexes(data["files"], data["dirs"], data)
    data["files"] = ready_files
    # bgzip preparation takes care of converting illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(ready_files[0], data["config"]["algorithm"]["align_split_size"])
    else:
        splits = [None]
    if len(splits) == 1:
        return [[data]]
    else:
        # one deep copy of the sample per split so parallel alignment
        # jobs do not share mutable state
        out = []
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = list(split)
            out.append([cur_data])
        return out
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.

    Collapses a batch down to a single lead item, recording all batch sample
    names and, when validation is configured, the shared variant file.
    """
    validated = {}
    vcfs = []
    normalized = []
    sample_names = []
    for raw in items:
        data = cwlutils.normalize_missing(utils.to_single_data(raw))
        sample_names.append(dd.get_sample_name(data))
        validate_target = tz.get_in(["config", "algorithm", "validate"], data)
        if validate_target:
            validated[_checksum(validate_target)] = data
        if data.get("vrn_file"):
            vcfs.append(data["vrn_file"])
        normalized.append(data)
    if not validated:
        # no validation configured: just pick a representative item
        data = _pick_lead_item(normalized)
        data["batch_samples"] = sample_names
        return data
    # exactly one validation target and one shared variant file expected
    assert len(validated) == 1, len(validated)
    assert len(set(vcfs)) == 1, set(vcfs)
    data = _pick_lead_item(validated.values())
    data["batch_samples"] = sample_names
    data["vrn_file"] = vcfs[0]
    return data
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.
    """
    gzipped = all(not f or f.endswith(".gz") for f in in_files)
    quality = tz.get_in(["config", "algorithm", "quality_format"], data, "")
    needs_illumina_convert = quality.lower() == "illumina"
    splitting_enabled = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    if not gzipped or needs_illumina_convert or splitting_enabled:
        return False
    # remote inputs still need downloading/preparation
    return not objectstore.is_remote(in_files[0])
def detect_sv(items, all_items, config):
    """Top level parallel target for examining structural variation.

    :param items: samples in the current batch.
    :param all_items: every sample in the run, used to find background samples.
    :param config: run configuration with ``algorithm.svcaller_active`` set.
    :returns: list of one-item lists with ``sv`` results attached.
    :raises ValueError: when the configured caller is unknown.
    """
    svcaller = config["algorithm"].get("svcaller_active")
    out = []
    if svcaller:
        if svcaller in _CALLERS:
            # single-sample caller
            assert len(items) == 1
            data = items[0]
            data["sv"] = _CALLERS[svcaller](data)
            out.append([data])
        elif svcaller in _BATCH_CALLERS:
            # batch caller; some need background samples when the batch
            # is not a tumor/normal pair
            if (svcaller in _NEEDS_BACKGROUND and
                    not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
                names = set([tz.get_in(["rgnames", "sample"], x) for x in items])
                background = [x for x in all_items if tz.get_in(["rgnames", "sample"], x) not in names]
                for svdata in _BATCH_CALLERS[svcaller](items, background):
                    out.append([svdata])
            else:
                for svdata in _BATCH_CALLERS[svcaller](items):
                    out.append([svdata])
        else:
            raise ValueError("Unexpected structural variant caller: %s" % svcaller)
    else:
        # no caller configured: pass samples through unchanged
        out.append(items)
    return out
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.

    :param in_file: BED file to filter.
    :param items: sample dictionaries supplying configuration.
    :returns: path to the filtered (or symlinked) BED file.
    """
    from bcbio.variation import bedutils
    # Build a concrete list: on Python 3 `filter` returns an iterator with no
    # `append`, which broke the `encode_bed` addition below.
    highdepth_beds = [x for x in set([tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                      for x in items]) if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    # concatenate all depth/blacklist BEDs, keeping only the
                    # first four columns for bedtools compatibility
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    # nothing to subtract: pass the input through
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _get_variant_regions(items):
    """Retrieve variant regions defined in any of the input items.

    Skips whole-genome samples. Returns a concrete list so callers can use
    ``len()``; the previous ``filter(...)`` returned an iterator on Python 3.
    """
    return [vr for vr in [tz.get_in(("config", "algorithm", "variant_regions"), data)
                          for data in items
                          if tz.get_in(["config", "algorithm", "coverage_interval"], data) != "genome"]
            if vr is not None]
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and centromere
    regions, both of which contribute to long run times and false positive
    structural variant calls.

    :param items: sample dictionaries supplying configuration and reference.
    :param base_file: file whose name anchors the output path.
    :param chrom: optional chromosome to restrict to.
    :returns: path to the exclusion BED file.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            # NOTE(review): single-argument file_transaction here, while the
            # other prepare_exclude_file in this file passes (items[0], out_file)
            # -- confirm which signature this bcbio version expects.
            with file_transaction(out_file) as tx_out_file:
                # the exclusion set is everything in the reference that we
                # do NOT want to analyze
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.

    :param dbkey: genome build identifier.
    :param ref_file: path to the genome FASTA; resources YAML sits beside it.
    :param tooldir: optional tool installation directory for PERL5LIB setup.
    :param config: run configuration (defaults to empty dict).
    :returns: (vep_cache_dir, ensembl_name) or (None, None) when unavailable.
    """
    if config is None:
        config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if tooldir:
        # make the installed Perl libraries visible to the VEP scripts
        os.environ["PERL5LIB"] = "{t}/lib/perl5:{t}/lib/perl5/site_perl:{l}".format(
            t=tooldir, l=os.environ.get("PERL5LIB", ""))
    vepv = vep_version(config)
    if os.path.exists(resource_file) and vepv:
        with open(resource_file) as in_handle:
            # safe_load: the resources file is plain data and the bare
            # yaml.load is unsafe/deprecated without an explicit Loader
            resources = yaml.safe_load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        ensembl_version = tz.get_in(["aliases", "ensembl_version"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if symlink_dir:
            return symlink_dir, ensembl_name
        elif ensembl_name:
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, ensembl_name, vepv)
            if not os.path.exists(out_dir):
                # download the cache, then convert it to tabix form
                cmd = ["vep_install.pl", "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir]
                if ensembl_version:
                    cmd += ["-v", ensembl_version]
                do.run(cmd, "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["vep_convert_cache.pl", "-species", ensembl_name, "-version", vepv,
                       "-d", vep_dir]
                do.run(cmd, "Convert VEP cache to tabix %s" % ensembl_name)
            tmp_dir = os.path.join(vep_dir, "tmp")
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            return vep_dir, ensembl_name
    return None, None
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.

    Indexes the reference and input BAMs, then assembles the common GATK
    parameter list (annotations, inputs, dbSNP, regions).

    :returns: (broad_runner, params) ready for a GATK caller invocation.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        # low expected coverage: relax GATK's calling/emitting thresholds
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    # re-create the runner from config for the actual GATK call
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
def _get_caller(data):
    """Return the caller for a sample: joint caller, variant caller or 'precalled'."""
    for candidate in (tz.get_in(["config", "algorithm", "jointcaller"], data),
                      tz.get_in(["config", "algorithm", "variantcaller"], data),
                      "precalled"):
        if candidate:
            return candidate
def _extra_vars(args, cluster_config):
    """Build the extra-variables mapping passed to the provisioning playbook.

    NOTE(review): `nfs_server` and `nfs_clients` are not defined inside this
    function -- presumably bound in an enclosing/module scope; confirm against
    the caller before relying on this in isolation.
    """
    return {"encrypted_mount": "/encrypted",
            "nfs_server": nfs_server,
            "nfs_clients": ",".join(nfs_clients),
            "login_user": tz.get_in(["nodes", "frontend", "login"], cluster_config),
            # falls back to the standard AWS secondary EBS device name
            "encrypted_device": tz.get_in(["nodes", "frontend", "encrypted_volume_device"],
                                          cluster_config, "/dev/xvdf")}
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.

    :param items: sample dictionaries supplying configuration and reference.
    :param base_file: file whose name anchors the output path.
    :param chrom: optional chromosome to restrict to.
    :returns: path to the exclusion BED file.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            # whole-genome runs additionally drop collapsed-repeat high depth regions
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                # exclusion = reference minus the regions we want to analyze
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    :returns: list of per-(batch, caller) sample groups plus extras.
    """
    # these CWL keys must always serialize as arrays, even for single values
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    # union of CWL keys across all samples so every record has the same shape
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                # fill in missing keys with explicit None placeholders
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            # deep-ish copy so each batch group owns an independent sample dict
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    :param todo: "square" (re-call missing genotypes, needs BAMs) or "merge".
    :returns: path to the combined output file.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    # NOTE(review): `cores` is an int inside the command list -- presumably
    # do.run stringifies arguments; confirm before changing.
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    :param data: sample dictionary (possibly CWL-wrapped).
    :returns: one-item list of one-item lists with ``depth.bins`` attached.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        # bin-based CNV calling not configured for this sample
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        # annotate bins with overlapping gene names before writing cnn files
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.

    Runs ``cnvkit.py batch`` inside a transactional directory, moving results
    into ``work_dir/raw`` on success.

    :returns: dict with ``cnr``, ``cns`` and ``back_cnn`` output paths.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        # remove any partial previous output before re-running
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            # never use more cores than there are input BAMs
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            # point CNVkit's R at the bcbio-installed site library
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Downsamples the BAM unless ``qualimap_full`` is requested, then runs
    ``qualimap bamqc`` and parses the HTML report.

    :returns: parsed qualimap metrics dictionary.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            # downsample to ~10M reads to keep runtime reasonable
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        # unset DISPLAY so qualimap runs headless
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            # genome distribution plots only supported for human/mouse
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
def samples_to_records(samples, default_keys=None):
    """Convert samples into output CWL records.

    Normalizes every sample to the same key set, coerces list-valued keys and
    stringifies booleans, which do not serialize cleanly in CWL.

    :returns: list of normalized sample dictionaries.
    """
    from bcbio.pipeline import run_info
    # these keys must always be arrays in the CWL record
    RECORD_CONVERT_TO_LIST = set(["config__algorithm__tools_on",
                                  "config__algorithm__tools_off", "reference__genome_context"])
    all_keys = _get_all_cwlkeys(samples, default_keys)
    out = []
    for data in samples:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                # fill missing keys with explicit None placeholders
                data = tz.update_in(data, key, lambda x: None)
            if raw_key not in data["cwl_keys"]:
                data["cwl_keys"].append(raw_key)
            if raw_key in RECORD_CONVERT_TO_LIST:
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
            # Booleans are problematic for CWL serialization, convert into string representation
            if isinstance(tz.get_in(key, data), bool):
                data = tz.update_in(data, key, lambda x: str(tz.get_in(key, data)))
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        out.append(data)
    return out
def assign_complex_to_samples(items):
    """Assign complex inputs like variants and align outputs to samples.

    Handles list inputs to record conversion where we have inputs from multiple
    locations and need to ensure they are properly assigned to samples in many
    environments. The unpleasant approach here is to use standard file naming to
    match with samples so this can work in environments where we don't
    download/stream the input files (for space/time savings).
    """
    extract_fns = {("variants", "samples"): _get_vcf_samples,
                   ("align_bam",): _get_bam_samples}
    # map each input kind to {sample_name: value} using file-name matching
    by_sample = {kind: {} for kind in extract_fns}
    for data in items:
        for kind, extract_fn in extract_fns.items():
            value = tz.get_in(kind, data)
            if value is not None:
                for sample in extract_fn(value, items):
                    if sample:
                        by_sample[kind][sample] = value
    out = []
    for data in items:
        for kind in extract_fns:
            replacement = tz.get_in([kind, dd.get_sample_name(data)], by_sample)
            if replacement:
                data = tz.update_in(data, kind, lambda x: replacement)
        out.append(data)
    return out
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    :param data: sample dictionary with a bgzipped ``vrn_file``.
    :returns: path to the annotated, bgzipped and indexed VCF.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                # optional plugin annotations
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache", "--offline",
                       "--dir", vep_dir, "--sift", "b", "--polyphen", "b", "--symbol", "--numbers",
                       "--biotype", "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                      dbnsfp_args + loftee_args
                # stream: gunzip -> VEP -> bgzip into the transactional file
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.

    :param regions: list of "chrom:start-end" strings, or [None] for the
                    whole file.
    :returns: list of tuples; (pair1, pair2) for paired data or (single,)
              for single-end.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    fnames = []
    is_paired = False
    for region in regions:
        # region-specific file suffix, or "full" for the whole file
        rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
        out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                              (base_name, rext, fext)) for fext in ["s1", "p1", "p2"]]
        if not utils.file_exists(out_p1):
            with file_transaction(out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
                sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
                cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                       "gz=1 collate=1 colsbs={max_mem} "
                       "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                       "reference={ref_file}")
                if region:
                    cmd += " ranges='{region}'"
                do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
        # decide paired vs single-end from the first non-empty pair1 output;
        # once paired, stay paired for all regions
        if is_paired or not _is_gzip_empty(out_p1):
            fnames.append((out_p1, out_p2))
            is_paired = True
        else:
            fnames.append((out_s,))
    return fnames
def get_recipes(path=None):
    """Get all the available conda recipes.

    Returns a namedtuple which contains the following keys:

        :name: the name of the recipe
        :path: the path for the package
        :version: the version of the recipe
        :build: the number of builds for the current version
    """
    base_path = path or CONFIG["abspath"]
    found = []
    for recipe_name in RECIPE_ORDER:
        meta_file = os.path.join(base_path, recipe_name, "meta.yaml")
        if not os.path.exists(meta_file):
            print("[x] Missing meta.yaml for {recipe}.".format(recipe=recipe_name))
            continue
        # ask conda where the built package would land
        built_path, _ = execute(["conda", "build", recipe_name, "--output",
                                 "--numpy", CONFIG["numpy"]],
                                cwd=base_path)
        with open(meta_file, "r") as meta_handle:
            meta = yaml.safe_load(meta_handle)
        found.append(RECIPE(name=recipe_name,
                            path=built_path.strip(),
                            version=toolz.get_in(["package", "version"], meta),
                            build=toolz.get_in(["build", "number"], meta, 0)))
    return found
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    # restrict conversion to variant regions for targeted experiments
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data)
                   in ["regional", "exome"] else None)
    if region_file:
        regions = ["%s:%s-%s" % tuple(r) for r in pybedtools.BedTool(region_file)]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s-%s.fq.gz" %
                                          (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_s) and not utils.file_exists(out_p1):
        cram.index(cram_file)
        fastqs = _cram_to_fastq_regions(regions, cram_file, dirs, data)
        if len(fastqs[0]) == 1:
            # single-end: merge per-region files into one bgzipped fastq
            with file_transaction(out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            # paired-end: merge each read of the pair separately, tagging
            # read names with /1 and /2
            for i, out_file in enumerate([out_p1, out_p2]):
                ext = "/%s" % (i + 1)
                with file_transaction(out_file) as tx_out_file:
                    _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file, out_file, ext)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]
def _meta_to_version(in_file):
    """Extract version information from meta description file.
    """
    with open(in_file) as in_handle:
        meta = yaml.safe_load(in_handle)
    version = tz.get_in(["package", "version"], meta)
    build_number = tz.get_in(["build", "number"], meta, 0)
    return (version, build_number)
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.

    Runs the CNVkit ``batch`` command inside a transactional directory, then
    moves results into ``work_dir/raw`` on success.

    :returns: dict with ``cnr``, ``cns`` and ``back_cnn`` output paths.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            # Write outputs into the transactional directory, which is moved to
            # raw_work_dir on success. Previously the command wrote directly to
            # raw_work_dir and the final move of the (empty) transactional
            # directory clobbered the results.
            cmd = ["batch"] + test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split",
                   "-p", str(tz.get_in(["config", "algorithm", "num_cores"], data, 1)),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            args = cnvlib_cmd.parse_args(cmd)
            args.func(args)
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.

    :param fastq1: first fastq (or BAM) input.
    :param fastq2: second fastq for paired reads, or None.
    :param aligner: aligner name, used to look up indexes in the reference.
    :param data: sample dictionary; updated with ``work_bam`` and returned.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    # os.path.join(*parts): the builtin `apply` was removed in Python 3
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    # common prefix of the index files, trimming a trailing "."
    aligner_indexes = os.path.commonprefix(tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        bam.index(data["work_bam"], data["config"])
        # also index split-read/discordant BAMs when the aligner produced them
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Tries sources in priority order: gVCF-derived callable regions, ensemble
    BED, sample callable BED, callable regions from an alignment BAM, and
    finally configured callable/variant regions. Returns None when nothing
    is available.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    if dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    if data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    if data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    if data.get("work_bam_callable"):
        # work on a copy so the caller's dictionary keeps its original keys
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    for config_key in ("callable_regions", "variant_regions"):
        configured = tz.get_in(["config", "algorithm", config_key], data)
        if configured:
            return configured
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    """Scale ingest and coordinator nodes by one each and verify that the
    master and data node tasks keep their task IDs (i.e. are not restarted)."""
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2

    sdk_tasks.check_running(service_name, current_expected_task_count)

    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)

    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
def get_background_cnv_reference(data, caller):
    """Return the configured background CNV reference for a caller, if any.

    The configuration value may be a single path (shared by all callers) or a
    mapping of caller name to path; returns None when nothing is configured.
    """
    ref = tz.get_in(["config", "algorithm", "background", "cnv_reference"], data)
    if not ref:
        return None
    if isinstance(ref, dict):
        return ref.get(caller)
    return ref
def _sv_workdir(data):
    """Return (creating if needed) the CNVkit structural variant work directory."""
    sample = tz.get_in(["rgnames", "sample"], data)
    path = os.path.join(data["dirs"]["work"], "structural", sample, "cnvkit")
    return utils.safe_makedir(path)
def get_resources(name, config):
    """Retrieve resources for a program, pulling from multiple config sources.

    Falls back to the shared ``resources.default`` section, then to an empty
    dict, when no program-specific entry exists.
    """
    fallback = tz.get_in(["resources", "default"], config, {})
    return tz.get_in(["resources", name], config, fallback)
def get_keys(lookup):
    """Return the keys used to look up a function in the datadict."""
    lookup_path = (lookup, "keys")
    return tz.get_in(lookup_path, LOOKUPS, None)
def present(config):
    """Return True if the configured lookup path resolves to a truthy value.

    Relies on ``keys`` from the enclosing scope as the lookup path into
    ``config`` (NOTE(review): ``keys`` is not a parameter -- confirm it is
    supplied by the surrounding closure/module). Any lookup failure (missing
    key path, non-indexable container) is treated as "not present".
    """
    try:
        value = tz.get_in(keys, config, no_default=True)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; lookup failures still mean absent.
        value = False
    return bool(value)
def _find_shared_batch(samples): for data in samples: batch = tz.get_in(["metadata", "batch"], data, dd.get_sample_name(data)) if not isinstance(batch, (list, tuple)): return batch
def test_dedup():
    """dedup_lineage collapses equal duplicate source nodes and rejects inconsistent ones."""
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    assert ds0.sources['ab'].sources['bc'].doc is not ds0.sources['ac'].doc
    assert ds0.sources['ab'].sources['bc'].doc == ds0.sources['ac'].doc

    # After dedup, equal duplicate nodes should be the same object.
    ds = SimpleDocNav(dedup_lineage(ds0))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # again but with raw doc
    ds = SimpleDocNav(dedup_lineage(ds0.doc))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # Test that we detect inconsistent metadata for duplicate entries (test 1)
    # test: different values in the same spot
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc
    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent metadata for duplicate entries (test 2)
    # test: different sources structure
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['lineage']['source_datasets']['extra'] = ds0.sources['ae'].doc.copy()
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc
    ds0 = SimpleDocNav(ds0.doc)
    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries

    # Subtest 1: different set of keys
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['cd'] = {}
    ds0 = SimpleDocNav(ds0.doc)
    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 2: different values for "child" nodes
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'
    ds0 = SimpleDocNav(ds0.doc)
    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 3: different name for child
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['CD'] = srcs['cd']
    del srcs['cd']
    ds0 = SimpleDocNav(ds0.doc)
    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.

    Writes ``multiqc_config.yaml`` into ``out_dir`` and returns its path.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
           for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples]
        # Samples without coverage metrics report None; drop them so the
        # max()/comparisons below do not fail mixing None with numbers.
        avg_depths = [d for d in avg_depths if d is not None]
        thresholds = []
        thresholds_hidden = []
        if avg_depths:
            # Picking all thresholds up to the highest sample average depth
            thresholds = [t for t in coverage.DEPTH_THRESHOLDS if t <= max(avg_depths)]
            # ...plus one more
            if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
                thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

            # Showing only thresholds surrounding any of average depths
            for i, t in enumerate(thresholds):
                if t > 20:  # Not hiding anything below 20x
                    if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                       any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                        pass
                    else:
                        thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {"family_id": False,
                                             "sex_het_ratio": False,
                                             "error_sex_check": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Builds a VEP command line from sample configuration (cores, reference
    FASTA, human-only plugins, transcript picking) and streams the annotated
    output through bgzip. Returns the annotated file path, or None when the
    input has no variants or no VEP cache is available.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    # Plugin argument builders, keyed by plugin name.
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion
                    }
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                # Restrict to a single consequence per variant when requested.
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name, "--no_stats", "--cache",
                       "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg",
                       "--af_esp", "--af_exac", "--pubmed", "--variant_class"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def get_NF_bam(data):
    """Return the nucleosome-free region BAM for ATAC-seq, or None if absent."""
    nf_path = ("atac", "align", "NF")
    return tz.get_in(nf_path, data, None)
def _get_caller(data):
    """Return the active caller: joint caller, then variant caller, else 'precalled'."""
    for candidate in (tz.get_in(["config", "algorithm", "jointcaller"], data),
                      tz.get_in(["config", "algorithm", "variantcaller"], data),
                      "precalled"):
        if candidate:
            return candidate
def get_type(data):
    """Retrieve the type of effects calculation to do.

    Only variant-calling analyses get effects prediction; defaults to snpeff.
    """
    analysis = data["analysis"].lower()
    if analysis.startswith("var"):
        return tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
def _get_sample_and_caller(data):
    """Return [sample label, supplemented caller name] for validation reporting.

    Prefers an explicit ``metadata.validate_sample`` label over the sample name.
    """
    sample = tz.get_in(["metadata", "validate_sample"], data)
    if not sample:
        sample = dd.get_sample_name(data)
    caller = _get_caller_supplement(_get_caller(data), data)
    return [sample, caller]
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]),
                                                   region, out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                # No callable regions in this slice: emit an empty VCF stub.
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name,
                                                              paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    # Tumor-only input: fall back to the single-sample caller.
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # min_allele_fraction is configured as a percentage.
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"),
                                                             data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                                   % (os.path.join(os.path.dirname(sys.executable), "py"),
                                      0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                # Full VarDict pipeline: call, strand-bias test, VCF conversion,
                # somatic filtering, ambiguous-base fixes, sorting, compression.
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "{freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
def _get_base_tmpdir(data, fallback_base_dir):
    """Pick the temp directory: configured location, else fallback/DEFAULT_TMP."""
    for lookup in (("config", "resources", "tmp", "dir"),
                   ("resources", "tmp", "dir")):
        configured = tz.get_in(lookup, data)
        if configured:
            return configured
    return os.path.join(fallback_base_dir, DEFAULT_TMP)
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.

    Returns a dict of output VCF paths keyed by tp/fp/fn (plus tp-calls when
    the rtg version writes a separate baseline representation).
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        # Restart from scratch: a partial output directory makes vcfeval fail.
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), {
                "algorithm": {
                    "memory_adjust": {
                        "magnitude": threads,
                        "direction": "increase"
                    }
                }
            })
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        # NOTE(review): the heap fallback "3g" differs from the -Xmx1500m
        # default requested above -- confirm the asymmetry is intentional.
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = [
            "rtg", "vcfeval", "--threads", str(threads), "-b", rm_file,
            "--bed-regions", interval_bed, "-c", vrn_file, "-t", rtg_ref,
            "-o", out_dir
        ]
        # Multi-sample truth sets: restrict comparison to the current sample.
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    # Newer rtg outputs include a baseline representation of true positives;
    # prefer it and keep the call representation separately.
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
def combine_multiple_callers(samples):
    """Collapse together variant calls from multiple approaches into single data item with `variants`.
    """
    # Group samples by (batch, alignment) so each BAM yields one merged entry.
    by_bam = collections.OrderedDict()
    for data in (x[0] for x in samples):
        work_bam = tz.get_in(("combine", "work_bam", "out"), data, data.get("align_bam"))
        jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), data)
        variantcaller = get_variantcaller(data)
        key = (multi.get_batch_for_key(data), work_bam)
        if key not in by_bam:
            by_bam[key] = []
        by_bam[key].append((variantcaller, jointcaller, data))
    out = []
    for callgroup in by_bam.values():
        ready_calls = []
        for variantcaller, jointcaller, data in callgroup:
            if variantcaller:
                cur = data.get("vrn_file_plus", {})
                cur.update({
                    "variantcaller": variantcaller,
                    # Joint calling keeps the original per-sample file separate
                    # from the batched joint output added below.
                    "vrn_file": data.get("vrn_file_orig") if jointcaller else data.get("vrn_file"),
                    "vrn_file_batch": data.get("vrn_file_batch") if not jointcaller else None,
                    "vrn_stats": data.get("vrn_stats"),
                    "validate": data.get("validate") if not jointcaller else None
                })
                if jointcaller:
                    cur["population"] = False
                ready_calls.append(cur)
            if jointcaller:
                ready_calls.append({
                    "variantcaller": jointcaller,
                    "vrn_file": data.get("vrn_file"),
                    "vrn_file_batch": data.get("vrn_file_batch"),
                    "validate": data.get("validate"),
                    "do_upload": False
                })
            if not jointcaller and not variantcaller:
                # Pre-called inputs with no configured caller.
                ready_calls.append({
                    "variantcaller": "precalled",
                    "vrn_file": data.get("vrn_file"),
                    "validate": data.get("validate"),
                    "do_upload": False
                })
        final = callgroup[0][-1]

        def orig_variantcaller_order(x):
            # Preserve the user's original configured caller ordering.
            try:
                return final["config"]["algorithm"]["orig_variantcaller"].index(x["variantcaller"])
            except ValueError:
                return final["config"]["algorithm"]["orig_jointcaller"].index(x["variantcaller"])

        if len(ready_calls) > 1 and "orig_variantcaller" in final["config"]["algorithm"]:
            final["variants"] = sorted(ready_calls, key=orig_variantcaller_order)
            # Restore the original caller configuration keys.
            final["config"]["algorithm"]["variantcaller"] = \
                final["config"]["algorithm"].pop("orig_variantcaller")
            if "orig_jointcaller" in final["config"]["algorithm"]:
                final["config"]["algorithm"]["jointcaller"] = \
                    final["config"]["algorithm"].pop("orig_jointcaller")
        else:
            final["variants"] = ready_calls
        # Drop intermediates now folded into `variants`.
        final.pop("vrn_file_batch", None)
        final.pop("vrn_file_orig", None)
        final.pop("vrn_file_plus", None)
        final.pop("vrn_stats", None)
        out.append([final])
    return out
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.

    Dispatches to the configured validation method (rtg vcfeval by default,
    hap.py or bcbio.variation otherwise) and attaches results under
    ``data["validate"]``. Returns the standard nested [[data]] structure.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(
            os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError(
                "Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        # Resolve and normalize the truth set and its confident regions.
        rm_file = normalize_input_path(
            toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(
                toval_data["config"]["algorithm"].get("validate_regions"),
                toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(
            rm_interval_file, toval_data, prefix="validateregions-",
            bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        # Reconcile chromosome naming (chr1 vs 1) with the reference build.
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(
            rm_interval_file, dd.get_ref_file(toval_data),
            data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file,
                                               base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file,
                                                    base_dir, sample, caller, toval_data)
    return [[data]]
def is_human(data):
    """Truthy when the sample is flagged human or uses a known human build."""
    alias = tz.get_in(["genome_resources", "aliases", "human"], data, False)
    if alias:
        # Preserve the configured alias value itself (may be a string).
        return alias
    return dd.get_genome_build(data) in ["hg19", "GRCh37", "hg38"]
def want_gvcf(items):
    """Truthy when a joint caller is configured or any sample requests gVCF output."""
    joint = tz.get_in(("config", "algorithm", "jointcaller"), items[0])
    gvcf_requested = any("gvcf" in dd.get_tools_on(d) for d in items)
    return joint or gvcf_requested
def stac_transform(input_stac: Document, relative: bool = True) -> Document:
    """Takes in a raw STAC 1.0 dictionary and returns an ODC dictionary"""
    product_label, product_name, region_code, default_grid = _stac_product_lookup(input_stac)

    # Generating UUID for products not having UUID.
    # Checking if provided id is valid UUID.
    # If not valid, creating new deterministic uuid using odc_uuid function based on product_name and product_label.
    # TODO: Verify if this approach to create UUID is valid.
    if _check_valid_uuid(input_stac["id"]):
        deterministic_uuid = input_stac["id"]
    else:
        if product_name in ["s2_l2a"]:
            deterministic_uuid = str(
                odc_uuid("sentinel-2_stac_process", "1.0.0", [product_label]))
        else:
            deterministic_uuid = str(
                odc_uuid(f"{product_name}_stac_process", "1.0.0", [product_label]))

    # Check for projection extension properties that are not in the asset fields.
    # Specifically, proj:shape and proj:transform, as these are otherwise
    # fetched in _get_stac_bands.
    properties = input_stac["properties"]
    proj_shape = properties.get("proj:shape")
    proj_transform = properties.get("proj:transform")
    # TODO: handle old STAC that doesn't have grid information here...
    bands, grids = _get_stac_bands(
        input_stac,
        default_grid,
        relative=relative,
        proj_shape=proj_shape,
        proj_transform=proj_transform,
    )

    stac_properties, lineage = _get_stac_properties_lineage(input_stac)

    epsg = properties["proj:epsg"]
    native_crs = f"epsg:{epsg}"

    # Transform geometry to the native CRS at an appropriate precision
    geometry = Geometry(input_stac["geometry"], "epsg:4326")
    if native_crs != "epsg:4326":
        # Arbitrary precisions, but should be fine
        pixel_size = get_in(["default", "transform", 0], grids)
        precision = 0
        # NOTE(review): `pixel_size < 0` keeps precision at 0 for every
        # positive pixel size, so the branch only fires for negative values;
        # this looks like it may have been intended as `pixel_size < 1`
        # (finer precision for sub-unit pixels) -- confirm.
        if pixel_size < 0:
            precision = 6

        geometry = _geographic_to_projected(geometry, native_crs, precision)

    stac_odc = {
        "$schema": "https://schemas.opendatacube.org/dataset",
        "id": deterministic_uuid,
        "crs": native_crs,
        "grids": grids,
        "product": {"name": product_name.lower()},
        "label": product_label,
        "properties": stac_properties,
        "measurements": bands,
        "lineage": {},
    }
    if region_code:
        stac_odc["properties"]["odc:region_code"] = region_code

    if geometry:
        stac_odc["geometry"] = geometry.json

    if lineage:
        stac_odc["lineage"] = lineage

    return stac_odc
def get_variantcaller(data, key="variantcaller", default=None, require_bam=True):
    """Look up the configured caller under ``config.algorithm``.

    When ``require_bam`` is set, returns None unless an aligned BAM exists.
    """
    if require_bam and not data.get("align_bam"):
        return None
    return tz.get_in(["config", "algorithm", key], data, default)
def merge_split_alignments(samples, run_parallel):
    """Manage merging split alignments back into a final working BAM file.

    Perform de-duplication on the final merged file.
    """
    ready = []
    file_key = "work_bam"
    to_merge = collections.defaultdict(list)
    # Partition samples: split alignments to merge vs already-complete ones.
    for data in (xs[0] for xs in samples):
        if data.get("combine"):
            out_key = tz.get_in(["combine", file_key, "out"], data)
            if not out_key:
                out_key = data["rgnames"]["lane"]
            to_merge[out_key].append(data)
        else:
            ready.append([data])
    ready_merge = []
    hla_merges = []
    for mgroup in to_merge.values():
        cur_data = mgroup[0]
        del cur_data["align_split"]
        # Remaining pieces are attached as extras on the first entry.
        for x in mgroup[1:]:
            cur_data["combine"][file_key]["extras"].append(x[file_key])
        ready_merge.append([cur_data])
        # Collect HLA fastqs from each split piece into a separate merge job.
        cur_hla = None
        for d in mgroup:
            hla_files = tz.get_in(["hla", "fastq"], d)
            if hla_files:
                if not cur_hla:
                    cur_hla = {"rgnames": {"sample": dd.get_sample_name(cur_data)},
                               "config": cur_data["config"],
                               "dirs": cur_data["dirs"],
                               "hla": {"fastq": []}}
                cur_hla["hla"]["fastq"].append(hla_files)
        if cur_hla:
            hla_merges.append([cur_hla])
    # NOTE(review): this reads `data` leaked from the loop above, i.e. the
    # last sample only, and raises NameError when `samples` is empty --
    # confirm checking a single sample's kraken setting is intended.
    if not tz.get_in(["config", "algorithm", "kraken"], data):
        # kraken requires fasta filenames from data['files'] as input.
        # We don't want to remove those files if kraken qc is required.
        _save_fastq_space(samples)
    merged = run_parallel("delayed_bam_merge", ready_merge)
    hla_merge_raw = run_parallel("merge_split_alignments", hla_merges)
    hla_merges = {}
    for hla_merge in [x[0] for x in hla_merge_raw]:
        hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"], hla_merge)

    # Add stable 'align_bam' target to use for retrieving raw alignment
    out = []
    for data in [x[0] for x in merged + ready]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if dd.get_sample_name(data) in hla_merges:
            data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)]
        else:
            # Fall back to on-disk HLA fastqs for non-merged samples.
            hla_files = glob.glob(
                os.path.join(dd.get_work_dir(data), "align",
                             dd.get_sample_name(data), "hla", "*.fq"))
            if hla_files:
                data["hla"]["fastq"] = hla_files
        out.append([data])
    return out
def eo3_grid_spatial(doc: Dict[str, Any],
                     resolution: Optional[float] = None) -> Dict[str, Any]:
    """Compute EO3-style ``grid_spatial`` and ``extent`` from doc[grids|crs|geometry].

    ``geo_ref_points`` are taken from the 4 corners of the default grid only,
    while lon/lat bounds are computed across all grids unless a tighter valid
    region is supplied via the ``geometry`` key, in which case that geometry
    determines the lon/lat bounds instead.

    inputs:
    ```
    crs: "<:str>"
    geometry: <:GeoJSON object>  # optional
    grids:
       default:
          shape: [ny: int, nx: int]
          transform: [a0, a1, a2, a3, a4, a5, 0, 0, 1]
       <...>  # optionally more grids
    ```

    Where transform is a linear mapping matrix from pixel space to projected
    space encoded in row-major order:

       [X]   [a0, a1, a2] [ Pixel]
       [Y] = [a3, a4, a5] [ Line ]
       [1]   [ 0,  0,  1] [  1   ]

    outputs:
    ```
    extent:
      lat: {begin=<>, end=<>}
      lon: {begin=<>, end=<>}

    grid_spatial:
      projection:
        spatial_reference: "<crs>"
        geo_ref_points: {ll: {x:<>, y:<>}, ...}
        valid_data: {...}
    ```
    """
    default_grid = toolz.get_in(['grids', 'default'], doc, None)
    crs = doc.get('crs', None)
    if default_grid is None or crs is None:
        raise ValueError("Input must have crs and grids.default")

    projection = {
        'spatial_reference': crs,
        'geo_ref_points': grid2ref_points(default_grid),
    }
    geom = doc.get('geometry')
    if geom is not None:
        # An explicit geometry defines the tighter valid-data region.
        projection['valid_data'] = geom

    oo = {'grid_spatial': {'projection': projection}}

    x1, y1, x2, y2 = eo3_lonlat_bbox(doc, resolution=resolution)
    oo['extent'] = {
        'lon': {'begin': x1, 'end': x2},
        'lat': {'begin': y1, 'end': y2},
    }
    return oo
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [
        utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)
    ]
    # Nothing to do when no sample uses the shared SV binning approach.
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    # NOTE(review): itertools.groupby only merges *adjacent* equal keys --
    # confirm `items` arrives pre-sorted/grouped by bin group upstream.
    for group_id, gitems in itertools.groupby(
            items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        # Flatten target/antitarget coverage files from the background samples.
        cnns = reduce(operator.add, [[
            tz.get_in(["depth", "bins", "target"], x),
            tz.get_in(["depth", "bins", "antitarget"], x)
        ] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(
            dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        # Prefer a user-supplied background reference when configured.
        input_backs = set(
            filter(lambda x: x is not None,
                   [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, \
                "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(
                cnns,
                os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "structural",
                             dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(
                    work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append(
                    (tz.get_in(["depth", "bins", "target"], data),
                     tz.get_in(["depth", "bins", "antitarget"], data),
                     back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs,
                      inputs[0]["config"], parallel)
    # Attach background and normalized outputs back onto each sample.
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.

    Accepts either the standard (batch_id, samples, data) triple or unpacked
    CWL per-sample inputs. Runs ensemble intersection (or classifier-based
    ensemble when configured), re-annotates effects, and validates the result.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        # CWL passes individual packed samples; batch info lives on the first.
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(f, data, passonly=passonly,
                                rerun_effects=False,
                                remove_oldeffects=True,
                                work_dir=utils.safe_makedir(
                                    os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)
        ]
        # Intersection-based ensemble unless explicit classifiers configured.
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files,
                                                  caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(
                callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        # No variants anywhere: write an empty ensemble VCF placeholder.
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
            "bed_file": None
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Builds per-input expected CNVkit outputs (.cnr/.cns plus a shared
    background .cnn), then — if the first output is missing — computes
    coverage, a background reference, fixes and segments each evaluation
    sample.  Returns the list of per-input output dicts (``ckouts``).

    NOTE(review): Python 2 idioms here — ``zip(...) + zip(...)`` assumes
    zip returns lists, and ``reduce`` is used unimported at this scope.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    # Background reference is named after the first background sample, or
    # "flat" when running without matched backgrounds.
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input),
                                                 cur_raw_work_dir, cur_input)
        # Prefer pre-existing outputs under the old naming scheme.
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    # Only the first sample's .cns is checked as the marker for a prior run.
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        # Tag each sample with its role so coverage helpers know whether it
        # contributes to the background reference or is being evaluated.
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
            zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"],
                                       inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_general_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_original_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:  # deliberately disabled branch, kept for reference
            coverage_cnns = reduce(operator.add, [
                _cnvkit_metrics(cnns, target_bed, antitarget_bed,
                                cov_interval, inputs + backgrounds)
                for cnns in tz.groupby("bam", raw_coverage_cnns).values()
            ])
            background_cnn = _cnvkit_background(
                _select_background_cnns(coverage_cnns), background_cnn,
                target_bed, antitarget_bed, inputs[0])
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = _cnvkit_background([
                x["file"] for x in coverage_cnns
                if x["itype"] == "background"
            ], background_cnn, target_bed, antitarget_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        # Fix (normalize against background) each evaluation sample's
        # coverage, grouped per BAM, then segment.
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby(
                "bam",
                [x for x in coverage_cnns
                 if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        # List comprehension used purely for side effects (segmentation runs).
        [
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
            for cnr, data in fixed_cnrs
        ]
    return ckouts
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None,
                        integrations=None):
    """Read run information from a passed YAML file.

    Parses the run info YAML (optionally with a top-level dict wrapping a
    ``details`` sample list plus global config), normalizes each sample
    item (lanes, descriptions, upload targets, algorithm defaults, read
    group names, file paths), merges global resources and integration
    config into each item, validates the result, and returns the list of
    normalized sample dicts.

    Raises ValueError when an item lacks a ``description`` (and is not a
    BAM whose sample name can be inferred) or when a ``vrn_file`` input
    has no metadata batch.
    """
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input and deprecated in newer PyYAML; consider
        # yaml.safe_load for this plain-mapping config format.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Non-standard flowcell directory names are fine; fall back to
            # values from the YAML (if present) or None.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    # A dict at top level means samples live under "details" with global
    # settings alongside; a list means it is the sample list directly.
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        # Restrict processing to explicitly requested samples.
        loaded = [x for x in loaded if x["description"] in sample_names]
    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default lanes to 1-based input order.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"],
                                             [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        # Skip downloads when any remote integration retriever is active.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=all(not x for x in integrations.values()))
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(
                    f,
                    do_download=all(not x for x in integrations.values()))
                for f in item["files"]
            ]
        elif "files" in item:
            # Present but empty/falsy: drop the key entirely.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"],
                do_download=all(not x for x in integrations.values()))
            if os.path.isfile(item["vrn_file"]):
                # Keep the original file; store the indexed copy under inputs/.
                item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"],
                                                            config,
                                                            remove_orig=False,
                                                            out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n"
                    % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation."
                )
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Merge per-integration (e.g. arvados) settings into each item.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.

    Collects provenance, logs, summary reports, variant databases,
    coverage outputs and the various count/quantification files into a
    list of upload descriptors, finalized via ``_add_meta``.
    """
    out = [{"path": sample["provenance"]["programs"]}]
    # Top-level bcbio log files, when present.
    for log_name in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]:
        log_path = os.path.join(log.get_log_dir(sample["config"]), log_name)
        if os.path.exists(log_path):
            out.append({"path": log_path,
                        "type": "external_command_log",
                        "ext": ""})
    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    if tz.get_in(["summary", "mixup_check"], sample):
        out.append({"path": sample["summary"]["mixup_check"],
                    "type": "directory",
                    "ext": "mixup_check"})
    report_dir = os.path.join(dd.get_work_dir(sample), "report")
    if utils.file_exists(report_dir):
        out.append({"path": report_dir, "type": "directory", "ext": "report"})
    if sample.get("seqcluster", None):
        out.append({"path": sample["seqcluster"],
                    "type": "directory",
                    "ext": "seqcluster"})
    variants = sample.get("variants", [])
    # Population databases attached directly to a variant callset.
    for variant in variants:
        if "pop_db" in variant:
            out.append({"path": variant["pop_db"],
                        "type": "sqlite",
                        "variantcaller": variant["variantcaller"]})
    # Population databases and VCFs nested under a "population" key.
    for variant in variants:
        if "population" in variant:
            pop_db = tz.get_in(["population", "db"], variant)
            if pop_db:
                out.append({"path": pop_db,
                            "type": "sqlite",
                            "variantcaller": variant["variantcaller"]})
            out.extend(_get_variant_file(variant, ("population", "vcf")))
    # Only the first validation grading summary is uploaded.
    for variant in variants:
        if variant.get("validate") and variant["validate"].get("grading_summary"):
            out.append({"path": variant["validate"]["grading_summary"]})
            break
    if "coverage" in sample:
        cov_db = tz.get_in(["coverage", "summary"], sample)
        if cov_db:
            out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"})
        all_coverage = tz.get_in(["coverage", "all"], sample)
        if all_coverage:
            out.append({"path": all_coverage, "type": "bed", "ext": "coverage"})
    # Simple bare-path outputs exposed through datadict getters; order is
    # significant and matches the original upload ordering.
    path_getters = [
        dd.get_mirna_counts,
        dd.get_isomir_counts,
        dd.get_combined_counts,
        dd.get_annotated_combined_counts,
        dd.get_combined_fpkm,
        dd.get_combined_fpkm_isoform,
        dd.get_assembled_gtf,
        dd.get_dexseq_counts,
        dd.get_express_counts,
        dd.get_express_fpkm,
        dd.get_express_tpm,
        dd.get_isoform_to_gene,
        dd.get_square_vcf,
        dd.get_sailfish_tidy,
        dd.get_sailfish_transcript_tpm,
        dd.get_sailfish_gene_tpm,
    ]
    for getter in path_getters:
        fpath = getter(sample)
        if fpath:
            out.append({"path": fpath})
    return _add_meta(out, config=upload_config)
def convert_sdmx_element(element, dataset_json, dataset_context, dsd_infos,
                         series_jsonl_file):
    """Convert one parsed SDMX XML element into DBnomics JSON structures.

    Handles two element kinds:
    - ``Obs``: accumulates one observation row into
      ``dataset_context["current_series_observations"]`` and fills
      attribute labels into ``dataset_json``.
    - ``Series``: fills dimension labels into ``dataset_json``, writes the
      completed series (with its accumulated observations) as one line of
      the series JSON-lines file, records its byte offset, and resets the
      accumulator.

    Mutates ``dataset_json`` and ``dataset_context`` in place and updates
    the module-level ``timings`` counters.
    """
    global timings
    # Due to event=end, given to iterparse, we receive <Obs> then <Series> elements, in this order.
    if element.tag.endswith("Series"):
        # Ignore some specific XML element attributes corresponding to series SDMX attributes,
        # because series SDMX attributes do not exist in DBnomics.
        series_element_attributes = OrderedDict([
            (attribute_key, attribute_value)
            for attribute_key, attribute_value in element.attrib.items()
            if attribute_key not in {"TIME_FORMAT"}  # Redundant with FREQ.
        ])
        dimensions_codes_order = list(series_element_attributes.keys())
        if dataset_json["dimensions_codes_order"] is None:
            dataset_json["dimensions_codes_order"] = dimensions_codes_order
        else:
            # dimensions_codes_order must not change between series.
            assert dataset_json["dimensions_codes_order"] == dimensions_codes_order, \
                (dataset_json["dimensions_codes_order"], dimensions_codes_order)
        # Fill series dimensions labels in dataset.json.
        t0 = time.time()
        for dimension_code, dimension_value_code in series_element_attributes.items():
            if dimension_code not in dataset_json["dimensions_labels"]:
                dimension_label = dsd_infos["concepts"].get(dimension_code)
                # NOTE(review): the inner "not in" re-check duplicates the
                # outer condition; harmless but redundant.
                if dimension_label and dimension_code not in dataset_json[
                        "dimensions_labels"]:
                    # Some dimensions labels are an empty string: e.g. bs_bs12_04.sdmx.xml
                    dataset_json["dimensions_labels"][
                        dimension_code] = dimension_label
            # Skip label lookup when this dimension value is already labeled.
            if dimension_code in dataset_json["dimensions_values_labels"] and \
                    dimension_value_code in dataset_json["dimensions_values_labels"][dimension_code]:
                continue
            codelist_code = dsd_infos["codelist_by_concept"][dimension_code]
            dimension_value_label = get_in(
                [codelist_code, dimension_value_code], dsd_infos["codelists"])
            if dimension_value_label:
                dataset_json["dimensions_values_labels"].setdefault(
                    dimension_code,
                    {})[dimension_value_code] = dimension_value_label
        timings["series_labels"] += time.time() - t0
        # Series code is not defined by provider: create it from dimensions values codes.
        series_code = ".".join(series_element_attributes[dimension_code]
                               for dimension_code in dimensions_codes_order)
        # Write series JSON to file.
        t0 = time.time()
        observations_header = [["PERIOD", "VALUE"] + dsd_infos["attributes"]]
        series_json = {
            "code": series_code,
            "dimensions": [
                series_element_attributes[
                    dimension_code]  # Every dimension MUST be defined for each series.
                for dimension_code in dimensions_codes_order
            ],
            "observations":
            observations_header + dataset_context["current_series_observations"],
        }
        # Remember where this series starts in the JSON-lines file so it can
        # be seeked to later without re-parsing.
        dataset_context["observations_offsets"][
            series_code] = series_jsonl_file.tell()
        json.dump(series_json, series_jsonl_file, ensure_ascii=False,
                  sort_keys=True)
        series_jsonl_file.write("\n")
        timings["series_file"] += time.time() - t0
        # Reset context for next series.
        dataset_context["current_series_observations"] = []
    elif element.tag.endswith("Obs"):
        # Fill observations attributes labels in dataset.json.
        t0 = time.time()
        for attribute_code, attribute_value_code in element.attrib.items():
            # Ignore period and value observations XML attributes, because they don't need labels.
            if attribute_code in ["TIME_PERIOD", "OBS_VALUE"]:
                continue
            attribute_label = dsd_infos["concepts"].get(attribute_code)
            if attribute_label and attribute_code not in dataset_json[
                    "attributes_labels"]:
                dataset_json["attributes_labels"][
                    attribute_code] = attribute_label
            # Some attributes values codes are multi-valued and concatenated into the same string.
            attribute_value_codes = list(attribute_value_code) \
                if attribute_code == "OBS_STATUS" \
                else [attribute_value_code]
            for attribute_value_code in attribute_value_codes:
                if attribute_code in dataset_json["attributes_values_labels"] and \
                        attribute_value_code in dataset_json["attributes_values_labels"][attribute_code]:
                    continue
                codelist_code = dsd_infos["codelist_by_concept"][
                    attribute_code]
                attribute_value_label = get_in(
                    [codelist_code, attribute_value_code],
                    dsd_infos["codelists"])
                if attribute_value_label:
                    dataset_json["attributes_values_labels"].setdefault(
                        attribute_code,
                        {})[attribute_value_code] = attribute_value_label
        timings["observations_labels"] += time.time() - t0
        obs_value = element.attrib.get("OBS_VALUE")
        if obs_value is not None:
            obs_value = observations.value_to_float(obs_value)
        dataset_context["current_series_observations"].append([
            element.
            attrib["TIME_PERIOD"],  # SDMX periods are already normalized.
            obs_value,
        ] + [
            element.attrib.get(attribute_name, "")
            for attribute_name in dsd_infos["attributes"]
        ])