def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Grades the inputs with ``validate.summarize_grading`` and converts each
    result with ``utils.to_single_data``. For every item that has a
    ``vrn_file``, the variant file is copied to
    ``<work_dir>/variants/<calls|gvcf>/<name>-<variantcaller>.vcf.gz`` and
    passed to ``vcfutils.bgzip_and_index``.

    When an item also carries ``vrn_file_joint``, its ``vrn_file`` is written
    to the "gvcf" output under the sample name and ``vrn_file_joint`` goes to
    the "calls" output under the batch name. Otherwise ``vrn_file`` goes to
    "calls" under the batch name. The batch name is the first entry of
    ``dd.get_batches``, falling back to the sample name.

    Args:
        items: inputs accepted by ``validate.summarize_grading``.

    Returns:
        A one-element list with a dict holding "validate" (taken from the
        first graded item) and "variants" with "calls" and "gvcf" path lists.

    Raises:
        IndexError: if grading produces no items.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    # De-duplicate on (output kind, name), not on the bare name. With no batch
    # configured the batch name equals the sample name, so the gVCF entry and
    # the joint "calls" entry share a name; keying on the name alone dropped
    # the joint calls file even though it lives in a different directory.
    added = set()
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is not None:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            dedup_key = (out_key, cur_name)
            if dedup_key in added:
                continue
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            added.add(dedup_key)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    return [out]
def run(self, config, config_file, parallel, dirs, samples):
    """Run the variant pipeline over ``samples`` in four sequential stages.

    Each stage opens a ``prun.start`` context (given the tool names wrapped by
    ``_wres``, the current samples and a stage label) and uses the
    ``run_parallel`` callable it yields. Every step rebinds ``samples`` to the
    step's result, so the order of statements is significant. Steps are
    wrapped in ``profile.report`` with a human-readable label.

    ``config_file`` is accepted but not read in this body.

    Returns the final ``samples`` value after the archive step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                          (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # The whole sample list is passed as a single argument here.
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants (per-sample cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]),
                    samples, config, dirs, "persample") as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)

    ## Finalizing BAMs and population databases, handle multicore computation
    with prun.start(_wres(parallel, ["gemini", "samtools", "fastqc", "bamtools", "bcbio_variation",
                                     "bcbio-variation-recall"]),
                    samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Grades ``items`` via ``validate.summarize_grading``, then copies the
    ``vrn_file`` of each graded item into ``<work_dir>/variants/calls`` as
    ``<batch>-<variantcaller>.vcf.gz`` (first batch name, or the sample name
    when there are no batches), running ``vcfutils.bgzip_and_index`` on the
    copy. Each output name is written only once.

    Returns a one-element list holding the "validate" entry of the first
    graded item and the list of copied call files.
    """
    graded = [utils.to_single_data(entry) for entry in validate.summarize_grading(items)]
    call_files = []
    summary = {"validate": graded[0]["validate"], "variants": {"calls": call_files}}
    seen = set()
    for sample in graded:
        if not sample.get("vrn_file"):
            continue
        batches = dd.get_batches(sample) or [dd.get_sample_name(sample)]
        label = "%s-%s" % (batches[0], dd.get_variantcaller(sample))
        if label in seen:
            continue
        calls_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(sample), "variants", "calls"))
        target = os.path.join(calls_dir, "%s.vcf.gz" % label)
        seen.add(label)
        # Ideally could symlink here but doesn't appear to work with
        # Docker container runs on Toil where PATHs don't get remapped
        utils.copy_plus(os.path.realpath(sample["vrn_file"]), target)
        vcfutils.bgzip_and_index(target, sample["config"])
        call_files.append(target)
    return [summary]
def run(self, config, config_file, parallel, dirs, samples):
    """Run the variant pipeline over ``samples`` in four sequential stages.

    Each stage opens a ``prun.start`` context (given the tool names wrapped by
    ``_wres``, the current samples and a stage label) and uses the
    ``run_parallel`` callable it yields. Every step rebinds ``samples`` to the
    step's result, so statement order matters. Steps are wrapped in
    ``profile.report`` with a descriptive label.

    ``config_file`` is accepted but not read in this body.

    Returns the final ``samples`` value after the archive step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                          (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # The whole sample list is passed as a single argument here.
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants (per-sample cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]),
                    samples, config, dirs, "persample") as run_parallel:
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)

    ## Finalizing BAMs and population databases, handle multicore computation
    with prun.start(_wres(parallel, ["gemini", "samtools", "fastqc", "bamtools", "bcbio_variation",
                                     "bcbio-variation-recall"]),
                    samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    """Run the variant pipeline starting from ``lane_items`` in four stages.

    Each stage enters ``global_parallel`` with a stage label and a list of
    task names, rebinds ``parallel`` to the value it yields, and builds a
    fresh ``run_parallel`` with ``parallel_runner``. The ``run_parallel``
    argument supplied by the caller is therefore overwritten before it is
    ever used. ``samples`` is rebound by every step, so order matters.

    Returns the final ``samples`` value.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["align_prep_full"],
                         lane_items, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        # Each lane item is extended with the configuration file path.
        samples = run_parallel("align_prep_full", [list(x) + [config_file] for x in lane_items])
        # NOTE(review): `callable` must be a module imported by this file; the
        # builtin of the same name has no combine_sample_regions attribute.
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
        logger.info("Timing: coverage")
        samples = coverage.summarize_samples(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"],
                         samples, dirs, config,
                         multiplier=len(regions["analysis"])) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)

    ## Finalizing BAMs and population databases, handle multicore computation
    with global_parallel(parallel, "multicore2", ["prep_gemini_db", "delayed_bam_merge"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, samples):
    """Run the variant pipeline over ``samples`` in four stages.

    Each stage enters ``global_parallel`` with a stage label and a list of
    task names, rebinds ``parallel`` to the value it yields, and builds a
    fresh ``run_parallel`` with ``parallel_runner``. The ``run_parallel``
    argument supplied by the caller is overwritten before use, and
    ``config_file`` is not read in this body. ``samples`` is rebound by every
    step, so order matters.

    Returns the final ``samples`` value.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["process_alignment", "postprocess_alignment"],
                         samples, dirs, config,
                         multiplier=alignprep.parallel_multiplier(samples)) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("prep_align_inputs", samples)
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = alignprep.merge_split_alignments(samples, run_parallel)
        samples = disambiguate.resolve(samples, run_parallel)
        samples = run_parallel("postprocess_alignment", samples)
        # NOTE(review): `callable` must be a module imported by this file; the
        # builtin of the same name has no combine_sample_regions attribute.
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
        logger.info("Timing: coverage")
        samples = coverage.summarize_samples(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"],
                         samples, dirs, config,
                         multiplier=len(regions["analysis"]), max_multicore=1) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)

    ## Finalizing BAMs and population databases, handle multicore computation
    with global_parallel(parallel, "multicore2", ["prep_gemini_db", "delayed_bam_merge"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: structural variation")
        samples = structural.run(samples, run_parallel)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Flattens and normalizes ``items``, grades them with
    ``validate.summarize_grading``, and copies each item's variant files to
    ``<work_dir>/variants/<calls|gvcf>/<name>-<variantcaller>.vcf.gz``,
    running ``vcfutils.bgzip_and_index`` on each copy.

    When an item carries ``vrn_file_joint``, its ``vrn_file`` goes to "gvcf"
    under the sample name and ``vrn_file_joint`` goes to "calls" under the
    batch name. Otherwise ``vrn_file`` goes to "calls" under the batch name.

    Args:
        items: nested inputs accepted by ``utils.flatten``.

    Returns:
        A one-element list with a dict holding "validate" (from
        ``validate.combine_validations``) and "variants" with "calls",
        "gvcf" and "samples". "samples" has one list of output paths per
        sample, in the order samples were first seen.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    # De-duplicate on (output kind, name), not on the bare name. When the
    # batch name equals the sample name, the gVCF and joint "calls" entries
    # share a name. Keying on the name alone skipped the calls copy while the
    # per-sample listing below still pointed at that never-written file.
    added = set()
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is not None:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                variants_by_sample[s].append(out_file)
            dedup_key = (out_key, cur_name)
            if dedup_key not in added:
                added.add(dedup_key)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    Grades ``items`` with ``vcvalidate.summarize_grading`` under the
    "svvalidate" key. For each graded item with an "sv" entry whose
    ``vrn_file`` extension starts with ".vcf", the file is copied to
    ``<work_dir>/sv/calls/<batch>-<variantcaller><ext>`` and passed through
    ``vcfutils.bgzip_and_index``; each name is handled once. Prioritization
    results ("sv-prioritize" caller) are then collected per sample group.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = [utils.to_single_data(entry)
              for entry in vcvalidate.summarize_grading(items, "svvalidate")]
    call_files = []
    priority_tsv = []
    priority_raw = []
    summary = {"sv": {"calls": call_files,
                      "prioritize": {"tsv": priority_tsv, "raw": priority_raw}},
               "svvalidate": vcvalidate.combine_validations(graded, "svvalidate")}
    seen = set()
    # Standard callers
    for sample in graded:
        sv_info = sample.get("sv")
        if not sv_info:
            continue
        batches = dd.get_batches(sample) or [dd.get_sample_name(sample)]
        label = "%s-%s" % (batches[0], sv_info["variantcaller"])
        vrn_file = sv_info.get("vrn_file")
        if not vrn_file:
            continue
        ext = utils.splitext_plus(vrn_file)[-1]
        if label in seen or not ext.startswith(".vcf"):
            continue
        seen.add(label)
        calls_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(sample), "sv", "calls"))
        copied = os.path.join(calls_dir, "%s%s" % (label, ext))
        utils.copy_plus(vrn_file, copied)
        call_files.append(vcfutils.bgzip_and_index(copied, sample["config"]))
    # prioritization
    for grouped in _group_by_sample(graded):
        prioritized = prioritize.run([utils.deepish_copy(grouped)])[0]
        hits = [sv for sv in prioritized.get("sv", [])
                if sv["variantcaller"] == "sv-prioritize"]
        if hits:
            first_hit = hits[0]
            priority_tsv.append(first_hit["vrn_file"])
            priority_raw.extend(first_hit["raw_files"].values())
    return [summary]
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    """Run the variant pipeline starting from ``lane_items`` in three stages.

    Each stage enters ``global_parallel`` (given ``dirs["work"]``), rebinds
    ``parallel`` to the value it yields, and builds a fresh ``run_parallel``
    with ``parallel_runner`` (given the full ``dirs``). The ``run_parallel``
    argument supplied by the caller is overwritten before use. ``samples`` is
    rebound by every step, so order matters.

    Returns the final ``samples`` value.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["align_prep_full"],
                         lane_items, dirs["work"], config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        # Each lane item is extended with the configuration file path.
        samples = run_parallel("align_prep_full", [list(x) + [config_file] for x in lane_items])
        # NOTE(review): `callable` must be a module imported by this file; the
        # builtin of the same name has no combine_sample_regions attribute.
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"],
                         samples, dirs["work"], config,
                         multiplier=len(regions["analysis"])) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs["work"], config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = validate.summarize_grading(samples)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, dirs, lane_items):
    """Run the variant pipeline from ``lane_items`` using one ``run_parallel``.

    Every step rebinds ``samples`` to the step's result, so the statement
    order is significant. ``config`` and ``dirs`` are accepted but not read
    in this body.

    Returns the ``samples`` value produced by ``validate.summarize_grading``.
    """
    # Handle alignment and preparation requiring the entire input file
    # (arguments are passed as a generator, one extended list per lane item)
    samples = run_parallel("align_prep_full",
                           (list(x) + [config_file] for x in lane_items))
    # NOTE(review): `callable` must be a module imported by this file; the
    # builtin of the same name has no combine_sample_regions attribute.
    regions = callable.combine_sample_regions(samples)
    samples = region.add_region_info(samples, regions)
    # Handle all variant calling on sub-regions of the input file
    samples = region.clean_sample_data(samples)
    samples = region.parallel_prep_region(samples, regions, run_parallel)
    samples = region.parallel_variantcall_region(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = ensemble.combine_calls_parallel(samples, run_parallel)
    samples = population.prep_db_parallel(samples, run_parallel)
    samples = region.delayed_bamprep_merge(samples, run_parallel)
    samples = qcsummary.generate_parallel(samples, run_parallel)
    samples = validate.summarize_grading(samples)
    return samples
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Flattens and normalizes ``items``, grades them with
    ``validate.summarize_grading``, and copies each item's variant files to
    ``<work_dir>/variants/<calls|gvcf>/<name>-<variantcaller>.vcf.gz``,
    running ``vcfutils.bgzip_and_index`` on each copy.

    When an item carries ``vrn_file_joint``, its ``vrn_file`` goes to "gvcf"
    under the sample name and ``vrn_file_joint`` goes to "calls" under the
    batch name. Otherwise ``vrn_file`` goes to "calls" under the batch name.

    Args:
        items: nested inputs accepted by ``utils.flatten``.

    Returns:
        A one-element list with a dict holding "validate" (from
        ``validate.combine_validations``) and "variants" with "calls",
        "gvcf" and "samples". "samples" has one list of output paths per
        sample, in the order samples were first seen.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    # Track written outputs per (output kind, name). The gVCF and joint
    # "calls" entries of one item share a name whenever the batch name falls
    # back to the sample name; a name-only key then skipped copying the calls
    # file although its path was still reported in the per-sample listing.
    added = set()
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is not None:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                variants_by_sample[s].append(out_file)
            dedup_key = (out_key, cur_name)
            if dedup_key not in added:
                added.add(dedup_key)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    Grades ``items`` with ``vcvalidate.summarize_grading`` under the
    "svvalidate" key. For each graded item with an "sv" entry whose
    ``vrn_file`` extension starts with ".vcf", the file is copied to
    ``<work_dir>/sv/calls/<batch>-<variantcaller><ext>`` and passed through
    ``vcfutils.bgzip_and_index``. Each output name is handled only once.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = [utils.to_single_data(entry)
              for entry in vcvalidate.summarize_grading(items, "svvalidate")]
    call_files = []
    summary = {"sv": {"calls": call_files},
               "svvalidate": vcvalidate.combine_validations(graded, "svvalidate")}
    seen = set()
    for sample in graded:
        sv_info = sample.get("sv")
        if not sv_info:
            continue
        batches = dd.get_batches(sample) or [dd.get_sample_name(sample)]
        label = "%s-%s" % (batches[0], sv_info["variantcaller"])
        vrn_file = sv_info.get("vrn_file")
        if not vrn_file:
            continue
        ext = utils.splitext_plus(vrn_file)[-1]
        if label in seen or not ext.startswith(".vcf"):
            continue
        seen.add(label)
        calls_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(sample), "sv", "calls"))
        copied = os.path.join(calls_dir, "%s%s" % (label, ext))
        utils.copy_plus(vrn_file, copied)
        call_files.append(vcfutils.bgzip_and_index(copied, sample["config"]))
    return [summary]
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    Grades ``items`` with ``vcvalidate.summarize_grading`` under the
    "svvalidate" key. For each graded item with an "sv" entry:

    - a ``vrn_file`` whose extension starts with ".vcf" is copied to
      ``<work_dir>/sv/calls/<basename><ext>`` (basename from
      ``_useful_basename``) and passed through ``vcfutils.bgzip_and_index``;
      each basename is handled once;
    - truthy entries of "supplemental" are appended to the output.

    Prioritization results ("sv-prioritize" caller) are then collected per
    sample group.

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = [utils.to_single_data(entry)
              for entry in vcvalidate.summarize_grading(items, "svvalidate")]
    call_files = []
    supplemental = []
    priority_tsv = []
    priority_raw = []
    summary = {"sv": {"calls": call_files,
                      "supplemental": supplemental,
                      "prioritize": {"tsv": priority_tsv, "raw": priority_raw}},
               "svvalidate": vcvalidate.combine_validations(graded, "svvalidate")}
    seen = set()
    # Standard callers
    for sample in graded:
        sv_info = sample.get("sv")
        if not sv_info:
            continue
        vrn_file = sv_info.get("vrn_file")
        if vrn_file:
            ext = utils.splitext_plus(vrn_file)[-1]
            label = _useful_basename(sample)
            if label not in seen and ext.startswith(".vcf"):
                seen.add(label)
                calls_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(sample), "sv", "calls"))
                copied = os.path.join(calls_dir, "%s%s" % (label, ext))
                utils.copy_plus(vrn_file, copied)
                call_files.append(vcfutils.bgzip_and_index(copied, sample["config"]))
        extras = sv_info.get("supplemental")
        if extras:
            supplemental.extend([extra for extra in extras if extra])
    # prioritization
    for grouped in _group_by_sample(graded):
        prioritized = prioritize.run([utils.deepish_copy(grouped)])[0]
        hits = [sv for sv in prioritized.get("sv", [])
                if sv["variantcaller"] == "sv-prioritize"]
        if hits:
            first_hit = hits[0]
            priority_tsv.append(first_hit["vrn_file"])
            priority_raw.extend(first_hit["raw_files"].values())
    return [summary]
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run the variant calling pipeline over ``samples`` in three stages.

    Each stage opens a ``prun.start`` context (given the tool names wrapped by
    ``_wres``, the current samples and a stage label) and uses the
    ``run_parallel`` callable it yields. Every step rebinds ``samples`` to the
    step's result, so statement order matters. Steps are wrapped in
    ``profile.report``; ``ww.report`` is called with a step name and the
    current samples after selected steps.

    Returns the ``samples`` value produced by the "upload_samples" step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                          (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            # A single job receives dirs, config, the run info YAML and the
            # description of the first element of every input sample.
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
            ww = initialize_watcher(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                                     "gemini", "samtools", "fastqc", "sambamba",
                                     "bcbio-variation-recall", "qsignature",
                                     "svcaller"]),
                    samples, config, dirs, "multicore2",
                    multiplier=structural.parallel_multiplier(samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        # structural.run is invoked four times with a different third
        # argument each time: "precall", "initial", "standard", "ensemble".
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_parallel, "precall")
        # NOTE(review): the next two steps share the same report label.
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Project-level upload runs once per sample; its result is discarded.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def summarize_grading_vc(*args):
    """Forward all positional arguments to ``validate.summarize_grading``.

    Returns whatever ``validate.summarize_grading`` returns, unchanged.
    """
    graded = validate.summarize_grading(*args)
    return graded
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run the variant calling pipeline over ``samples`` in three stages.

    Each stage opens a ``prun.start`` context (given the tool names wrapped by
    ``_wres``, the current samples and a stage label) and uses the
    ``run_parallel`` callable it yields. Every step rebinds ``samples`` to the
    step's result, so statement order matters. Steps are wrapped in
    ``profile.report``; ``ww.report`` is called with a step name and the
    current samples after selected steps.

    Returns the ``samples`` value produced by the "upload_samples" step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                          (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            # A single job receives dirs, config, the run info YAML and the
            # description of the first element of every input sample.
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
            # The watcher is switched on when dd.get_cwl_reporting is truthy
            # for the first element of any organized sample.
            ww = WorldWatcher(dirs["work"], is_on=any([dd.get_cwl_reporting(d[0]) for d in samples]))
            ww.initialize(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("structural variation initial", dirs):
            samples = structural.run(samples, run_parallel, "initial")
            ww.report("sv_initial", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                                     "gemini", "samtools", "fastqc", "bamtools",
                                     "bcbio-variation-recall", "qsignature",
                                     "svcaller"]),
                    samples, config, dirs, "multicore2",
                    multiplier=structural.parallel_multiplier(samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        # The "initial" structural pass ran in the first stage; the
        # "standard" and "ensemble" passes follow here.
        with profile.report("structural variation final", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Project-level upload runs once per sample; its result is discarded.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples