def _get_input_files(samples, base_dir, tx_out_dir):
    """Retrieve input files, keyed by sample and QC method name.

    Stages files into the work directory to ensure correct names for
    MultiQC sample assessment when running with CWL.
    """
    in_files = collections.defaultdict(list)
    for data in samples:
        sum_qc = tz.get_in(["summary", "qc"], data, {})
        if sum_qc in [None, "None"]:
            sum_qc = {}
        elif isinstance(sum_qc, six.string_types):
            sum_qc = {dd.get_algorithm_qc(data)[0]: sum_qc}
        elif not isinstance(sum_qc, dict):
            raise ValueError("Unexpected summary qc: %s" % sum_qc)
        for program, pfiles in sum_qc.items():
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles.get("secondary", [])
            # CWL: presents output files as single file plus associated secondary files
            elif isinstance(pfiles, six.string_types):
                if os.path.exists(pfiles):
                    pfiles = [os.path.join(basedir, f) for basedir, subdir, filenames in
                              os.walk(os.path.dirname(pfiles)) for f in filenames]
                else:
                    pfiles = []
            in_files[(dd.get_sample_name(data), program)].extend(pfiles)
    staged_files = []
    for (sample, program), files in in_files.items():
        cur_dir = utils.safe_makedir(os.path.join(base_dir, "inputs", sample, program))
        for f in files:
            if _check_multiqc_input(f) and _is_good_file_for_multiqc(f):
                if _in_temp_directory(f) or any([cwlutils.is_cwl_run(d) for d in samples]):
                    staged_f = os.path.join(cur_dir, os.path.basename(f))
                    shutil.copy(f, staged_f)
                    staged_files.append(staged_f)
                else:
                    staged_files.append(f)
    staged_files.extend(get_qsig_multiqc_files(samples))
    # Back compatible -- to migrate to explicit specifications in input YAML
    if not any([cwlutils.is_cwl_run(d) for d in samples]):
        staged_files += ["trimmed", "htseq-count/*summary"]
    # Add in created target_info file
    if os.path.isfile(os.path.join(base_dir, "report", "metrics", "target_info.yaml")):
        staged_files += [os.path.join(base_dir, "report", "metrics", "target_info.yaml")]
    return sorted(list(set(staged_files)))

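# The staging decision above depends on _in_temp_directory(), which is defined
# elsewhere. A minimal sketch of the idea, assuming temporary/transactional
# locations are recognizable from their path components (helper name and check
# are hypothetical, not the library's implementation):

def _in_temp_directory_sketch(fname):
    """Hypothetical: flag files living under tmp/tx-style scratch directories,
    which must be copied out before MultiQC runs since they get cleaned up."""
    return any(part.startswith(("tmp", "tx"))
               for part in os.path.abspath(fname).split(os.sep))
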
def _save_uploaded_file_list(samples, file_list_work, out_dir):
    """Fixes all absolute work-rooted paths to relative final-rooted paths.

    For CWL, prepare paths relative to output directory.
    """
    if not utils.file_exists(file_list_work):
        return None

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        upload_paths = []
        with open(file_list_work) as f:
            for p in (l.strip() for l in f.readlines() if os.path.exists(l.strip())):
                if p.startswith(out_dir):
                    upload_paths.append(p.replace(out_dir + "/", ""))
    else:
        upload_path_mapping = dict()
        for sample in samples:
            upload_path_mapping.update(get_all_upload_paths_from_sample(sample))
        if not upload_path_mapping:
            return None
        with open(file_list_work) as f:
            paths = [l.strip() for l in f.readlines() if os.path.exists(l.strip())]
        upload_paths = [p for p in [
            _work_path_to_rel_final_path(path, upload_path_mapping,
                                         samples[0]["upload"]["dir"])
            for path in paths
        ] if p]
        if not upload_paths:
            return None

    file_list_final = os.path.join(out_dir, "list_files_final.txt")
    with open(file_list_final, "w") as f:
        for path in upload_paths:
            f.write(path + '\n')
    return file_list_final

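# _work_path_to_rel_final_path() is defined elsewhere; it translates an absolute
# work-directory path into its uploaded location, returned relative to the final
# upload directory. A minimal sketch, assuming upload_path_mapping maps work
# paths to their final absolute locations (the helper here is hypothetical):

def _work_path_to_rel_final_path_sketch(work_path, upload_path_mapping, upload_dir):
    """Hypothetical: look up where a work file was uploaded and return that
    location relative to the final upload directory (None if not uploaded)."""
    final_path = upload_path_mapping.get(work_path)
    if final_path:
        return os.path.relpath(final_path, upload_dir)
    return None
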
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data, population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]

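# _symlink_to_workdir() is defined elsewhere; the intent is to make the variant
# file (and variant_regions BED) addressable under the work directory before
# annotation. A minimal sketch, assuming toolz-style nested-key access and an
# explicit work_dir argument (the real helper presumably derives it from data):

def _symlink_to_workdir_sketch(data, key_path, work_dir):
    """Hypothetical: symlink the file referenced by the nested key path into
    work_dir and update the sample dict to point at that copy."""
    fname = tz.get_in(key_path, data)
    if not fname:
        return data
    out_file = os.path.join(work_dir, os.path.basename(fname))
    if not os.path.lexists(out_file):
        os.symlink(os.path.abspath(fname), out_file)
    return tz.update_in(data, key_path, lambda _: out_file)
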
def to_cram(data):
    """Convert BAM archive files into indexed CRAM.
    """
    data = utils.to_single_data(data)
    cram_file = cram.compress(dd.get_work_bam(data) or dd.get_align_bam(data), data)
    out_key = "archive_bam" if cwlutils.is_cwl_run(data) else "work_bam"
    data[out_key] = cram_file
    return [[data]]

def run_qc(_, data, out_dir):
    """Run quality control in QC environment on a single sample.

    Enables peddy integration with CWL runs.
    """
    if cwlutils.is_cwl_run(data):
        qc_data = run_peddy([data], out_dir)
        if tz.get_in(["summary", "qc", "peddy"], qc_data):
            return tz.get_in(["summary", "qc", "peddy"], qc_data)

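# For reference, the per-program QC entry returned here is consumed by
# _get_input_files() above as a dict with a primary file plus optional
# secondary files. An illustrative, hypothetical example of that shape
# (paths and file names are made up):
peddy_qc_example = {
    "base": "qc/peddy/Sample1.ped_check.csv",
    "secondary": ["qc/peddy/Sample1.het_check.csv",
                  "qc/peddy/Sample1.sex_check.csv"],
}
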
def _symlink_or_copy_grabix(in_file, out_file, data):
    """We cannot symlink in CWL, but may be able to use inputs or copy
    """
    if cwlutils.is_cwl_run(data):
        # Has grabix indexes, we're okay to go
        if utils.file_exists(in_file + ".gbi"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file

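# utils.copy_plus() / utils.symlink_plus() are defined elsewhere; the idea is
# that companion index files travel along with the data file. A minimal sketch
# under the assumption that indexes sit next to the file with an added
# extension (helper name and extension list are hypothetical):

_PLUS_EXTS_SKETCH = [".gbi", ".tbi", ".bai", ".idx"]

def copy_plus_sketch(in_file, out_file):
    """Hypothetical: copy a file plus any companion index files found next to it."""
    shutil.copy(in_file, out_file)
    for ext in _PLUS_EXTS_SKETCH:
        if os.path.exists(in_file + ext):
            shutil.copy(in_file + ext, out_file + ext)
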
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data, require_bam=False),
                                                orig_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(data[vrn_key], data)
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]

def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.

    items = sample-sv_caller list, from one batch
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    batch = dd.get_batch(items[0])
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return out
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND and
              not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in names]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out

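# Illustrative only: for CWL runs the single-element "sv" list produced by a
# caller is flattened to a plain dictionary so downstream steps can extract
# outputs without indexing into nested lists (caller name and path below are
# hypothetical):
sv_data_before = {"sv": [{"variantcaller": "manta", "vrn_file": "sv-calls.vcf.gz"}]}
# after the CWL block above:
sv_data_after = {"sv": {"variantcaller": "manta", "vrn_file": "sv-calls.vcf.gz"}}
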
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"),
                                                        toval_data),
                                   toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data, prefix="validateregions-",
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]

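# _gunzip() is defined elsewhere; validation interval files are used
# uncompressed before BED cleaning. A minimal sketch of that step, assuming a
# plain gzip input and writing the uncompressed copy alongside the original
# (helper name and placement are hypothetical):
import gzip

def _gunzip_sketch(in_file):
    """Hypothetical: return an uncompressed version of a .gz file, or the input unchanged."""
    if in_file is None or not in_file.endswith(".gz"):
        return in_file
    out_file = in_file[:-len(".gz")]
    if not os.path.exists(out_file):
        with gzip.open(in_file, "rb") as in_handle, open(out_file, "wb") as out_handle:
            shutil.copyfileobj(in_handle, out_handle)
    return out_file
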
def _handle_precalled(data):
    """Copy in external pre-called variants fed into analysis.

    Copies for non-CWL runs where we want to ensure the VCF is present in a local directory.
    """
    if data.get("vrn_file") and not cwlutils.is_cwl_run(data):
        vrn_file = data["vrn_file"]
        if isinstance(vrn_file, (list, tuple)):
            assert len(vrn_file) == 1
            vrn_file = vrn_file[0]
        precalled_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "precalled"))
        ext = utils.splitext_plus(vrn_file)[-1]
        orig_file = os.path.abspath(vrn_file)
        our_vrn_file = os.path.join(precalled_dir, "%s-precalled%s" % (dd.get_sample_name(data), ext))
        utils.copy_plus(orig_file, our_vrn_file)
        data["vrn_file"] = our_vrn_file
    return data

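# utils.splitext_plus() is defined elsewhere; the point is that compound
# extensions such as .vcf.gz stay together so the staged copy keeps its full
# suffix. A minimal sketch under that assumption (helper name is hypothetical):

def splitext_plus_sketch(fname):
    """Hypothetical: like os.path.splitext, but keeps double extensions (.vcf.gz) intact."""
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

# e.g. splitext_plus_sketch("batch1-calls.vcf.gz") -> ("batch1-calls", ".vcf.gz"),
# so the copied file becomes "<sample>-precalled.vcf.gz".
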
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for size
    chunks of 5Gb or at most 100 splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.

    For CWL runs, we pick larger split sizes to avoid overhead of staging each chunk.
    """
    if cwlutils.is_cwl_run(data):
        target_size = 20  # Gb
        target_size_reads = 80  # million reads
    else:
        target_size = 5  # Gb
        target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = dd.get_align_split_size(data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                if os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set if we have files and are bigger than the target size
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                    int(1e6 * _pick_align_split_size(total_size, target_size,
                                                     target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data

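# The estimate relies on _pick_align_split_size(), defined elsewhere. A minimal
# sketch under the stated assumption that target_size_reads million reads
# correspond to roughly target_size Gb of input; the caller multiplies the
# result by 1e6 to get a read count (helper name and exact formula are
# hypothetical):

def _pick_align_split_size_sketch(total_size, target_size, target_size_reads, max_splits):
    """Hypothetical: choose a per-chunk read count (in millions).

    If splitting at the target chunk size would create more than max_splits
    pieces, grow the chunk so the number of pieces stays near max_splits;
    otherwise use the target reads-per-chunk directly.
    """
    n_splits = total_size / float(target_size)
    if n_splits > max_splits:
        return int(target_size_reads * (n_splits / float(max_splits)))
    return target_size_reads
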
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND and
              not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [x for x in all_items if dd.get_sample_name(x) not in names]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {"summary": tz.get_in(["sv-validate", "csv"], data)}
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out

def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]

def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    config_file = _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -c {config_file} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]

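# _one_exists() is defined elsewhere; MultiQC is only launched when at least one
# of the collected inputs actually resolves to a file. A minimal sketch, assuming
# the list can mix literal paths with glob patterns such as "htseq-count/*summary"
# (helper name is hypothetical):
import glob

def _one_exists_sketch(input_files):
    """Hypothetical: True if any listed input resolves to an existing file."""
    for f in input_files:
        if os.path.exists(f) or glob.glob(f):
            return True
    return False
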