def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    Every sample has its ``config -> algorithm`` section updated in place:
    ``callable_regions``, ``non_callable_regions`` and ``callable_count`` when
    an analysis file is available for its batch, otherwise only
    ``callable_count`` derived from the sample's variant regions (if any).
    ``highdepth_regions`` is added when ``regions -> highdepth`` is set, and
    samples without a ``work_bam`` get a ``work_bam_callable`` borrowed from
    another sample in the same batch.

    Returns a list of single-item lists (``[data]``), one per sample.
    """
    samples = utils.unpack_worlds(samples)
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            # All samples in a batch share one analysis file: count its regions
            # once per batch rather than re-reading the BED file for each sample.
            callable_count = pybedtools.BedTool(analysis_file).count() if analysis_file else None
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = callable_count
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                # (no early exit: the last batch member with a work_bam wins)
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        assert len(out) == len(samples)
        if analysis_files:
            # Statistics are reported for the first analysis file/batch only
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
def summary(*samples):
    """Summarize all quality metrics together.

    Gathers the QC files recorded under ``summary -> qc`` for every sample,
    filters them, writes them to a list file and runs MultiQC on that list
    inside a transactional temporary directory. The resulting report and data
    directory are moved into ``<work_dir>/multiqc``. Nothing is run when the
    report already exists.

    The report and its associated files are attached to the first sample under
    ``summary -> multiqc``. Returns a list of single-item lists, one per sample.
    """
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    file_paths = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems():
            # Normalize each entry to a flat list of paths
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif isinstance(pfiles, basestring):
                pfiles = [pfiles]
            file_paths.extend(pfiles)
    file_paths.append(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    file_paths = list(set(file_paths))
    # Back compatible -- to migrate to explicit specifications in input YAML
    file_paths += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            file_paths = [fpath for fpath in file_paths
                          if _check_multiqc_input(fpath) and _is_good_file_for_multiqc(fpath)]
            input_list_file = _create_list_file(file_paths)
            tmp_dir = dd.get_tmp_dir(samples[0])
            export_tmp = "export TMPDIR=%s &&" % tmp_dir if tmp_dir else ""
            if input_list_file:
                cmd = "{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out} {opts}"
                # Use the first sample explicitly instead of depending on the
                # variable leaked from the QC collection loop above.
                with tx_tmpdir(samples[0], work_dir) as tx_out:
                    do.run(cmd.format(export_tmp=export_tmp, multiqc=multiqc,
                                      input_list_file=input_list_file, tx_out=tx_out, opts=opts),
                           "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        # Only the first sample carries the combined report
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append(data)
    return [[data] for data in out]
def summary(*samples):
    """Summarize all quality metrics together.

    Collects the directories holding each sample's QC outputs (from
    ``summary -> qc``) and runs MultiQC over them inside a transactional
    temporary directory, moving the report and data directory into
    ``<work_dir>/multiqc``. Nothing is run when the report already exists.

    When more than 250 distinct folders are found, only 250 are passed on and
    MultiQC is run with ``--flat``.

    The report and its associated files are attached to the first sample under
    ``summary -> multiqc``. Returns a list of single-item lists, one per sample.
    """
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems():
            # Dictionary entries keep their primary file under "base"
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning("Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            tmp_dir = dd.get_tmp_dir(samples[0])
            export_tmp = "export TMPDIR=%s &&" % tmp_dir if tmp_dir else ""
            if input_dir.strip():
                cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}"
                # Use the first sample explicitly instead of depending on the
                # variable leaked from the folder collection loop above.
                with tx_tmpdir(samples[0], work_dir) as tx_out:
                    do.run(cmd.format(export_tmp=export_tmp, multiqc=multiqc,
                                      input_dir=input_dir, tx_out=tx_out, opts=opts),
                           "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        # Only the first sample carries the combined report
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append(data)
    return [[d] for d in out]
def summary(*samples):
    """Summarize all quality metrics together.

    Builds the MultiQC report under ``<work_dir>/qc/multiqc`` unless it already
    exists, then attaches the report, its data files and the input file lists
    to the first grouped sample under ``summary -> multiqc``.

    Returns a list of single-item lists, one per grouped sample.
    """
    samples = utils.unpack_worlds(samples)
    first = samples[0]
    work_dir = dd.get_work_dir(first)
    multiqc = config_utils.get_program("multiqc", first["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(samples, out_dir, tx_out)
            in_files += _merge_metrics(samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    tmp_dir = dd.get_tmp_dir(samples[0])
                    export_tmp = "export TMPDIR=%s &&" % tmp_dir if tmp_dir else ""
                    path_export = utils.local_path_export()
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(cmd.format(path_export=path_export, export_tmp=export_tmp, multiqc=multiqc,
                                      input_list_file=input_list_file, tx_out=tx_out),
                           "Run multiqc")
                    tx_report = os.path.join(tx_out, "multiqc_report.html")
                    if utils.file_exists(tx_report):
                        shutil.move(tx_report, out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    # Glob patterns, relative to out_dir, of files reported alongside the HTML
    secondary_patterns = [("multiqc_data", "*.txt"),
                          ("report", "*", "*.bed"),
                          ("report", "*", "*.txt"),
                          ("report", "*", "*.tsv"),
                          ("report", "*", "*.yaml"),
                          ("report", "*.R*")]
    out = []
    for idx, data in enumerate(_group_by_samplename(samples)):
        # Only the first grouped sample carries the combined report
        if idx == 0 and utils.file_exists(out_file):
            data_files = []
            for parts in secondary_patterns:
                data_files.extend(glob.glob(os.path.join(out_dir, *parts)))
            data_files.append(file_list)
            data.setdefault("summary", {})
            data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
            file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
            if file_list_final:
                data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    Every sample has its ``config -> algorithm`` section updated in place:
    ``callable_regions``, ``non_callable_regions`` and ``callable_count`` when
    an analysis file is available for its batch, otherwise only
    ``callable_count`` derived from the sample's variant regions (if any).
    Samples without a ``work_bam`` get a ``work_bam_callable`` borrowed from
    another sample in the same batch.

    Returns a list of single-item lists (``[data]``), ordered to match the
    order of the input samples.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            # All samples in a batch share one analysis file: count its regions
            # once per batch rather than re-reading the BED file for each sample.
            callable_count = pybedtools.BedTool(analysis_file).count() if analysis_file else None
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = callable_count
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                # (no early exit: the last batch member with a work_bam wins)
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        # Ensure output order matches input order, consistency for CWL-based runs
        assert len(out) == len(samples)
        sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}

        def by_input_index(xs):
            return sample_indexes[dd.get_sample_name(xs[0])]

        out.sort(key=by_input_index)
        if analysis_files:
            # Statistics are reported for the first analysis file/batch only
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.

    Each sample is passed through ``cwlutils.unpack_tarballs`` individually
    before processing. Every sample then has its ``config -> algorithm``
    section updated in place: ``callable_regions``, ``non_callable_regions``
    and ``callable_count`` when an analysis file is available for its batch,
    otherwise only ``callable_count`` derived from the sample's variant
    regions (if any). Samples without a ``work_bam`` get a
    ``work_bam_callable`` borrowed from another sample in the same batch.

    Returns a list of single-item lists (``[data]``), ordered to match the
    order of the input samples.
    """
    samples = utils.unpack_worlds(samples)
    samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            # All samples in a batch share one analysis file: count its regions
            # once per batch rather than re-reading the BED file for each sample.
            callable_count = pybedtools.BedTool(analysis_file).count() if analysis_file else None
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = callable_count
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                # (no early exit: the last batch member with a work_bam wins)
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        # Ensure output order matches input order, consistency for CWL-based runs
        assert len(out) == len(samples)
        sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}

        def by_input_index(xs):
            return sample_indexes[dd.get_sample_name(xs[0])]

        out.sort(key=by_input_index)
        if analysis_files:
            # Statistics are reported for the first analysis file/batch only
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
def summary(*samples):
    """Summarize all quality metrics together.

    Builds the MultiQC report under ``<work_dir>/qc/multiqc`` unless it already
    exists: input files are gathered and merged, a MultiQC configuration and
    input list file are written, and MultiQC is run into a transactional
    temporary directory whose report and data directory are then moved into
    place.

    The report and its associated files are attached to the first grouped
    sample under ``summary -> multiqc``. Returns a list of single-item lists,
    one per grouped sample.
    """
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(samples, out_dir, tx_out)
            in_files += _merge_metrics(samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, samples)
                    input_list_file = _create_list_file(in_files, out_dir)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    cmd = "{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(cmd.format(export_tmp=export_tmp, multiqc=multiqc,
                                      input_list_file=input_list_file, tx_out=tx_out),
                           "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        # Only the first grouped sample carries the combined report
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append([data])
    return out
def summary(*samples):
    """Summarize all quality metrics together.

    Gathers the QC files recorded under ``summary -> qc`` for every sample,
    filters them, writes them to a list file and runs MultiQC on that list
    inside a transactional temporary directory. The resulting report and data
    directory are moved into ``<work_dir>/multiqc``. Nothing is run when the
    report already exists.

    The report and its associated files are attached to the first sample under
    ``summary -> multiqc``. Returns a list of single-item lists, one per sample.
    """
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    file_paths = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data,
                                         {}).iteritems():
            # Normalize each entry to a flat list of paths
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif isinstance(pfiles, basestring):
                pfiles = [pfiles]
            file_paths.extend(pfiles)
    file_paths.append(
        os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    file_paths = list(set(file_paths))
    # Back compatible -- to migrate to explicit specifications in input YAML
    file_paths += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            file_paths = [
                fpath for fpath in file_paths
                if _check_multiqc_input(fpath)
                and _is_good_file_for_multiqc(fpath)
            ]
            input_list_file = _create_list_file(file_paths)
            tmp_dir = dd.get_tmp_dir(samples[0])
            export_tmp = "export TMPDIR=%s &&" % tmp_dir if tmp_dir else ""
            if input_list_file:
                cmd = "{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out} {opts}"
                # Use the first sample explicitly instead of depending on the
                # variable leaked from the QC collection loop above.
                with tx_tmpdir(samples[0], work_dir) as tx_out:
                    do.run(
                        cmd.format(
                            export_tmp=export_tmp,
                            multiqc=multiqc,
                            input_list_file=input_list_file,
                            tx_out=tx_out,
                            opts=opts), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(
                            os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        # Only the first sample carries the combined report
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
        out.append(data)
    return [[data] for data in out]
def summary(*samples):
    """Summarize all quality metrics together.

    Collects the directories holding each sample's QC outputs (from
    ``summary -> qc``) and runs MultiQC over them inside a transactional
    temporary directory, moving the report and data directory into
    ``<work_dir>/multiqc``. Nothing is run when the report already exists.

    When more than 250 distinct folders are found, only 250 are passed on and
    MultiQC is run with ``--flat``.

    The report and its associated files are attached to the first sample under
    ``summary -> multiqc``. Returns a list of single-item lists, one per sample.
    """
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data,
                                         {}).iteritems():
            # Dictionary entries keep their primary file under "base"
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning(
            "Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            if input_dir.strip():
                cmd = "{multiqc} -f {input_dir} -o {tx_out} {opts}"
                # Use the first sample explicitly instead of depending on the
                # variable leaked from the folder collection loop above.
                with tx_tmpdir(samples[0], work_dir) as tx_out:
                    do.run(
                        cmd.format(
                            multiqc=multiqc,
                            input_dir=input_dir,
                            tx_out=tx_out,
                            opts=opts), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(
                            os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        # Only the first sample carries the combined report
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
        out.append(data)
    return [[d] for d in out]