def _maybe_add_junction_files(algorithm, sample, out): """ add splice junction files from STAR, if available """ junction_bed = dd.get_junction_bed(sample) if junction_bed: out.append({ "path": junction_bed, "type": "bed", "ext": "SJ", "dir": "STAR" }) chimeric_file = dd.get_chimericjunction(sample) if chimeric_file: out.append({ "path": chimeric_file, "type": "tsv", "ext": "chimericSJ", "dir": "STAR" }) sj_file = dd.get_starjunction(sample) if sj_file: out.append({ "path": sj_file, "type": "tab", "ext": "SJ", "dir": "STAR" }) star_summary = dd.get_summary_qc(sample).get("star", None) if star_summary: star_log = star_summary["base"] if star_log: out.append({"path": star_log, "type": "log", "dir": "STAR"}) return out
def _combine_qc_samples(samples): """Combine split QC analyses into single samples based on BAM files. """ by_bam = collections.defaultdict(list) for data in [utils.to_single_data(x) for x in samples]: batch = dd.get_batch(data) or dd.get_sample_name(data) if not isinstance(batch, (list, tuple)): batch = [batch] batch = tuple(batch) by_bam[(dd.get_align_bam(data) or dd.get_work_bam(data), batch)].append(data) out = [] for data_group in by_bam.values(): data = data_group[0] alg_qc = [] qc = {} metrics = {} for d in data_group: qc.update(dd.get_summary_qc(d)) metrics.update(dd.get_summary_metrics(d)) alg_qc.extend(dd.get_algorithm_qc(d)) data["config"]["algorithm"]["qc"] = alg_qc data["summary"]["qc"] = qc data["summary"]["metrics"] = metrics out.append([data]) return out
def create_ataqv_report(samples): """ make the ataqv report from a set of ATAC-seq samples """ data = samples[0][0] new_samples = [] reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv") sentinel = os.path.join(reportdir, "index.html") if utils.file_exists(sentinel): ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)} new_data = [] for data in dd.sample_data_iterator(samples): data = tz.assoc_in(data, ["ataqv_report"], ataqv_output) new_data.append(data) return dd.get_samples_from_datalist(new_data) mkarv = config_utils.get_program("mkarv", dd.get_config(data)) ataqv_files = [] for data in dd.sample_data_iterator(samples): qc = dd.get_summary_qc(data) ataqv_file = tz.get_in(("ataqv", "base"), qc, None) if ataqv_file and utils.file_exists(ataqv_file): ataqv_files.append(ataqv_file) if not ataqv_files: return samples ataqv_json_file_string = " ".join(ataqv_files) with file_transaction(reportdir) as txreportdir: cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}" message = f"Creating ataqv report from {ataqv_json_file_string}." do.run(cmd, message) new_data = [] ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)} for data in dd.sample_data_iterator(samples): data = tz.assoc_in(data, ["ataqv_report"], ataqv_output) new_data.append(data) return dd.get_samples_from_datalist(new_data)
def _combine_qc_samples(samples): """Combine split QC analyses into single samples based on BAM files. """ by_bam = collections.defaultdict(list) for data in [utils.to_single_data(x) for x in samples]: batch = dd.get_batch(data) or dd.get_sample_name(data) if not isinstance(batch, (list, tuple)): batch = [batch] batch = tuple(batch) by_bam[(dd.get_align_bam(data), batch)].append(data) out = [] for data_group in by_bam.values(): data = data_group[0] alg_qc = [] qc = {} metrics = {} for d in data_group: qc.update(dd.get_summary_qc(d)) metrics.update(dd.get_summary_metrics(d)) alg_qc.extend(dd.get_algorithm_qc(d)) data["config"]["algorithm"]["qc"] = alg_qc data["summary"]["qc"] = qc data["summary"]["metrics"] = metrics out.append([data]) return out
def _run_qc_tools(bam_file, data): """Run a set of third party quality control tools, returning QC directory and metrics. :param bam_file: alignments in bam format :param data: dict with all configuration information :returns: dict with output of different tools """ from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant, viral, preseq) tools = {"fastqc": fastqc.run, "atropos": atropos.run, "small-rna": srna.run, "samtools": samtools.run, "qualimap": qualimap.run, "qualimap_rnaseq": qualimap.run_rnaseq, "qsignature": qsignature.run, "coverage": coverage.run, "damage": damage.run, "variants": variant.run, "peddy": peddy.run_qc, "kraken": kraken.run, "picard": picard.run, "umi": umi.run, "viral": viral.run, "preseq": preseq.run, } qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"])) metrics = {} qc_out = utils.deepish_copy(dd.get_summary_qc(data)) for program_name in dd.get_algorithm_qc(data): if not bam_file and program_name != "kraken": # kraken doesn't need bam continue if dd.get_phenotype(data) == "germline" and program_name != "variants": continue qc_fn = tools[program_name] cur_qc_dir = os.path.join(qc_dir, program_name) out = qc_fn(bam_file, data, cur_qc_dir) qc_files = None if out and isinstance(out, dict): # Check for metrics output, two cases: # 1. output with {"metrics"} and files ("base") if "metrics" in out: metrics.update(out.pop("metrics")) # 2. a dictionary of metrics elif "base" not in out: metrics.update(out) # Check for files only output if "base" in out: qc_files = out elif out and isinstance(out, basestring) and os.path.exists(out): qc_files = {"base": out, "secondary": []} if not qc_files: qc_files = _organize_qc_files(program_name, cur_qc_dir) if qc_files: qc_out[program_name] = qc_files metrics["Name"] = dd.get_sample_name(data) metrics["Quality format"] = dd.get_quality_format(data).lower() return {"qc": qc_out, "metrics": metrics}
def _run_qc_tools(bam_file, data): """Run a set of third party quality control tools, returning QC directory and metrics. :param bam_file: alignments in bam format :param data: dict with all configuration information :returns: dict with output of different tools """ from bcbio.qc import (atropos, contamination, coverage, damage, fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant, viral, preseq, chipseq) tools = {"fastqc": fastqc.run, "atropos": atropos.run, "small-rna": srna.run, "samtools": samtools.run, "qualimap": qualimap.run, "qualimap_rnaseq": qualimap.run_rnaseq, "qsignature": qsignature.run, "contamination": contamination.run, "coverage": coverage.run, "damage": damage.run, "variants": variant.run, "peddy": peddy.run_qc, "kraken": kraken.run, "picard": picard.run, "umi": umi.run, "viral": viral.run, "preseq": preseq.run, "chipqc": chipseq.run } qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"])) metrics = {} qc_out = utils.deepish_copy(dd.get_summary_qc(data)) for program_name in dd.get_algorithm_qc(data): if not bam_file and program_name != "kraken": # kraken doesn't need bam continue if dd.get_phenotype(data) == "germline" and program_name != "variants": continue qc_fn = tools[program_name] cur_qc_dir = os.path.join(qc_dir, program_name) out = qc_fn(bam_file, data, cur_qc_dir) qc_files = None if out and isinstance(out, dict): # Check for metrics output, two cases: # 1. output with {"metrics"} and files ("base") if "metrics" in out: metrics.update(out.pop("metrics")) # 2. a dictionary of metrics elif "base" not in out: metrics.update(out) # Check for files only output if "base" in out: qc_files = out elif out and isinstance(out, six.string_types) and os.path.exists(out): qc_files = {"base": out, "secondary": []} if not qc_files: qc_files = _organize_qc_files(program_name, cur_qc_dir) if qc_files: qc_out[program_name] = qc_files metrics["Name"] = dd.get_sample_name(data) metrics["Quality format"] = dd.get_quality_format(data).lower() return {"qc": qc_out, "metrics": metrics}