Example #1
def _maybe_add_junction_files(algorithm, sample, out):
    """
    Add splice junction files from STAR, if available.
    """
    junction_bed = dd.get_junction_bed(sample)
    if junction_bed:
        out.append({
            "path": junction_bed,
            "type": "bed",
            "ext": "SJ",
            "dir": "STAR"
        })
    chimeric_file = dd.get_chimericjunction(sample)
    if chimeric_file:
        out.append({
            "path": chimeric_file,
            "type": "tsv",
            "ext": "chimericSJ",
            "dir": "STAR"
        })
    sj_file = dd.get_starjunction(sample)
    if sj_file:
        out.append({
            "path": sj_file,
            "type": "tab",
            "ext": "SJ",
            "dir": "STAR"
        })
    star_summary = dd.get_summary_qc(sample).get("star", None)
    if star_summary:
        star_log = star_summary["base"]
        if star_log:
            out.append({"path": star_log, "type": "log", "dir": "STAR"})
    return out
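For reference, a minimal self-contained sketch of the record shape this helper appends. The stub getter and the sample dict below are hypothetical stand-ins for the bcbio datadict accessors, not bcbio code itself.

def _stub_get_junction_bed(sample):
    # Hypothetical stand-in for the bcbio accessor dd.get_junction_bed.
    return sample.get("junction_bed")

def collect_star_junctions(sample, out):
    junction_bed = _stub_get_junction_bed(sample)
    if junction_bed:
        # Same record shape as above: file path, type, extension label, upload subdirectory.
        out.append({"path": junction_bed, "type": "bed", "ext": "SJ", "dir": "STAR"})
    return out

print(collect_star_junctions({"junction_bed": "Sample1-SJ.bed"}, []))
# [{'path': 'Sample1-SJ.bed', 'type': 'bed', 'ext': 'SJ', 'dir': 'STAR'}]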
Example #2
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data)
                or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
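The grouping idiom above can be illustrated with a small self-contained sketch; the plain dicts below are hypothetical stand-ins for bcbio sample dictionaries. Records sharing the same (BAM, batch) key collapse into one, with their QC and metrics dictionaries merged.

import collections

samples = [
    {"bam": "a.bam", "batch": ("b1",), "qc": {"fastqc": "r1"}, "metrics": {"reads": 10}},
    {"bam": "a.bam", "batch": ("b1",), "qc": {"samtools": "r2"}, "metrics": {"dup": 0.1}},
    {"bam": "b.bam", "batch": ("b2",), "qc": {"fastqc": "r3"}, "metrics": {"reads": 20}},
]
by_bam = collections.defaultdict(list)
for s in samples:
    by_bam[(s["bam"], s["batch"])].append(s)

combined = []
for group in by_bam.values():
    qc, metrics = {}, {}
    for s in group:
        qc.update(s["qc"])
        metrics.update(s["metrics"])
    combined.append(dict(group[0], qc=qc, metrics=metrics))

print(len(combined))        # 2 combined records from 3 split inputs
print(combined[0]["qc"])    # {'fastqc': 'r1', 'samtools': 'r2'}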
Example #3
def create_ataqv_report(samples):
    """
    Make the ataqv report from a set of ATAC-seq samples.
    """
    data = samples[0][0]
    new_samples = []
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
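The report location is attached to each sample with toolz.assoc_in, which returns a copy of the dictionary with a value set at the given key path. A minimal example, assuming toolz is installed:

import toolz as tz

data = {"summary": {"qc": {}}}
ataqv_output = {"base": "qc/ataqv/index.html", "secondary": []}
# assoc_in does not mutate the input; it returns a new dict with the path set.
data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
print(data["ataqv_report"]["base"])   # qc/ataqv/index.html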
Example #4
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example #5
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, six.string_types) and os.path.exists(out):  # basestring is Python 2 only
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #6
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, contamination, coverage, damage, fastqc, kraken,
                          qsignature, qualimap, samtools, picard, srna, umi, variant,
                          viral, preseq, chipseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "contamination": contamination.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             "chipqc": chipseq.run
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, six.string_types) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}