Example #1
def parallel_calling(data, run_parallel):
    """This is needed only if running methylated veruss hidroxy-methulated"""
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            chroms = pysam_work_bam.references
            for chrom in chroms:
                # skip alternate/unplaced contigs before paying for the deepcopy
                if chrom.find("_") > -1:
                    continue
                new_sample = copy.deepcopy(sample)
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)
    for sample in out:
        phenotype = dd.get_phenotype(sample[0])
        batch = dd.get_batch(sample[0])
        if phenotype == "mC":
            for sample2 in out:
                if batch in dd.get_batch(sample2[0]) and dd.get_phenotype(
                        sample2[0]) == "hmC":
                    if sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]:
                        sample[0]["control"] = sample2[0]["cpg_file"]
                        break
    out = run_parallel("cpg_processing", out)
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunk in out:
            if name == dd.get_sample_name(chunk[0]):
                sample[0]["cpg_split"].append(chunk[0]["cpg_file"])
                if "hmc_file" in chunk[0]:
                    sample[0]["hmc_split"].append(chunk[0]["hmc_file"])
Example #2
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
Example #3
def _check(sample, data):
    """Get input sample for each chip bam file."""
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
    return [sample]
Example #4
def _get_replicate_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    rep_bam = ""
    for origin in data:
        if  dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(sample) in dd.get_phenotype(origin[0]) and dd.get_work_bam(sample) != dd.get_work_bam(origin[0]) and dd.get_phenotype(origin[0]) != "control":
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
    sample["work_bam_rep"] = dd.get_work_bam(origin[0])
    return [sample]
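
The accumulation loop above builds the comma-separated replicate list by hand; the same result falls out of a list plus str.join. A sketch (helper name hypothetical):

def _join_replicate_bams(bam_paths):
    # comma-join replicate BAM paths, as the rep_bam accumulation does above
    return ",".join(bam_paths)

assert _join_replicate_bams(["rep1.bam", "rep2.bam"]) == "rep1.bam,rep2.bam"
assert _join_replicate_bams([]) == ""
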
Example #5
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():

        def orig_svcaller_order(x):
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"],
                                     x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)

        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add,
                                 [x["sv"] for x in sorted_svcalls])
        final["config"]["algorithm"]["svcaller"] = final["config"][
            "algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        if len(batches) > 1:
            lead_batches[(dd.get_sample_name(final),
                          dd.get_phenotype(final) == "germline")] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            items = plot.by_regions(items)
        for data in items:
            if lead_batches.get(
                (dd.get_sample_name(data),
                 dd.get_phenotype(data) == "germline")) in [batch, None]:
                out.append([data])
    return out
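
The try/except KeyError pattern that fills by_bam and by_batch is an order-preserving group-by. An equivalent standalone sketch using setdefault:

import collections

def group_ordered(pairs):
    # first-seen key order is preserved, matching the try/except idiom above
    grouped = collections.OrderedDict()
    for key, value in pairs:
        grouped.setdefault(key, []).append(value)
    return grouped

assert list(group_ordered([("b1", 1), ("b2", 2), ("b1", 3)]).items()) == [("b1", [1, 3]), ("b2", [2])]
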
Example #6
def _get_original_targets(data):
    """Back compatible: get pre-existing target BEDs.
    """
    work_dir = os.path.join(_sv_workdir(data), "raw")
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    return (glob.glob(os.path.join(work_dir, "*-%s.target.bed" % batch))[0],
            glob.glob(os.path.join(work_dir, "*-%s.antitarget.bed" % batch))[0])
Example #7
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
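
For the ensemble stage, batching collapses to one work unit per sample: the batch name is prefixed onto the sample name (or replaced by it), with a germline suffix where needed. A standalone sketch of that naming on plain strings (function name hypothetical):

def ensemble_batch_name(batch, sample_name, phenotype):
    if isinstance(batch, str) and batch != sample_name:
        batch = "%s_%s" % (batch, sample_name)
    else:
        batch = sample_name
    if phenotype == "germline":
        batch += "_germline"
    return batch

assert ensemble_batch_name("b1", "s1", "tumor") == "b1_s1"
assert ensemble_batch_name("s1", "s1", "germline") == "s1_germline"
assert ensemble_batch_name(["b1", "b2"], "s1", "tumor") == "s1"
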
Example #8
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    basename = os.path.splitext(os.path.basename(raw_target_bed))[0]
    target_bed = os.path.join(work_dir, "%s-%s.target.bed" % (basename, batch))
    # back compatible with previous runs to avoid re-calculating
    target_bed_old = os.path.join(work_dir, "%s.target.bed" % basename)
    if utils.file_exists(target_bed_old):
        target_bed = target_bed_old
    if not utils.file_exists(target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data)
            if bin_estimates.get("target"):
                cmd += ["--avg-size", str(bin_estimates["target"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s-%s.antitarget.bed" % (basename, batch))
    antitarget_bed_old = os.path.join(work_dir, "%s.antitarget.bed" % basename)
    # back compatible with previous runs to avoid re-calculating
    if os.path.exists(antitarget_bed_old):
        antitarget_bed = antitarget_bed_old
    if not os.path.exists(antitarget_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data)
            if bin_estimates.get("antitarget"):
                cmd += ["--avg-size", str(bin_estimates["antitarget"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget")
    return target_bed, antitarget_bed
Example #9
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data)
                or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
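
Note the tuple(batch) conversion before keying by_bam: a multi-batch value arrives as a list, and lists are unhashable, so they cannot appear in a dict key. A two-line demonstration:

batch = ["b1", "b2"]
by_bam = {("sample.bam", tuple(batch)): []}  # fine
# {("sample.bam", batch): []} would raise TypeError: unhashable type: 'list'
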
Example #10
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or
                                                            dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss", inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                         {"direction": "increase",
                                                                          "magnitude": cores}}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0], os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += ["INPUT=%s" % dd.get_align_bam(data), "INPUT_LABEL=%s" % dd.get_sample_name(data)]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
Example #11
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["precall", "ensemble"
                             ]:  # no batching for precall or ensemble methods
                    if isinstance(
                            batch,
                            basestring) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #12
def _cnvkit_coverage(data, bed_file, input_type):
    """Calculate coverage in a BED file for CNVkit.
    """
    bam_file = dd.get_align_bam(data)
    work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw"))
    exts = {".target.bed": ("target", "targetcoverage.cnn"),
            ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")}
    cnntype = None
    for orig, (cur_cnntype, ext) in exts.items():
        if bed_file.endswith(orig):
            cnntype = cur_cnntype
            break
    if cnntype is None:
        assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file
        cnntype = ""
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    base = _bam_to_outbase(bam_file, work_dir)
    out_file = "%s-%s.%s" % (base, batch, ext)
    out_file_old = "%s.%s" % (base, ext)
    # back compatible with previous runs to avoid re-calculating
    if utils.file_exists(out_file_old):
        out_file = out_file_old
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd(), "coverage", "-p", str(dd.get_cores(data)), bam_file, bed_file, "-o", tx_out_file]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage")
    return {"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype,
            "sample": dd.get_sample_name(data)}
Example #13
def run(samples, run_parallel, initial_only=False):
    """Run structural variation detection.

    initial_only indicates we should only run structural variation inputs, like
    CNV calling, that we can use to inform low frequency variant calling.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                batch = dd.get_batch(x)
                if svcaller in _BATCH_CALLERS and batch:
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
                else:
                    to_process[(svcaller, dd.get_sample_name(x))] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], initial_only]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"])])
                 if len(processed) > 0 else [])
    return extras + finalized
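
Each (svcaller, batch) key in to_process becomes one parallel work unit handed to detect_sv. A sketch of the keying on plain tuples:

import collections

to_process = collections.OrderedDict()
for svcaller, batch, sample in [("lumpy", "b1", "s1"), ("lumpy", "b1", "s2"),
                                ("manta", "s3", "s3")]:
    to_process.setdefault((svcaller, batch), []).append(sample)
# s1 and s2 run together as one lumpy work unit; s3 runs on its own
assert list(to_process) == [("lumpy", "b1"), ("manta", "s3")]
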
Example #14
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example #15
def run(samples, run_parallel, stage):
    """Run structural variation detection.

    The stage indicates which level of structural variant calling to run.
      - initial: run prior to other callers and variant calling
      - standard: regular batch calling
      - ensemble: post-calling, combines results from other callers
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                if stage == "ensemble":  # no batching for ensemble methods
                    batch = dd.get_sample_name(x)
                else:
                    batch = dd.get_batch(x) or dd.get_sample_name(x)
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], stage]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"])])
                 if len(processed) > 0 else [])
    return extras + finalized
Example #16
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.
    Creates corrected cnr files with log2 ratios and depths.
    data is one sample
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)

    from bcbio.structural import get_svcallers
    sv_callers = get_svcallers(data)
    has_cnvkit_or_gatkcnv = bool(set(["cnvkit", "gatk-cnv"]) & set(sv_callers))

    if not cnvkit.use_general_sv_bins(data) or not has_cnvkit_or_gatkcnv:
        out_target_file, out_anti_file = (None, None)
    else:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)

    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    if "purecn" in dd.get_svcaller(data):
        # set purecn_pon_build flag
        batches = dd.get_batch(data)
        if batches and "pon_build" in dd.get_batch(data):
            data["config"]["algorithm"]["purecn_pon_build"] = True
        from bcbio.structural import purecn
        # still calculate coverage even when not building pon - for t-only analysis
        purecn_target = purecn.get_coverage(data)
    else:
        purecn_target = None

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file,
                             "antitarget": out_anti_file,
                             "seq2c": seq2c_target,
                             "purecn": purecn_target}
    return [[data]]
Example #17
def _get_batches(data, require_bam=True):
    if bam_needs_processing(data) or not require_bam:
        batches = dd.get_batch(data) or dd.get_sample_name(data)
    else:
        batches = dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches
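
The isinstance check is the recurring normalization idiom in these examples: dd.get_batch may return a plain string or a list of batch names, and callers always want a list. A runnable reduction of just that step:

def normalize_batches(batches):
    # mirror of the normalization in _get_batches and similar helpers
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches

assert normalize_batches("b1") == ["b1"]
assert normalize_batches(["b1", "b2"]) == ["b1", "b2"]
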
Example #18
def get_samples_by_batch(samples):
    batch_samples = defaultdict(list)
    for data in dd.sample_data_iterator(samples):
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if isinstance(batch, list):
            batch = tuple(batch)
        batch_samples[batch].append(data)
    return batch_samples
Example #19
def _get_batch_name(sample):
    """Retrieve batch name for use in SV calling outputs.

    Handles multiple batches split via SV calling.
    """
    batch = dd.get_batch(sample) or dd.get_sample_name(sample)
    if isinstance(batch, (list, tuple)) and len(batch) > 1:
        batch = dd.get_sample_name(sample)
    return batch
Example #20
def _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval,
                                  work_dir, data):
    """Estimate good coverage bin sizes for target regions based on coverage.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    out_file = os.path.join(
        work_dir, "%s-%s-bin_estimate.txt" %
        (os.path.splitext(os.path.basename(raw_target_bed))[0], batch))
    method_map = {
        "genome": "wgs",
        "regional": "hybrid",
        "amplicon": "amplicon"
    }
    if not os.path.exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [
                _get_cmd("coverage_bin_size.py"),
                dd.get_align_bam(data), "-m", method_map[cov_interval], "-t",
                raw_target_bed, "-g", access_bed
            ]
            cmd = " ".join(cmd) + " > " + tx_out_file
            try:
                do.run(_prep_cmd(cmd, tx_out_file),
                       "CNVkit coverage bin estimation",
                       log_error=False)
            except subprocess.CalledProcessError:
                logger.info("Bin size estimate failed, using default values")
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "Bin size estimate failed, using default values")
    avg_bin_sizes = {}
    estimate_map = {
        "On-target": "target",
        "Off-target": "antitarget",
        "Genome": "target",
        "Targets (sampling)": "target"
    }
    range_map = {
        ("genome", "target"): (500, 1000),
        ("regional", "target"): (50, 267),
        ("regional", "antitarget"): (20000, 200000),
        ("amplicon", "target"): (50, 267)
    }
    with open(out_file) as in_handle:
        for line in in_handle:
            if line.startswith(tuple(estimate_map.keys())):
                name, depth, bin_size = line.strip().split("\t")
                name = estimate_map[name.replace(":", "").strip()]
                try:
                    bin_size = int(bin_size)
                except ValueError:
                    bin_size = None
                if bin_size and bin_size > 0:
                    cur_min, cur_max = range_map[(cov_interval, name)]
                    avg_bin_sizes[name] = max(min(bin_size, cur_max), cur_min)
    return avg_bin_sizes
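
The max(min(...)) expression on the last loop line clamps each estimated bin size into the allowed range for the coverage method. A small demonstration with the regional target range (50, 267):

def clamp(value, lo, hi):
    # same shape as: avg_bin_sizes[name] = max(min(bin_size, cur_max), cur_min)
    return max(min(value, hi), lo)

assert clamp(100, 50, 267) == 100   # in range: unchanged
assert clamp(10, 50, 267) == 50     # below range: raised to the minimum
assert clamp(5000, 50, 267) == 267  # above range: capped at the maximum
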
Example #21
def _bam_to_outbase(bam_file, work_dir, data):
    """Convert an input BAM file into CNVkit expected output.

    Handles previous non-batch cases to avoid re-calculating,
    returning both new and old values.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    out_base = os.path.splitext(os.path.basename(bam_file))[0].split(".")[0]
    base = os.path.join(work_dir, out_base)
    return "%s-%s" % (base, batch), base
Example #22
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except Exception:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if any([l.find("IndexError") >=0 and l.find("is out of bounds for axis") >= 0
                    for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
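
collections.deque(maxlen=100) keeps only the most recent lines of the noisy stderr log, so the error report shows the tail without holding the whole file in memory. A quick demonstration with maxlen=3:

import collections

tail = collections.deque(maxlen=3)
for line in ["a", "b", "c", "d", "e"]:
    tail.append(line)
assert list(tail) == ["c", "d", "e"]  # older lines are discarded automatically
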
Example #23
def _group_by_samplename(samples):
    """Group samples split by QC method back into a single sample.
    """
    out = collections.defaultdict(list)
    for data in samples:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        out[(dd.get_sample_name(data), dd.get_align_bam(data), batch)].append(data)
    return [xs[0] for xs in out.values()]
Example #24
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        try:
            by_bam[x["align_bam"], tuple(batch)].append(x)
        except KeyError:
            by_bam[x["align_bam"], tuple(batch)] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        def orig_svcaller_order(x):
            orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
            cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
            return orig_callers.index(cur_caller)
        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
        final["config"]["algorithm"]["svcaller"] = final["config"]["algorithm"].pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
Example #25
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([
        (dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
        and dd.get_batch(d) for d in samples
    ])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    return checkpoints
Example #26
def detect_sv(items, all_items=None, stage="standard"):
    """Top level parallel target for examining structural variation.
    items = sample-sv_caller list, from one batch
    """
    items = [utils.to_single_data(x) for x in items]
    items = cwlutils.unpack_tarballs(items, items[0])
    svcaller = items[0]["config"]["algorithm"].get("svcaller")
    caller_fn = _get_callers(items, stage, special_cases=True).get(svcaller)
    out = []
    batch = dd.get_batch(items[0])
    # no SV calling when just creating a PON for PureCN
    if batch == "pon_build" and "purecn" in dd.get_svcaller(items[0]):
        return out
    if svcaller and caller_fn:
        if (all_items and svcaller in _NEEDS_BACKGROUND
                and not vcfutils.is_paired_analysis(
                    [x.get("align_bam") for x in items], items)):
            names = set([dd.get_sample_name(x) for x in items])
            background = [
                x for x in all_items if dd.get_sample_name(x) not in names
            ]
            for svdata in caller_fn(items, background):
                out.append([svdata])
        else:
            for svdata in caller_fn(items):
                out.append([svdata])
    else:
        for data in items:
            out.append([data])
    # Avoid nesting of callers for CWL runs for easier extraction
    if cwlutils.is_cwl_run(items[0]):
        out_cwl = []
        for data in [utils.to_single_data(x) for x in out]:
            # Run validation directly from CWL runs since we're single stage
            data = validate.evaluate(data)
            data["svvalidate"] = {
                "summary": tz.get_in(["sv-validate", "csv"], data)
            }
            svs = data.get("sv")
            if svs:
                assert len(svs) == 1, svs
                data["sv"] = svs[0]
            else:
                data["sv"] = {}
            data = _add_supplemental(data)
            out_cwl.append([data])
        return out_cwl
    return out
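
For callers listed in _NEEDS_BACKGROUND that lack a tumor/normal pairing, the background set is simply every known sample outside the current work unit. A standalone sketch of that selection (names hypothetical):

def select_background(all_items, items, get_name):
    # background = all known samples minus those in this work unit
    names = set(get_name(x) for x in items)
    return [x for x in all_items if get_name(x) not in names]

all_items = [{"name": "s1"}, {"name": "s2"}, {"name": "s3"}]
items = [{"name": "s1"}]
assert select_background(all_items, items, lambda x: x["name"]) == [{"name": "s2"}, {"name": "s3"}]
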
Example #27
def finalize_sv(samples, config, initial_only=False):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():

        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])

        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            if not initial_only:
                for caller in (c for c in _get_svcallers(final) if c in _ENSEMBLE_CALLERS):
                    final_calls = _ENSEMBLE_CALLERS[caller](final_calls, final)
                final_calls = ensemble.summarize(final_calls, final, grouped_calls)
                final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
Example #28
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([
        (dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
        and dd.get_batch(d) for d in samples
    ])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d))
                                for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False
                                           or not dd.get_aligner(d))
                                          for d in samples])
    return checkpoints
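
The align_split expression is a double negation; by De Morgan's law it reads: at least one sample has an aligner and an align_split_size that is not explicitly False. A quick equivalence check on stand-in dicts (field names hypothetical):

samples = [{"split": False, "aligner": "bwa"},
           {"split": 20000000, "aligner": "bwa"},
           {"split": False, "aligner": None}]
lhs = not all((s["split"] is False or not s["aligner"]) for s in samples)
rhs = any((s["split"] is not False and s["aligner"]) for s in samples)
assert lhs is True and rhs is True
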
Example #29
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(
        work_dir, "%s-gridss.sv.vcf" %
        (dd.get_batch(inputs[0]) or dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = [
                "-Dsamjdk.create_index=true",
                "-Dsamjdk.use_async_io_read_samtools=true",
                "-Dsamjdk.use_async_io_write_samtools=true",
                "-Dsamjdk.use_async_io_write_tribble=true"
            ]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss",
                                                   inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(
                jvm_opts, {
                    "algorithm": {
                        "memory_adjust": {
                            "direction": "increase",
                            "magnitude": cores
                        }
                    }
                })
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0],
                                                 os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(
                inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += [
                    "INPUT=%s" % dd.get_align_bam(data),
                    "INPUT_LABEL=%s" % dd.get_sample_name(data)
                ]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
Example #30
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Handles ensemble calling and plotting of results.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    # materialize as a list: it is re-used for every BAM group below
    highdepths = [hd for hd in set(tz.get_in(["config", "algorithm", "highdepth_regions"], x)
                                   for x in samples) if hd is not None]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():

        def orig_svcaller_order(x):
            return _get_svcallers(x).index(x["config"]["algorithm"]["svcaller_active"])

        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x], key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final_calls = reduce(operator.add, [x["sv"] for x in sorted_svcalls])
            final_calls = ensemble.summarize(final_calls, final, highdepths)
            final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        plot_items = plot.by_regions(items)
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
Example #31
def _batch_split_by_sv(samples, stage):
    """Return
    - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
    - extras = samples without sv calling (should there be any?)
    - background - all samples
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # why appending every sample to background?
            background.append(data)
            # x is sample - sv caller pair
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                # just creating PON - no calling
                if stage in ["standard"] and batch in ["pon_build"]:
                    extras.append(x)
                else:
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #32
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.
    """
    by_bam = collections.OrderedDict()
    for x in samples:
        try:
            by_bam[x["align_bam"]].append(x)
        except KeyError:
            by_bam[x["align_bam"]] = [x]
    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():

        def orig_svcaller_order(x):
            return _get_svcallers(x).index(
                x["config"]["algorithm"]["svcaller_active"])

        sorted_svcalls = sorted([x for x in grouped_calls if "sv" in x],
                                key=orig_svcaller_order)
        final = grouped_calls[0]
        if len(sorted_svcalls) > 0:
            final["sv"] = reduce(operator.add,
                                 [x["sv"] for x in sorted_svcalls])
        del final["config"]["algorithm"]["svcaller_active"]
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch in batches:
            try:
                by_batch[batch].append(final)
            except KeyError:
                by_batch[batch] = [final]
    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            plot_items = plot.by_regions(items)
        else:
            plot_items = items
        for data in plot_items:
            if lead_batches[dd.get_sample_name(data)] == batch:
                out.append([data])
    return out
Example #33
def run(samples, run_parallel, initial_only=False):
    """Run structural variation detection.

    initial_only indicates we should only run structural variation inputs, like
    CNV calling, that we can use to inform low frequency variant calling.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                # reset SV information if we're running a second pass SV call
                if "sv" in x:
                    del x["sv"]
                batch = dd.get_batch(x)
                paired = vcfutils.get_paired_phenotype(x)
                if ((svcaller in _BATCH_CALLERS and batch) or
                      (svcaller in _SOMATIC_CALLERS and paired and batch)):
                    batches = batch if isinstance(batch, (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
                else:
                    to_process[(svcaller, dd.get_sample_name(x))] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], initial_only]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"],
                                               initial_only)])
                 if len(processed) > 0 else [])
    return extras + finalized
Example #34
def _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Estimate good coverage bin sizes for target regions based on coverage.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-bin_estimate.txt" % (
        os.path.splitext(os.path.basename(raw_target_bed))[0], batch))
    method_map = {"genome": "wgs", "regional": "hybrid", "amplicon": "amplicon"}
    if not os.path.exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd("coverage_bin_size.py"), dd.get_align_bam(data),
                   "-m", method_map[cov_interval], "-t", raw_target_bed,
                   "-g", access_bed]
            cmd = " ".join(cmd) + " > " + tx_out_file
            try:
                do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage bin estimation", log_error=False)
            except subprocess.CalledProcessError:
                logger.info("Bin size estimate failed, using default values")
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("Bin size estimate failed, using default values")
    avg_bin_sizes = {}
    estimate_map = {"On-target": "target", "Off-target": "antitarget",
                    "Genome": "target", "Targets (sampling)": "target"}
    range_map = {("genome", "target"): (500, 1000),
                 ("regional", "target"): (50, 267), ("regional", "antitarget"): (20000, 200000),
                 ("amplicon", "target"): (50, 267)}
    with open(out_file) as in_handle:
        for line in in_handle:
            if line.startswith(tuple(estimate_map.keys())):
                name, depth, bin_size = line.strip().split("\t")
                name = estimate_map[name.replace(":", "").strip()]
                try:
                    bin_size = int(bin_size)
                except ValueError:
                    bin_size = None
                if bin_size and bin_size > 0:
                    cur_min, cur_max = range_map[(cov_interval, name)]
                    avg_bin_sizes[name] = max(min(bin_size, cur_max), cur_min)
    return avg_bin_sizes
Example #35
def _get_batch(x):
    b = dd.get_batch(x)
    return [b] if not isinstance(b, (list, tuple)) else b
Example #36
def _get_batches(data):
    batches = dd.get_batch(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches
Example #37
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except Exception:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            l.find("Input contains NaN, infinity or a value too large for dtype") >= 0)

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if (any(allowed_errors(line) for line in to_show)
                        or all(all_line_errors(line) for line in to_show)):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
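
For context, here is a minimal standalone sketch of the peddy invocation the wrapper above assembles, assuming a bgzipped, indexed VCF and a matching PED file; the helper name and paths are illustrative, not part of the pipeline:

import subprocess

def run_peddy_standalone(vcf_file, ped_file, prefix, num_cores=4, hg38=False):
    """Run peddy directly on a VCF/PED pair, mirroring the command built above."""
    cmd = ["peddy", "-p", str(num_cores), "--plot", "--prefix", prefix]
    if hg38:
        # peddy checks GRCh37 1kg sites by default; --sites hg38 switches builds
        cmd += ["--sites", "hg38"]
    cmd += [vcf_file, ped_file]
    subprocess.run(cmd, check=True)

# A PED row is: family, sample, father, mother, sex (1=male, 2=female),
# phenotype (-9 or 0 = unknown), e.g.
# batch1  sampleA  0  0  1  -9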
Example #47
0
def _get_batch(x):
    """Normalize a sample's batch to a list."""
    b = dd.get_batch(x)
    return [b] if not isinstance(b, (list, tuple)) else b
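
A quick illustration of the normalization, using a hypothetical metadata dict in the shape bcbio's datadict helpers read; the batch values are made up:

# dd.get_batch reads data["metadata"]["batch"], which may hold a scalar or a list
# _get_batch({"metadata": {"batch": "b1"}})         -> ["b1"]
# _get_batch({"metadata": {"batch": ["b1", "b2"]}}) -> ["b1", "b2"]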
Example #48
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                   region=None, out_file=None):
    """Call variation with GATK's MuTect2.
    This requires the full non open-source version of GATK 3.5+.
    items = 1 sample or T/N pair
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # call somatic variants keeping germline sites and using germline 1KG resource
        # use --native-pair-hmm-threads?
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        # shared Mutect2 settings for PureCN analysis in the case of:
        # - PON creation
        # - Tumor-only PureCN run
        # - T/N PureCN run
        # PureCN requirements alter how Mutect2 variant calling is run!
        if "purecn" in dd.get_svcaller(items[0]):
            # mutect call for PON creation or purecn T-only analysis
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                germline_resource = tz.get_in(["genome_resources", "variation", "af_only_gnomad"], items[0])
                germline_path = os.path.normpath(os.path.join(os.path.dirname(ref_file), germline_resource))
                input_bam = dd.get_work_bam(items[0])
                tx_prefilt_vcf = utils.splitext_plus(tx_out_file)[0] + ".prefilt.vcf"
                out_file_ungz = os.path.splitext(out_file)[0]
                params = ["-T", "Mutect2"]
                # T/N pair
                if len(items) == 2:
                    paired = vcfutils.get_paired_bams(align_bams, items)
                    # not really running purecn with mutect1/gatk3
                    params += _add_tumor_params(paired, items, gatk_type)
                    logger.debug("Running Mutect2 for PureCN analysis in T/N mode; T-only + PON is recommended")
                else:  # T-only
                    params += ["-I", input_bam]
                    # adding SNV PON from background/variant
                    snv_pon = tz.get_in(["config", "algorithm", "background", "variant"], items[0])
                    if snv_pon and dd.get_batch(items[0]) != "pon_build":
                        params += ["-pon", snv_pon]
                        params += ["--genotype-pon-sites"]

                opt_list = config_utils.get_resources("mutect2", items[0]["config"]).get("options")
                # default is 50; 100 or 200 is sometimes recommended for better
                # sensitivity when detecting homozygous-deletion CNVs (calling more variants helps)
                interval_padding = 50
                if opt_list:
                    opt_dict = dict(zip(opt_list[::2], opt_list[1::2]))
                    if "--interval_padding" in opt_dict:
                        interval_padding = opt_dict["--interval_padding"]

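                # PON creation and PureCN expect MNPs split into individual
                # SNVs (--max-mnp-distance 0); --genotype-germline-sites keeps
                # germline calls that PureCN uses for B-allele frequencies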
                params += ["--max-mnp-distance", "0",
                           "--interval-padding", interval_padding,
                           "--germline-resource", germline_path,
                           "--genotype-germline-sites",
                           "--reference", ref_file,
                           "-O", tx_prefilt_vcf]

                params += _add_region_params(region, out_file, items, gatk_type)
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                filter_cmd = _mutect2_filter(broad_runner, items, tx_prefilt_vcf, out_file_ungz, ref_file)
                cmd = "{gatk_cmd} && {filter_cmd}"
                do.run(cmd.format(**locals()), "MuTect2")
                # no AF filter for PureCN variants
                out_file = vcfutils.bgzip_and_index(out_file_ungz, items[0]["config"])
        else:
            # a regular mutect call
            paired = vcfutils.get_paired_bams(align_bams, items)
            f1r2_file = None
            _prep_inputs(align_bams, ref_file, items)
            with file_transaction(items[0], out_file) as tx_out_file:
                params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                          "--annotation", "ClippingRankSumTest",
                          "--annotation", "DepthPerSampleHC"]
                if gatk_type == "gatk4":
                    params += ["--reference", ref_file]
                else:
                    params += ["-R", ref_file]
                for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                    params += ["--annotation", a]
                # Avoid issues with BAM CIGAR reads that GATK doesn't like
                if gatk_type == "gatk4":
                    params += ["--read-validation-stringency", "LENIENT"]
                params += _add_tumor_params(paired, items, gatk_type)
                params += _add_region_params(region, out_file, items, gatk_type)
                # Read-orientation modeling needs paired-end data and the
                # mutect2_readmodel tool flag turned on
                orientation_filter = (all(is_paired(bam) for bam in align_bams)
                                      and "mutect2_readmodel" in utils.get_in(items[0], ("config", "tools_on"), []))

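                # --f1r2-tar-gz collects read-orientation counts that
                # LearnReadOrientationModel later turns into artifact priors
                # for FilterMutectCalls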
                if gatk_type == "gatk4" and orientation_filter:
                    f1r2_file = "{}-f1r2.tar.gz".format(utils.splitext_plus(out_file)[0])
                    params += ["--f1r2-tar-gz", f1r2_file]

                # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
                # Not yet clear how this helps or hurts in a general case.
                #params += _add_assoc_params(assoc_files)
                resources = config_utils.get_resources("mutect2", items[0]["config"])
                if "options" in resources:
                    params += [str(x) for x in resources.get("options", [])]
                assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                    "Require full version of GATK 3.5+ for mutect2 calling"
                broad_runner.new_resources("mutect2")
                gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
                if gatk_type == "gatk4":
                    tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(out_file)
                    tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)

                    if orientation_filter:
                        tx_f1r2_file = "{}-read-orientation-model.tar.gz".format(
                            utils.splitext_plus(f1r2_file)[0])
                        tx_read_orient_cmd = _mutect2_read_filter(broad_runner, f1r2_file,
                                                                  tx_f1r2_file)
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file, tx_f1r2_file)
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}"
                    else:
                        filter_cmd = _mutect2_filter(broad_runner, items, tx_raw_prefilt_file,
                                                     tx_raw_file, ref_file)
                        cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
                else:
                    tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                    cmd = "{gatk_cmd} > {tx_raw_file}"
                do.run(cmd.format(**locals()), "MuTect2")
                out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
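
For reference, a hedged standalone sketch of the GATK4 orientation-filter chain the gatk4 branch above stitches together, assuming _mutect2_read_filter wraps LearnReadOrientationModel and _mutect2_filter wraps FilterMutectCalls; the function and file names below are illustrative:

def mutect2_orientation_chain(ref_file, tumor_bam, prefix):
    """Build the Mutect2 -> LearnReadOrientationModel -> FilterMutectCalls chain."""
    # Illustrative file names; the pipeline derives these from out_file instead
    call = ("gatk Mutect2 -R {ref} -I {bam} "
            "--f1r2-tar-gz {pre}-f1r2.tar.gz -O {pre}-raw.vcf.gz").format(
                ref=ref_file, bam=tumor_bam, pre=prefix)
    learn = ("gatk LearnReadOrientationModel -I {pre}-f1r2.tar.gz "
             "-O {pre}-read-orientation-model.tar.gz").format(pre=prefix)
    filt = ("gatk FilterMutectCalls -R {ref} -V {pre}-raw.vcf.gz "
            "--ob-priors {pre}-read-orientation-model.tar.gz "
            "-O {pre}-filt.vcf.gz").format(ref=ref_file, pre=prefix)
    return " && ".join([call, learn, filt])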