Example #1
def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    for r1, r2 in to_run:
        target = os.path.commonprefix([r1, r2])
        r3 = None
        for test_r3 in extras:
            if os.path.commonprefix([r1, test_r3]) == target and os.path.commonprefix([r2, test_r3]) == target:
                r3 = test_r3
                break
        assert r3, (r1, r2, extras)
        base_name = os.path.join(outdir, os.path.commonprefix([r1, r2, r3]).rstrip("_R"))
        ready_to_run.append([base_name, r1, r3, r2, {"algorithm": {}, "resources": {}}])

    parallel = {"type": "local", "cores": len(ready_to_run), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
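All of these snippets follow the same pattern: build a list of per-call argument lists, describe local resources in a parallel dict, and hand both to bcbio's run_multicore. As a rough mental model only (this is not bcbio's actual implementation, and run_multicore_local is a hypothetical name), a minimal local-only stand-in could look like this, assuming each item is the argument list for one call:

import multiprocessing

def run_multicore_local(fn, items, config, parallel=None):
    """Illustrative stand-in: apply fn to each argument list in items on local cores."""
    cores = int((parallel or {}).get("cores", 1))
    if cores <= 1 or len(items) <= 1:
        # nothing to parallelize; run sequentially in-process
        return [fn(*args) for args in items]
    with multiprocessing.Pool(processes=min(cores, len(items))) as pool:
        return pool.starmap(fn, items)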
Example #2
def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            r3 = None
            for test_r3 in extras:
                if (_commonprefix([r1, test_r3]) == target and
                      _commonprefix([r2, test_r3]) == target):
                    r3 = test_r3
                    break
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        ready_to_run.append([base_name, r1, r2, umi, tags, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": args.cores, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
Example #3
def _prep_grabix_indexes(in_files, dirs, config):
    if in_files[0].endswith(".bam") and (len(in_files) == 1 or in_files[1] is None):
        out = _bgzip_from_bam(in_files[0], dirs, config)
    else:
        out = run_multicore(_bgzip_from_fastq,
                            [[{"in_file": x, "dirs": dirs, "config": config}] for x in in_files if x],
                            config)
    items = [[{"bgzip_file": x, "config": copy.deepcopy(config)}] for x in out if x]
    run_multicore(_grabix_index, items, config)
    return out
Example #4
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base, "cns": "%s.cns" % out_base, "back_cnn": background_cnn})
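    # only regenerate coverage and CNV calls when the first output is missing; otherwise reuse existing files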
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval, raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [
                (bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                for bam in test_bams + background_bams
                for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)
            ],
            data["config"],
            parallel,
        )
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn,
            target_bed,
            antitarget_bed,
            data,
        )
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [
                (cnns, background_cnn, data)
                for cnns in tz.groupby("bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()
            ],
            data["config"],
            parallel,
        )
        called_segs = run_multicore(
            _cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs], data["config"], parallel
        )
    return ckouts
Example #5
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        out = run_multicore(_bgzip_from_fastq,
                            [[{"in_file": x, "dirs": dirs, "config": data["config"]}] for x in in_files if x],
                            data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
Example #6
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        inputs = [{"in_file": x, "dirs": dirs, "config": data["config"], "rgnames": data["rgnames"]}
                  for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
Example #7
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                      inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
Example #8
def run(items):
    """Perform detection of structural variations with delly.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
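        # fan delly out over every (chromosome, SV type) combination, then merge the per-call VCFs below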
        bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                                 for (chrom, sv_type)
                                                 in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
Example #9
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.
    """
    # if we have gzipped but not bgzipped, add a fake index for CWL support
    # Also skips bgzip indexing if we don't need alignment splitting
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True) or
                                              dd.get_align_split_size(data) is False):
        for in_file in in_files:
            if not utils.file_exists(in_file + ".gbi"):
                with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                    with open(tx_gbi_file, "w") as out_handle:
                        out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
Example #10
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs
    """
    if len(in_files) == 1 and _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif len(in_files) == 1 and _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif len(in_files) in [1, 2] and _ready_gzip_fastq(in_files, data):
        out = _symlink_in_files(in_files, data)
    else:
        if len(in_files) > 2:
            fpairs = fastq.combine_pairs(in_files)
            pair_types = set([len(xs) for xs in fpairs])
            assert len(pair_types) == 1
            fpairs.sort(key=lambda x: os.path.basename(x[0]))
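            # regroup >2 inputs into one list per read: all first reads together, then all second reads if paired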
            organized = [[xs[0] for xs in fpairs]]
            if len(fpairs[0]) > 1:
                organized.append([xs[1] for xs in fpairs])
            in_files = organized
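        # one local bgzip job per input group, splitting the configured cores evenly across jobs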
        parallel = {"type": "local", "num_jobs": len(in_files),
                    "cores_per_job": max(1, data["config"]["algorithm"]["num_cores"] // len(in_files))}
        inputs = [{"in_file": x, "read_num": i, "dirs": data["dirs"], "config": data["config"],
                   "is_cwl": "cwl_keys" in data,
                   "rgnames": data["rgnames"]}
                  for i, x in enumerate(in_files) if x]
        out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"], parallel)
    return out
Example #11
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to slower CombineVariants if fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            exist_files = [x for x in sorted_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
            with open(input_file_list, "w") as out_handle:
                for fname in ready_files:
                    out_handle.write(fname + "\n")
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R" , ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if "We require all VCFs to have complete VCF headers" in str(msg):
                    return combine_variant_files(orig_files, out_file, ref_file, config)
                else:
                    raise
Example #12
def parallel_combine_variants(orig_files, out_file, ref_file, config, run_parallel):
    """Combine variants in parallel by chromosome, concatenating final outputs.
    """
    file_key = "vcf_files"

    def split_by_region(data):
        base, ext = utils.splitext_plus(os.path.basename(out_file))
        args = []
        for region in [x.name for x in ref.file_contigs(ref_file, config)]:
            region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base, "%s-%s%s" % (base, region, ext))
            utils.safe_makedir(os.path.dirname(region_out))
            args.append((region_out, ref_file, config, region))
        return out_file, args

    config = copy.deepcopy(config)
    config["file_key"] = file_key
    prep_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
    items = [[{file_key: prep_files}]]
    parallel_split_combine(
        items,
        split_by_region,
        run_parallel,
        "merge_variant_files",
        "concat_variant_files",
        file_key,
        ["region", "sam_ref", "config"],
        split_outfile_i=0,
    )
    return out_file
Example #13
def run(items, background=None):
    """Detect copy number variations from batched set of samples using cn.mops.
    """
    if not background: background = []
    names = [tz.get_in(["rgnames", "sample"], x) for x in items + background]
    work_bams = [x["align_bam"] for x in items + background]
    if len(items + background) < 2:
        raise ValueError("cn.mops only works on batches with multiple samples")
    data = items[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural", names[0],
                                               "cn_mops"))
    parallel = {"type": "local", "cores": data["config"]["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
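        # run a single job when a regional BED restricts the analysis; otherwise one job per chromosome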
        chroms = [None] if _get_regional_bed_file(items[0]) else pysam_work_bam.references
        out_files = run_multicore(_run_on_chrom, [(chrom, work_bams, names, work_dir, items)
                                                  for chrom in chroms],
                                  data["config"], parallel)
    out_file = _combine_out_files(out_files, work_dir, data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "cn_mops",
                           "vrn_file": _prep_sample_cnvs(out_file, data)})
        out.append(data)
    return out
Example #14
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            filtered_files = [x for x in sorted_files if vcf_has_variants(x)]
            if len(filtered_files) > 0 and filtered_files[0].endswith(".gz"):
                filtered_files = run_multicore(p_bgzip_and_index, [[x, config] for x in filtered_files], config)
            input_vcf_file = "%s-files.txt" % utils.splitext_plus(out_file)[0]
            with open(input_vcf_file, "w") as out_handle:
                for fname in filtered_files:
                    out_handle.write(fname + "\n")
            if len(filtered_files) > 0:
                compress_str = "| bgzip -c " if out_file.endswith(".gz") else ""
                cmd = "vcfcat `cat {input_vcf_file}` {compress_str} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Concatenate variants")
            else:
                # try to rescue sample names from individual vcf files
                my_samples = None
                for vrn_file in sorted_files:
                    if vrn_file.endswith(".gz"):
                        tabix_index(vrn_file, config)
                    my_reader = vcf.Reader(filename=vrn_file)
                    if len(my_reader.samples) > 0:
                        my_samples = my_reader.samples[:]
                        break
                write_empty_vcf(tx_out_file, None, my_samples)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Example #15
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
            cores = dd.get_num_cores({"config": config})
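            # scale Picard's JVM memory in proportion to the allocated cores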
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            cmd = ["picard"] + broad.get_picard_opts(config, memscale) + \
                  ["MergeVcfs", "D=%s" % dict_file, "O=%s" % tx_out_file] + \
                  ["I=%s" % f for f in ready_files]
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(cmd))
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #16
def run(items):
    """Perform detection of structural variations with delly.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    bytype_vcfs = run_multicore(_run_delly, [(work_bams, sv_type, ref_file, work_dir, items)
                                             for sv_type in ["DEL", "DUP", "INV", "TRA"]],
                                config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
Example #17
def run_autopair(args):
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if not tags:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
        target_processes = 2
        process_cores = max(1, (args.cores // target_processes) + (args.cores % target_processes))
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))
    else:
        process_cores = 1
        overall_processes = args.cores
    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            r3 = None
            for test_r3 in extras:
                if (_commonprefix([r1, test_r3]) == target and
                      _commonprefix([r2, test_r3]) == target):
                    r3 = test_r3
                    break
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            ready_to_run.append([base_name, r1, None, umi, None, process_cores, {"algorithm": {}, "resources": {}}])
            ready_to_run.append([base_name, None, r2, umi, None, process_cores, {"algorithm": {}, "resources": {}}])
        else:
            ready_to_run.append([base_name, r1, r2, umi, tags, process_cores, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
Example #18
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    sv_types = ["DEL", "DUP"]  # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    exclude_file = _get_full_exclude_file(items, work_dir)
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                 for (chrom, sv_type)
                                 in itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext), data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
Example #19
def concat_variant_files_bcftools(orig_files, out_file, config):
    if not utils.file_exists(out_file):
        exist_files = [x for x in orig_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    else:
        return bgzip_and_index(out_file, config)
Example #20
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                                background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                                background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
Example #21
def _get_file_list(orig_files, out_file, regions, ref_file, config):
    """Create file with region sorted list of non-empty VCFs for concatenating.
    """
    sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
    exist_files = [x for x in sorted_files if os.path.exists(x) and vcf_has_variants(x)]
    if len(exist_files) == 0:  # no non-empty inputs, merge the empty ones
        exist_files = [x for x in sorted_files if os.path.exists(x)]
    ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
    input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
    with open(input_file_list, "w") as out_handle:
        for fname in ready_files:
            out_handle.write(fname + "\n")
    return input_file_list
Example #22
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #23
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir, back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                    tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
    for d in inputs:
        if tz.get_in(["depth", "bins", "target"], d):
            target_bed = tz.get_in(["depth", "bins", "target"], d)
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
    input_backs = set(filter(lambda x: x is not None,
                                [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
    if input_backs:
        assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(cnns,
                                             os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                             backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                    dd.get_sample_name(data), "bins"))
        if tz.get_in(["depth", "bins", "target"], data):
            fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                    tz.get_in(["depth", "bins", "antitarget"], data),
                                    back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)
    return back_files, out_files
Example #24
def _do_merge(orig_files, out_file, config, region):
    """Do the actual work of merging with bcftools merge.
    """
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with short_filenames(run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)) as fs:
                prep_files = " ".join(fs)
                bcftools = config_utils.get_program("bcftools", config)
                output_type = "z" if out_file.endswith(".gz") else "v"
                region_str = "-r {}".format(region) if region else ""
                cmd = "{bcftools} merge -O {output_type} {region_str} {prep_files} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Merge variants")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Example #25
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                                [(inputs, background_bams, coord, out_file)
                                 for coord in coords],
                                inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
Example #26
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    fnames = run_multicore(_cram_to_fastq_region,
                           [(cram_file, work_dir, base_name, region, data) for region in regions],
                           data["config"])
    # check if we have paired or single end data
    if any(not _is_gzip_empty(p1) for p1, p2, s in fnames):
        out = [[p1, p2] for p1, p2, s in fnames]
    else:
        out = [[s] for p1, p2, s in fnames]
    return out, work_dir
Example #27
def _prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs
    """
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif _ready_bgzip_fastq(in_files, data):
        out = in_files
    else:
        parallel = {"type": "local", "num_jobs": len(in_files),
                    "cores_per_job": max(1, data["config"]["algorithm"]["num_cores"] // len(in_files))}
        inputs = [{"in_file": x, "read_num": i, "dirs": data["dirs"], "config": data["config"],
                   "is_cwl": "cwl_keys" in data,
                   "rgnames": data["rgnames"]}
                  for i, x in enumerate(in_files) if x]
        out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"], parallel)
    return out
Example #28
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.bedpe" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
                parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["wham"]}
                rs = run_multicore(_run_wham_coords,
                                   [(inputs, background_bams, coord, out_file)
                                    for coord in coords],
                                   inputs[0]["config"], parallel)
                rs = {coord: fname for (coord, fname) in rs}
                for coord in coords:
                    with open(rs[coord]) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
    return out_file
Example #29
def _do_merge(orig_files, out_file, config, region):
    """Do the actual work of merging with bcftools merge.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            _check_samples_nodups(orig_files)
            prep_files = run_multicore(p_bgzip_and_index, [[x, config] for x in orig_files], config)
            input_vcf_file = "%s-files.txt" % utils.splitext_plus(out_file)[0]
            with open(input_vcf_file, "w") as out_handle:
                for fname in prep_files:
                    out_handle.write(fname + "\n")
            bcftools = config_utils.get_program("bcftools", config)
            output_type = "z" if out_file.endswith(".gz") else "v"
            region_str = "-r {}".format(region) if region else ""
            cmd = "{bcftools} merge -O {output_type} {region_str} `cat {input_vcf_file}` > {tx_out_file}"
            do.run(cmd.format(**locals()), "Merge variants")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Example #30
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                                 for (chrom, sv_type)
                                                 in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext), data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf,
                                              "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2", data,
                                              name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out