示例#1
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Write a pre-realignment BAM to disk and build the GATK realignment command.

    GATK needs an indexed file on disk, so the supplied command line ``cl`` is
    run with ``-o`` pointing at a ``-prealign`` BAM derived from
    ``out_base_file``. Realignment targets are then computed for that BAM.

    ``prep_params`` is accepted but not used in this function.

    Returns a tuple of (pre-aligned BAM path, indel realignment command joined
    into a single space-separated string).
    """
    def _region_kwargs():
        # Both GATK steps take the same region and known variant options;
        # they are looked up again for each call.
        return {"region": region_to_gatk(region),
                "known_vrns": dd.get_variation_resources(data)}

    runner = broad.runner_from_config(data["config"])
    stem, ext = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (stem, ext)
    if not utils.file_exists(pa_bam):
        with file_transaction(data, pa_bam) as tx_out_file:
            prealign_cmd = "{cl} -o {tx_out_file}".format(cl=cl, tx_out_file=tx_out_file)
            do.run(prealign_cmd, "GATK pre-alignment {0}".format(region), data)
    bam.index(pa_bam, data["config"])
    targets_file = realign.gatk_realigner_targets(
        runner, pa_bam, data["sam_ref"], data["config"], **_region_kwargs())
    realign_args = realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets_file, tmp_dir, **_region_kwargs())
    return pa_bam, " ".join(realign_args)
示例#2
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Prepare an on-disk BAM and the GATK indel realignment command for a region.

    The input command line ``cl`` is run with ``-o`` so its output lands in a
    ``-prealign`` BAM next to ``out_base_file``; that BAM is indexed and used
    to compute realignment targets.

    ``prep_params`` is accepted but not used in this function.

    Returns a tuple of (pre-aligned BAM path, realignment command as returned
    by ``realign.gatk_indel_realignment_cl``).
    """
    runner = broad.runner_from_config(data["config"])
    base, suffix = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (base, suffix)
    if not utils.file_exists(pa_bam):
        with file_transaction(data, pa_bam) as tx_out_file:
            do.run("{cl} -o {tx_out_file}".format(cl=cl, tx_out_file=tx_out_file),
                   "GATK re-alignment {0}".format(region),
                   data)
    # Index on every call, including when the BAM already existed.
    bam.index(pa_bam, data["config"])
    targets_file = realign.gatk_realigner_targets(
        runner, pa_bam, data["sam_ref"], data["config"],
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
    return pa_bam, realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets_file, tmp_dir,
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data))
示例#3
0
def process_intervals(data):
    """Prepare the PureCN intervals file for a sample.

    The input BED comes from ``regions.get_sv_bed``; when that is empty the
    cleaned variant regions are used instead. The intervals file is written
    next to the BED file with a ``.txt`` extension and is reused when it
    already exists.

    Returns:
        Path of the intervals file, or None when no BED file is available.
        The path is returned even when the PureCN command failed, so callers
        that need the file should check that it exists.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                          utils.get_R_exports(env="r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        # Best effort: a failure here is reported but does not stop the run.
        logger.info("PureCN failed to prepare intervals")
    else:
        # Only report the file as saved when the command actually succeeded.
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
示例#4
0
def _run_cobalt(paired, work_dir):
    """Count read depth across genomic windows with COBALT.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines

    Returns the path of the ``<tumor sample>.cobalt`` file inside the
    ``cobalt`` sub-directory of ``work_dir``.
    """
    tumor = paired.tumor_data
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(tumor))
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(tumor, out_file) as tx_out_file:
        tx_dir = os.path.dirname(tx_out_file)
        args = ["COBALT",
                "-reference", paired.normal_name,
                "-reference_bam", paired.normal_bam,
                "-tumor", paired.tumor_name,
                "-tumor_bam", paired.tumor_bam,
                "-threads", dd.get_num_cores(tumor),
                "-output_dir", tx_dir,
                "-gc_profile", dd.get_variation_resources(tumor)["gc_profile"]]
        full_cmd = "%s && %s" % (utils.get_R_exports(),
                                 " ".join(str(arg) for arg in args))
        do.run(full_cmd, "PURPLE: COBALT read depth normalization")
        # COBALT writes into the transaction directory; move everything except
        # the transactional output file itself into the final directory.
        tx_name = os.path.basename(tx_out_file)
        for fname in os.listdir(tx_dir):
            if fname == tx_name:
                continue
            shutil.move(os.path.join(tx_dir, fname), os.path.join(cobalt_dir, fname))
    return out_file
示例#5
0
def annotate_gemini(data):
    """Report whether population call annotation data is installed.

    Returns True when the ``exac`` variation resource is configured and its
    path exists on disk, False otherwise.
    """
    resources = dd.get_variation_resources(data)
    return bool(resources.get("exac") and os.path.exists(resources["exac"]))
示例#6
0
def _run_cobalt(paired, work_dir):
    """Count read depth across genomic windows with COBALT.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines

    Returns the path of the ``<tumor sample>.cobalt`` file inside the
    ``cobalt`` sub-directory of ``work_dir``.
    """
    sample = paired.tumor_data
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(sample))
    if not utils.file_exists(out_file):
        with file_transaction(sample, out_file) as tx_out_file:
            tx_dir, tx_name = os.path.split(tx_out_file)
            jvm_opts = _get_jvm_opts(tx_out_file, sample)
            cobalt_args = [
                "-reference", paired.normal_name,
                "-reference_bam", paired.normal_bam,
                "-tumor", paired.tumor_name,
                "-tumor_bam", paired.tumor_bam,
                "-threads", dd.get_num_cores(sample),
                "-output_dir", tx_dir,
                "-gc_profile", dd.get_variation_resources(sample)["gc_profile"],
            ]
            parts = ["COBALT"] + jvm_opts + cobalt_args
            shell_cmd = "%s && %s" % (utils.get_R_exports(),
                                      " ".join(str(part) for part in parts))
            do.run(shell_cmd, "PURPLE: COBALT read depth normalization")
            # Move every COBALT output except the transactional file itself
            # from the transaction directory into the final directory.
            for produced in os.listdir(tx_dir):
                if produced != tx_name:
                    shutil.move(os.path.join(tx_dir, produced),
                                os.path.join(cobalt_dir, produced))
    return out_file
示例#7
0
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burden from a PureCN rds file.

    ``out`` is returned unchanged when it has no ``rds`` entry or when the
    mutation burden CSV is already up to date relative to the rds file.
    Otherwise Dx.R is run and ``out`` is replaced with the value returned by
    ``_get_purecn_dx_files``.
    """
    # No PureCN solution means there is nothing to extract signatures from.
    if "rds" not in out:
        return out
    tumor = paired.tumor_data
    rscript = utils.Rscript_cmd()
    dx_script = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    simple_repeat_bed = dd.get_variation_resources(tumor)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor)
    out_base = utils.splitext_plus(out["rds"])[0]
    burden_csv = out_base + "_mutation_burden.csv"
    if utils.file_uptodate(burden_csv, out["rds"]):
        return out
    # Signatures are missing or stale, so generate them.
    with file_transaction(tumor, out_base) as tx_out_base:
        dx_cmd = [rscript, dx_script,
                  "--rds", out["rds"],
                  "--callable", callable_bed,
                  "--signatures",
                  "--exclude", simple_repeat_bed,
                  "--out", tx_out_base]
        do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
        out_base, out, all_files = _get_purecn_dx_files(paired, out, require_exist=True)
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        # A file that was not generated is skipped and so never reaches upload.
        for fname in all_files:
            tx_path = os.path.join(tx_dir, fname)
            if os.path.exists(tx_path):
                shutil.move(tx_path, os.path.join(final_dir, fname))
    return out
示例#8
0
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and native segmentation.

    ``paired`` describes either a tumor/normal pair or a tumor-only sample;
    the matched normal coverage is passed to PureCN only when
    ``paired.normal_data`` is present.

    The incoming ``out`` argument is not read; the function returns the output
    mapping built by ``_get_purecn_files`` for the PureCN work directory.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError:
            # Best effort: a PureCN failure is reported but does not stop the run.
            logger.info("PureCN failed")
    _, out, _ = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
示例#9
0
def annotate_gemini(data, retriever=None):
    """Report whether population call annotation data is installed.

    True only when both the ``exac`` and ``gnomad_exome`` variation resources
    are configured and available locally or remotely. ``retriever`` is
    accepted but not used here.
    """
    resources = dd.get_variation_resources(data)
    required = ("exac", "gnomad_exome")
    available = [
        resources.get(name) and objectstore.file_exists_or_remote(resources[name])
        for name in required
    ]
    return all(available)
示例#10
0
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats

    Outputs are written to the ``purple`` sub-directory of ``work_dir``; the
    run is skipped when the ``.purple.cnv`` file already exists.

    Returns a dictionary with ``variantcaller``, ``call_file`` (a
    ``-purple-cnv.tsv`` link to the PURPLE CNV output), ``plot`` (the PNG
    plots that exist, by name) and ``metrics`` (columns of the
    ``.purple.purity`` file, converted to float where possible).
    """
    tumor_name = dd.get_sample_name(paired.tumor_data)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # Move everything PURPLE produced, except the transactional output
            # file itself, into the final directory.
            for f in os.listdir(tx_dir):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {
        "variantcaller": "purple",
        "call_file": out_file_export,
        "plot": {},
        "metrics": {},
    }
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # Drop the line terminator before splitting; otherwise the last header
        # name and the last value keep a trailing newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Non-numeric columns are kept as strings.
            pass
        out["metrics"][h] = v
    return out
示例#11
0
def _annotate_somatic(data, retriever=None):
    """Report whether somatic calls can be annotated with COSMIC.

    True only for human samples that are part of a paired analysis and have a
    ``cosmic`` variation resource available locally or remotely.
    ``retriever`` is accepted but not used here.
    """
    if not is_human(data):
        return False
    if not vcfutils.get_paired([data]):
        return False
    resources = dd.get_variation_resources(data)
    cosmic = resources.get("cosmic")
    return bool(cosmic and objectstore.file_exists_or_remote(resources["cosmic"]))
示例#12
0
def _annotate_somatic(data):
    """Report whether somatic calls can be annotated with COSMIC.

    True only for human samples that are part of a paired analysis and have a
    ``cosmic`` variation resource whose path exists on disk.
    """
    if not is_human(data):
        return False
    if not vcfutils.get_paired([data]):
        return False
    resources = dd.get_variation_resources(data)
    return bool(resources.get("cosmic") and os.path.exists(resources["cosmic"]))
示例#13
0
def _annotate_somatic(data, retriever=None):
    """Tell whether COSMIC data is installed for annotating somatic calls.

    Requires a human sample in a paired analysis; ``retriever`` is accepted
    but not used here.
    """
    has_cosmic = False
    if is_human(data) and vcfutils.get_paired([data]):
        resources = dd.get_variation_resources(data)
        if resources.get("cosmic"):
            has_cosmic = bool(objectstore.file_exists_or_remote(resources["cosmic"]))
    return has_cosmic
示例#14
0
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    Outputs are written to the ``purple`` sub-directory of ``work_dir``; the
    run is skipped when the ``.purple.cnv`` file already exists. When
    ``vrn_files`` is non-empty, the ``vrn_file`` of its first entry is passed
    to PURPLE as the somatic VCF.

    Returns a dictionary with ``variantcaller``, ``call_file`` (a
    ``-purple-cnv.tsv`` link to the PURPLE CNV output), ``vrn_file`` (result
    of ``titancna.to_vcf`` on the call file), ``plot`` (the PNG plots that
    exist, by name) and ``metrics`` (columns of the ``.purple.purity`` file,
    converted to float where possible).
    """
    tumor_name = dd.get_sample_name(paired.tumor_data)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            if vrn_files:
                cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # Move everything PURPLE produced, except the transactional output
            # file itself, into the final directory.
            for f in os.listdir(tx_dir):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "vrn_file": titancna.to_vcf(out_file_export, "PURPLE", _get_header, _export_to_vcf,
                                       paired.tumor_data),
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"), ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # Drop the line terminator before splitting; otherwise the last header
        # name and the last value keep a trailing newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Non-numeric columns are kept as strings.
            pass
        out["metrics"][h] = v
    return out
示例#15
0
def bqsr_table(data):
    """Generate recalibration tables as inputs to BQSR.

    Runs Sentieon QualCal on the aligned BAM unless the table is already up
    to date relative to that BAM. Known sites from the ``dbsnp`` variation
    resource are passed with ``-k`` only when a value is configured.

    Returns the path of the ``-recal-table.txt`` file next to the input BAM.
    """
    in_file = dd.get_align_bam(data)
    out_file = "%s-recal-table.txt" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            # A key that is present but empty must not render as "-k None".
            dbsnp = dd.get_variation_resources(data).get("dbsnp")
            known = ("-k %s" % dbsnp) if dbsnp else ""
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_out_file}")
            do.run(cmd.format(license=license_export(data),
                              cores=dd.get_num_cores(data),
                              ref_file=dd.get_ref_file(data),
                              in_file=in_file,
                              known=known,
                              tx_out_file=tx_out_file),
                   "Sentieon QualCal generate table")
    return out_file
示例#16
0
def bqsr_table(data):
    """Generate recalibration tables as inputs to BQSR.

    Runs Sentieon QualCal on the aligned BAM unless the table is already up
    to date relative to that BAM. Known sites from the ``dbsnp`` variation
    resource are passed with ``-k`` only when a value is configured.

    Returns the path of the ``-recal-table.txt`` file next to the input BAM.
    """
    in_file = dd.get_align_bam(data)
    out_file = "%s-recal-table.txt" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        assoc_files = dd.get_variation_resources(data)
        # A key that is present but empty must not render as "-k None".
        dbsnp = assoc_files.get("dbsnp")
        params = {
            "license": license_export(data),
            "cores": dd.get_num_cores(data),
            "ref_file": dd.get_ref_file(data),
            "in_file": in_file,
            "known": ("-k %s" % dbsnp) if dbsnp else "",
            "tx_out_file": tx_out_file,
        }
        cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
               "-i {in_file} --algo QualCal {known} {tx_out_file}")
        do.run(cmd.format(**params), "Sentieon QualCal generate table")
    return out_file
示例#17
0
def apply_bqsr(data):
    """Apply recalibration, producing an updated BAM file.

    Runs Sentieon QualCal followed by ReadWriter on the aligned BAM unless the
    recalibrated BAM is already up to date relative to the input. Known sites
    from the ``dbsnp`` variation resource are passed with ``-k`` only when a
    value is configured. A ``-recal-table-post.txt`` table is written
    alongside the BAM.

    Returns the path of the ``-recal.bam`` file next to the input BAM.
    """
    in_file = dd.get_align_bam(data)
    out_table_file = "%s-recal-table-post.txt" % utils.splitext_plus(in_file)[0]
    out_file = "%s-recal.bam" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file, out_table_file) as (tx_out_file, tx_table_file):
            # A key that is present but empty must not render as "-k None".
            dbsnp = dd.get_variation_resources(data).get("dbsnp")
            known = ("-k %s" % dbsnp) if dbsnp else ""
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_table_file} "
                   "--algo ReadWriter {tx_out_file}")
            do.run(cmd.format(license=license_export(data),
                              cores=dd.get_num_cores(data),
                              ref_file=dd.get_ref_file(data),
                              in_file=in_file,
                              known=known,
                              tx_table_file=tx_table_file,
                              tx_out_file=tx_out_file),
                   "Sentieon QualCal apply recalibration")
    return out_file
示例#18
0
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burden from a PureCN rds file.

    Dx.R is run only when the ``mutation_burden`` output is not up to date
    relative to the ``rds`` file. Returns the output mapping produced by
    ``_get_purecn_dx_files``.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    tumor = paired.tumor_data
    rscript = utils.Rscript_cmd("r36")
    dx_script = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    simple_repeat_bed = dd.get_variation_resources(tumor)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor)
    if utils.file_uptodate(out["mutation_burden"], out["rds"]):
        return out
    with file_transaction(tumor, out_base) as tx_out_base:
        dx_cmd = [rscript, dx_script]
        dx_cmd += ["--rds", out["rds"]]
        dx_cmd += ["--callable", callable_bed]
        dx_cmd += ["--signatures"]
        dx_cmd += ["--exclude", simple_repeat_bed]
        dx_cmd += ["--out", tx_out_base]
        do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
        tx_dir = os.path.dirname(tx_out_base)
        final_dir = os.path.dirname(out_base)
        # Only files that Dx.R actually produced are moved out of the
        # transaction directory.
        for fname in all_files:
            produced = os.path.join(tx_dir, fname)
            if os.path.exists(produced):
                shutil.move(produced, os.path.join(final_dir, fname))
    return out
示例#19
0
def apply_bqsr(data):
    """Apply recalibration, producing an updated BAM file.

    Runs Sentieon QualCal followed by ReadWriter on the aligned BAM unless the
    recalibrated BAM is already up to date relative to the input. Known sites
    from the ``dbsnp`` variation resource are passed with ``-k`` only when a
    value is configured. A ``-recal-table-post.txt`` table is written
    alongside the BAM.

    Returns the path of the ``-recal.bam`` file next to the input BAM.
    """
    in_file = dd.get_align_bam(data)
    base = utils.splitext_plus(in_file)[0]
    out_table_file = "%s-recal-table-post.txt" % base
    out_file = "%s-recal.bam" % base
    if utils.file_uptodate(out_file, in_file):
        return out_file
    with file_transaction(data, out_file,
                          out_table_file) as (tx_out_file, tx_table_file):
        assoc_files = dd.get_variation_resources(data)
        # A key that is present but empty must not render as "-k None".
        dbsnp = assoc_files.get("dbsnp")
        params = {
            "license": license_export(data),
            "cores": dd.get_num_cores(data),
            "ref_file": dd.get_ref_file(data),
            "in_file": in_file,
            "known": ("-k %s" % dbsnp) if dbsnp else "",
            "tx_table_file": tx_table_file,
            "tx_out_file": tx_out_file,
        }
        cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
               "-i {in_file} --algo QualCal {known} {tx_table_file} "
               "--algo ReadWriter {tx_out_file}")
        do.run(cmd.format(**params),
               "Sentieon QualCal apply recalibration")
    return out_file
示例#20
0
def annotate_gemini(data, retriever=None):
    """Report whether population call annotation data is installed.

    Both the ``exac`` and ``gnomad_exome`` variation resources must be
    configured and available locally or remotely. ``retriever`` is accepted
    but not used here.
    """
    r = dd.get_variation_resources(data)

    def _installed(key):
        # Falsy when the resource is not configured or cannot be found.
        return r.get(key) and objectstore.file_exists_or_remote(r[key])

    return all([_installed("exac"), _installed("gnomad_exome")])
示例#21
0
def annotate_gemini(data):
    """Report whether population call annotation data is installed.

    Both the ``exac`` and ``gnomad_exome`` variation resources must be
    configured and their paths must exist on disk.
    """
    resources = dd.get_variation_resources(data)
    found = []
    for key in ["exac", "gnomad_exome"]:
        found.append(resources.get(key) and os.path.exists(resources[key]))
    return all(found)