Пример #1
0
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(dd.get_cores(data)), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += ["-v", small_vrn_files[0]]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Пример #2
0
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded paramters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
Пример #3
0
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
         bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r, "--infile", bed_file,
          "--fasta", ref_file,
          "--outfile", ready_file,
          "--offtarget",
          "--genome", genome,
          "--export", optimized_bed,
          "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                     utils.get_R_exports(env = "r36"),
                                                     " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Пример #4
0
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded paramters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
        try:
            do.run(cmd, "Assess heterogeneity with BubbleTree")
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
    return {"caller": "bubbletree",
            "report": freqs_out,
            "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
Пример #5
0
def create_normal_db(coverage_files_txt, snv_pon, out_dir, genome_build):
    """create normal db
       input: coverage files calculated by purecn for each sample
              snv_pon - mutect2 SNV PON
       output:
              mapping_bias_hg38.rds
              normalDB_hg38.rds
    """
    rscript = utils.Rscript_cmd("r36")
    normaldb_r = utils.R_package_script("r36", "PureCN", "extdata/NormalDB.R")
    cmd = [rscript, normaldb_r,
           "--outdir", out_dir,
           "--coveragefiles", coverage_files_txt,
           "--normal_panel" , snv_pon,
           "--genome", genome_build,
           "--force"]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                          utils.get_R_exports(env = "r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN normalDB")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to create a normal db")

    return out_dir
Пример #6
0
def get_coverage(data):
    """Calculate coverage for a sample.bam, account for GC content
       data is single sample
    """
    data = utils.to_single_data(data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    sample_name = dd.get_sample_name(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bam_name = os.path.basename(bam_file)
    (bname, ext) = os.path.splitext(bam_name)
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                              utils.get_R_exports(env = "r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed to calculate coverage")
        logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
Пример #7
0
def _amber_het_file(vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
    from bcbio.heterogeneity import bubbletree

    prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"],
                                         vrn_files[0]["variantcaller"],
                                         work_dir, paired, AmberWriter)
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(
        amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    utils.symlink_plus(prep_file, out_file)
    pcf_file = out_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            r_file = os.path.join(os.path.dirname(tx_out_file),
                                  "bafSegmentation.R")
            with open(r_file, "w") as out_handle:
                out_handle.write(_amber_seg_script)
            cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(
            ), utils.Rscript_cmd(), r_file, out_file, pcf_file)
            do.run(cmd, "PURPLE: AMBER baf segmentation")
    return out_file
Пример #8
0
def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows so use integrated counting solution
    directly rather than converting from CNVkit calculations. If this approach
    is useful should be moved upstream to be available to other tools as
    an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(
        cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = [
                "COBALT", "-reference", paired.normal_name, "-reference_bam",
                paired.normal_bam, "-tumor", paired.tumor_name, "-tumor_bam",
                paired.tumor_bam, "-threads",
                dd.get_num_cores(paired.tumor_data), "-output_dir",
                os.path.dirname(tx_out_file), "-gc_profile",
                dd.get_variation_resources(paired.tumor_data)["gc_profile"]
            ]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join(
                [str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file
Пример #9
0
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                       "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
                paired = vcfutils.get_paired(items)
                if paired:
                    cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Пример #10
0
def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree

    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
        pcf_file = out_file + ".pcf"
        if not utils.file_exists(pcf_file):
            with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
                r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
                with open(r_file, "w") as out_handle:
                    out_handle.write(_amber_seg_script)
                cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file,
                                                          out_file, pcf_file)
                do.run(cmd, "PURPLE: AMBER baf segmentation")
    else:
        assert method == "pon"
        out_file = _run_amber(paired, work_dir)
    return out_file
Пример #11
0
def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows so use integrated counting solution
    directly rather than converting from CNVkit calculations. If this approach
    is useful should be moved upstream to be available to other tools as
    an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["COBALT"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-reference", paired.normal_name, "-reference_bam", paired.normal_bam,
                   "-tumor", paired.tumor_name, "-tumor_bam", paired.tumor_bam,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join([str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file
Пример #12
0
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                       "--libdir None")
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()), "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
Пример #13
0
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(
        os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = (
                    "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                    "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}"
                )
                do.run(
                    cmd.format(**locals()),
                    "TitanCNA CNV detection: ploidy %s, cluster %s" %
                    (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(
                    os.path.join(tmp_dir, "Rplots.pdf"),
                    os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
Пример #14
0
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation
       paired is one t/n pair or only """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # termline and somatic - just annotated and filters assigned
    variants_vcf =  tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [ rscript, purecn_r,
            "--out", work_dir,
            "--tumor", sample_coverage,
            "--sampleid", sample_name,
            "--vcf", variants_vcf,
            "--normaldb", normaldb,
            "--mappingbiasfile", mappingbiasfile,
            "--intervals", intervals,
            "--snpblacklist", simple_repeat_bed,
            "--genome", genome,
            "--force",
            "--postoptimize",
            "--seed", "123",
            "--bootstrapn", "500",
            "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we useit
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env = "r36"),
                                                              utils.get_R_exports(env = "r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed")
    out_base, out, all_files  = _get_purecn_files(paired, work_dir, require_exist = True)
    return out
Пример #15
0
def _cnvkit_segment(cnr_file,
                    cov_interval,
                    data,
                    items,
                    out_file=None,
                    detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(cores), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += [
                        "--vcf", small_vrn_files[0].name, "--sample-id",
                        small_vrn_files[0].sample
                    ]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment",
                                                       data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Пример #16
0
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {
        "cnvkit": _segment_normalized_cnvkit,
        "gatk-cnv": _segment_normalized_gatk
    }
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(
            out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](
            cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(
            paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file,
                                                      paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in [
                "GRCh37", "hg19"
            ] else dd.get_genome_build(paired.tumor_data))
            cmd = [
                "PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds",
                "%s.rds" % tx_out_base, "--sampleid",
                dd.get_sample_name(paired.tumor_data), "--genome", genome,
                "--vcf", vcf_file, "--tumor", cnr_file, "--segfile", seg_file,
                "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"
            ]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(), " ".join(
                        [str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info(
                        "PureCN failed to find solution for %s: skipping" %
                        dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base),
                                               f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
Пример #17
0
def cpat(assembled_fasta, hexamer, logit, data, out_file=None):
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
    cpat_cmd = config_utils.get_program("cpat.py", data)
    r_setup = utils.get_R_exports()
    cmd = ("{r_setup} && {cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
           "--logitModel={logit} --outfile={tx_out_file}")
    message = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Пример #18
0
def cpat(assembled_fasta, hexamer, logit, data, out_file=None):
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".cpat").name
    cpat_cmd = config_utils.get_program("cpat.py", data)
    r_setup = utils.get_R_exports()
    cmd = ("{r_setup} && {cpat_cmd} --gene={assembled_fasta} --hex={hexamer} "
           "--logitModel={logit} --outfile={tx_out_file}")
    message = "Predicing coding potential of %s." % (assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Пример #19
0
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(
            work_dir, ref_file, ignore_file,
            os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(
            install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
            "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {
            "male": "XY",
            "female": "XX",
            "unknown": "L"
        }.get(population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = utils.get_R_exports()
        cmd = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(
        out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
Пример #20
0
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, data, out_dir=None):
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "logit")
    out_file = out_prefix + ".logit.RData"
    if file_exists(out_file):
        return out_file
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    tx_out_file = tx_prefix + ".logit.RData"
    logit_cmd = config_utils.get_program("make_logitModel.py", data)
    r_setup = utils.get_R_exports()
    cmd = ("{r_setup} && {logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
           "--hex={hexamers} --outfile={tx_prefix}")
    message = "Building coding/noncoding logistical model."
    do.run(cmd.format(**locals()), message)
    shutil.move(tx_out_file, out_file)
    return out_file
Пример #21
0
def make_logit_model(coding_fasta, noncoding_fasta, hexamers, data, out_dir=None):
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "logit")
    out_file = out_prefix + ".logit.RData"
    if file_exists(out_file):
        return out_file
    tx_prefix = tempfile.NamedTemporaryFile(delete=False).name
    tx_out_file = tx_prefix + ".logit.RData"
    logit_cmd = config_utils.get_program("make_logitModel.py", data)
    r_setup = utils.get_R_exports()
    cmd = ("{r_setup} && {logit_cmd} --cgene={coding_fasta} --ngene={noncoding_fasta} "
           "--hex={hexamers} --outfile={tx_prefix}")
    message = "Building coding/noncoding logistical model."
    do.run(cmd.format(**locals()), message)
    shutil.move(tx_out_file, out_file)
    return out_file
Пример #22
0
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                       "--libdir None")
                chroms = ["'%s'" % c.name.replace("chr", "") for c in ref.file_contigs(dd.get_ref_file(data))
                          if chromhacks.is_autosomal_or_x(c.name)]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(os.path.join(
                        os.path.dirname(os.path.realpath(os.path.join(
                            os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))),
                        os.pardir, os.pardir, "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()), "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
Пример #23
0
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    opts, _ = vardict._vardict_options_from_config(
        [data],
        data["config"],
        out_file,
        dd.get_variant_regions(data),
        is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
Пример #24
0
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment", data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Пример #25
0
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(paired.tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(paired.tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(),
                                                             " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
Пример #26
0
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                       "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["-v", small_vrn_files[0]]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Пример #27
0
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    opts, _ = vardict._vardict_options_from_config([data], data["config"], out_file, dd.get_variant_regions(data),
                                                   is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
Пример #28
0
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Пример #29
0
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Пример #30
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                lowfreq_filter = _lowfreq_linear_filter(0, False)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| teststrandbias.R "
                       "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    return out_file
Пример #31
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        """| %s -c 'from bcbio.variation import freebayes; """
                        """freebayes.call_somatic("%s", "%s")' """ %
                        (sys.executable, paired.tumor_name,
                         paired.normal_name))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "| {contig_cl} {freq_filter} "
                    "| bcftools filter -i 'QUAL >= 0' "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    return out_file
Пример #32
0
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
Пример #33
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region,
                                                   out_file, items=items, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                                   "| %s "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                    _lowfreq_linear_filter(0, True),
                                    os.path.join(os.path.dirname(sys.executable), "py"),
                                    0, bam.aligner_from_header(paired.tumor_bam)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| awk 'NF>=48' | testsomatic.R "
                       "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "| {contig_cl} {freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file