示例#1
0
def _run_break_point_inspector(data, variant_file, paired, work_dir):
    """Run Break Point Inspector (BPI) on a structural variant VCF.

    Args:
        data: sample dictionary; supplies the core count and ``data["config"]``.
        variant_file: input VCF handed to BPI with ``-vcf``.
        paired: optional tumor/normal pairing. When truthy, its ``normal_bam``
            and ``tumor_bam`` are passed as ``-ref`` and ``-tumor``.
        work_dir: directory under which the ``bpi_tmp`` Java temp dir is made.

    Returns:
        Path of the BPI output VCF (``<variant_file base>-bpi.vcf.gz``). The
        run is skipped when that file already exists.
    """
    base = utils.splitext_plus(variant_file)[0]
    output_vcf = "%s-%s.vcf.gz" % (base, "bpi")
    stats_file = "%s-%s_stats.txt" % (base, "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf, \
                file_transaction(data, stats_file) as tx_stats_file:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector",
                                                   data["config"])
            # Increase the configured JVM memory by the number of cores
            jvm_mem_opts = config_utils.adjust_opts(
                resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                {"algorithm": {"memory_adjust": {"magnitude": cores,
                                                 "direction": "increase"}}})
            jvm_tmp_arg = "-Djava.io.tmpdir=" + utils.safe_makedir(
                os.path.join(work_dir, "bpi_tmp"))
            cmd = ["break-point-inspector"] + jvm_mem_opts + \
                  [jvm_tmp_arg, "-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf, ">", tx_stats_file]
            # Pass one shell string rather than an argument list: the ">"
            # redirect into the stats file only works when a shell interprets
            # it. As a list element it would presumably reach the program as a
            # literal argument and the stats file would never be written.
            do.run(" ".join(str(x) for x in cmd),
                   "Running Break Point Inspector for Manta SV calls")
    return output_vcf
示例#2
0
 def cl_gatk(self, params, tmp_dir):
     """Build the java command line for a GATK run.

     Args:
         params: GATK arguments. When more than one core is configured they
             must contain ``-T`` or ``--analysis_type`` followed by the tool
             name. The caller's list is left unmodified.
         tmp_dir: directory passed to the JVM as ``java.io.tmpdir``.

     Returns:
         The command as a list of arguments starting with ``java``.

     Raises:
         ValueError: multiple cores are configured and neither ``-T`` nor
             ``--analysis_type`` is present in ``params``.
     """
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     # Work on a copy: extending the caller's list would make a second call
     # with the same list accumulate duplicate flags.
     params = list(params)
     cores = self._config["algorithm"].get("num_cores", 1)
     config = copy.deepcopy(self._config)
     if cores and int(cores) > 1:
         atype_index = params.index("-T") if "-T" in params \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             # Scale memory with threads unless the configuration already
             # specifies its own adjustment
             if config["algorithm"].get("memory_adjust") is None:
                 config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                         "magnitude": int(cores) // 2}
     # NOTE(review): plain string comparison of versions; "1.10" would sort
     # before "1.9". Left unchanged here -- confirm the version format.
     if self.get_gatk_version() > "1.9":
         # params may hold non-string values (they are str()-ed below), so
         # convert before testing the prefix
         if not any(str(x).startswith(("-U", "--unsafe")) for x in params):
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     jvm_opts = config_utils.adjust_opts(self._jvm_opts, config)
     return ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir] + \
       ["-jar", gatk_jar] + [str(x) for x in params]
示例#3
0
def _get_gatk_opts(config,
                   names,
                   tmp_dir=None,
                   memscale=None,
                   include_gatk=True):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.

    Args:
        config: configuration passed to ``config_utils.get_resources``.
        names: program names to look up in order; the first one whose
            resources define ``jvm_opts`` wins.
        tmp_dir: passed through to ``get_default_jvm_opts``.
        memscale: optional ``memory_adjust`` dictionary used to rescale the
            selected JVM options.
        include_gatk: when true, append the standard GATK flags
            (lenient VCF processing and read filters) to the result.

    Returns:
        A new list: JVM options, default JVM options, then any GATK flags.
    """
    if include_gatk:
        opts = [
            "-U", "LENIENT_VCF_PROCESSING", "--read_filter", "BadCigar",
            "--read_filter", "NotPrimaryAlignment"
        ]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(
            jvm_opts, {"algorithm": {
                "memory_adjust": memscale
            }})
    # Concatenate into a fresh list. An in-place "+=" here would extend the
    # very list object obtained from the resources configuration, so the
    # configured jvm_opts would grow with every call.
    return list(jvm_opts) + get_default_jvm_opts(tmp_dir) + opts
示例#4
0
 def cl_gatk(self, params, tmp_dir, memscale=None):
     """Build the java command line for a GATK run.

     Args:
         params: GATK arguments. When more than one core is configured they
             must contain ``-T`` or ``--analysis_type`` followed by the tool
             name. The caller's list is left unmodified.
         tmp_dir: temporary directory handed to the JVM option helpers.
         memscale: optional ``memory_adjust`` dictionary; when given, JVM
             options come from ``get_gatk_opts`` instead of ``self._jvm_opts``.

     Returns:
         The command as a list of arguments starting with ``java``.

     Raises:
         ValueError: multiple cores are configured and neither ``-T`` nor
             ``--analysis_type`` is present in ``params``.
     """
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     # Work on a copy: extending the caller's list would make a second call
     # with the same list accumulate duplicate flags.
     params = list(params)
     cores = self._config["algorithm"].get("num_cores", 1)
     config = self._config
     if cores and int(cores) > 1:
         atype_index = params.index("-T") if "-T" in params \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             if config["algorithm"].get("memory_adjust") is None:
                 # Copy before editing so the shared configuration is untouched
                 config = utils.deepish_copy(config)
                 config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                         "magnitude": int(cores) // 2}
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
         # params may hold non-string values (they are str()-ed below), so
         # convert before testing the prefix
         if not any(str(x).startswith(("-U", "--unsafe")) for x in params):
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         # New list rather than "+=", so whatever list adjust_opts handed
         # back is never extended in place
         jvm_opts = jvm_opts + get_default_jvm_opts(tmp_dir)
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     return ["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params]
示例#5
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Create a prioritized, tab delimited summary for one caller's VCF.

    Builds ``<sample>-<caller>-simple.vcf.gz`` in ``work_dir`` (unless it is
    already there), sorts and indexes it, optionally post-processes it with
    ``post_prior_fn(simple_vcf, work_dir, data)`` and converts it to
    ``<sample>-<caller>-prioritize.tsv`` with vawk.

    Returns a tuple ``(out_file, simple_vcf)``.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # With a standard gene list, BED based prioritization is skipped
            # and the input is linked into place as the priority VCF.
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # No gene list: select variants in the known regions of interest
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = " ".join(config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}}))
                export = utils.local_path_export()
                prioritize_cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                                  " -k {prioritize_by}")
                do.run(prioritize_cmd.format(export=export, jvm_opts=jvm_opts, vcf_file=vcf_file,
                                             tx_out_file=tx_out_file, prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        # Annotation resources are looked up next to the annotation script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            opts += " --gene_list %s" % (gene_list or os.path.join(data_dir, "az-cancer-panel.txt"))
            annotate_cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            tsv_cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                       """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                       "print CALLER,SNAME,$1,$2,I$END,"
                       """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                       "I$LOF,I$SIMPLE_ANN,"
                       "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(tsv_cmd.format(export=export, simple_vcf=simple_vcf, sample=sample,
                                  caller=caller, tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
示例#6
0
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Assemble the base snpEff command line as a shell string.

    The string unsets JAVA_HOME, prepends the Java bin path to PATH and then
    calls snpEff with memory options, a temp directory next to ``out_file``,
    ``cmd_name`` and ``-dataDir datadir``.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Memory scales with cores, using at least 2x the base so single core runs
    # have enough for human genomes. The maximum keeps the heap below 32Gb,
    # which snpEff should not need and which avoids core dumps, see:
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": max(2, dd.get_cores(data))}
    scaled_opts = config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}})
    memory = " ".join(scaled_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory,
                           java_args=java_args, cmd_name=cmd_name, datadir=datadir)
示例#7
0
def summary(samples, config):
    """Summarize coverage across regions of interest with bcbio.coverage.

    Runs the ``multicompare`` command of the bcbio.coverage jar unless the
    output already exists, then records the output under
    ``sample["coverage"]["summary"]`` for every sample.

    Returns a list with each sample wrapped in its own single-item list. When
    the jar lookup raises ValueError, a warning is logged and the samples are
    returned this way without a ``coverage`` entry.
    """
    try:
        jar_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", jar_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[sample] for sample in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # Adjust memory on a private copy so the caller's configuration is unchanged
    config = copy.deepcopy(config)
    num_cores = config["algorithm"].get("num_cores", 1)
    config["algorithm"]["memory_adjust"] = {"direction": "increase", "magnitude": num_cores}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cmd = ["java"]
            cmd += jvm_opts
            cmd += ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            cmd += ["-jar", bc_jar, "multicompare", config_file, tx_out_file, "-c", str(num_cores)]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
    return [[sample] for sample in samples]
示例#8
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Create a prioritized, tab delimited summary for one caller's VCF.

    Builds ``<sample>-<caller>-simple.vcf.gz`` in ``work_dir`` (unless it is
    already there), sorts and indexes it, optionally post-processes it with
    ``post_prior_fn(simple_vcf, work_dir, data)`` and converts it to
    ``<sample>-<caller>-prioritize.tsv`` with vawk.

    Returns a tuple ``(out_file, simple_vcf)``.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # With a standard gene list, BED based prioritization is skipped
            # and the input is linked into place as the priority VCF.
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # No gene list: select variants in the known regions of interest
            with file_transaction(data, priority_vcf) as tx_out_file:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = " ".join(config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}}))
                export = utils.local_path_export()
                prioritize_cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                                  " -k {prioritize_by}")
                do.run(prioritize_cmd.format(export=export, jvm_opts=jvm_opts, vcf_file=vcf_file,
                                             tx_out_file=tx_out_file, prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        # Annotation resources are looked up next to the annotation script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            opts += " --gene_list %s" % (gene_list or os.path.join(data_dir, "az-cancer-panel.txt"))
            annotate_cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export()
            tsv_cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                       """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                       "print CALLER,SNAME,$1,$2,I$END,"
                       """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                       "I$LOF,I$SIMPLE_ANN,"
                       "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(tsv_cmd.format(export=export, simple_vcf=simple_vcf, sample=sample,
                                  caller=caller, tx_out_file=tx_out_file),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
示例#9
0
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    Args:
        data: sample dictionary with reference, configuration and algorithm keys.
        region: region to process; converted with ``bamprep.region_to_gatk``.
        bam_files: BAM inputs, written to the input list only when squaring.
        vrn_files: variant file inputs, always written to the input list.
        out_file: output file handed to bcbio-variation-recall; the input
            list is written next to it as ``<out_file base>-inputs.txt``.
        todo: bcbio-variation-recall sub-command; ``"square"`` additionally
            passes the BAM files and ``--caller``.

    Returns:
        ``out_file``.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(set(vrn_files))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(set(bam_files))) + "\n")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        # The joint caller is only needed for squaring. Looking it up here
        # keeps other sub-commands working when no jointcaller is configured
        # (the lookup would return None and ".replace" would fail).
        variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    bcbio_env = utils.get_bcbio_env()
    cmd = " ".join(str(x) for x in cmd)
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)), env=bcbio_env)
    return out_file
示例#10
0
 def cl_gatk(self, params, tmp_dir):
     """Build the java command line for a GATK run.

     Args:
         params: GATK arguments. When more than one core is configured they
             must contain ``-T`` or ``--analysis_type`` followed by the tool
             name. The caller's list is left unmodified.
         tmp_dir: directory passed to the JVM as ``java.io.tmpdir``.

     Returns:
         The command as a list of arguments starting with ``java``.

     Raises:
         ValueError: multiple cores are configured and neither ``-T`` nor
             ``--analysis_type`` is present in ``params``.
     """
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     # Work on a copy: extending the caller's list would make a second call
     # with the same list accumulate duplicate flags.
     params = list(params)
     cores = self._config["algorithm"].get("num_cores", 1)
     config = copy.deepcopy(self._config)
     if cores and int(cores) > 1:
         atype_index = params.index("-T") if "-T" in params \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             # Scale memory with threads unless the configuration already
             # specifies its own adjustment
             if config["algorithm"].get("memory_adjust") is None:
                 config["algorithm"]["memory_adjust"] = {
                     "direction": "increase",
                     "magnitude": int(cores) // 2
                 }
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
         # params may hold non-string values (they are str()-ed below), so
         # convert before testing the prefix
         if not any(str(x).startswith(("-U", "--unsafe")) for x in params):
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend([
             "--read_filter", "BadCigar", "--read_filter",
             "NotPrimaryAlignment"
         ])
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]
                   ] + params
     jvm_opts = config_utils.adjust_opts(self._jvm_opts, config)
     return ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir] + \
       ["-jar", gatk_jar] + [str(x) for x in params]
示例#11
0
def _run_gridss(inputs, background, work_dir):
    """Call structural variants with GRIDSS on inputs plus background samples.

    The output is ``<batch or sample name>-gridss.sv.vcf`` in ``work_dir``;
    GRIDSS is run only when neither that file nor its ``.gz`` form exists.
    Returns the result of bgzipping and indexing the output path.
    """
    first = inputs[0]
    out_name = "%s-gridss.sv.vcf" % (dd.get_batch(first) or dd.get_sample_name(first))
    out_file = os.path.join(work_dir, out_name)
    if not (utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")):
        with file_transaction(first, out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(first)
            resources = config_utils.get_resources("gridss", first["config"])
            memory_adjust = {"direction": "increase", "magnitude": cores}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                {"algorithm": {"memory_adjust": memory_adjust}})
            jvm_opts = _finalize_memory(jvm_opts)
            # Temporary and working files live in the transactional directory
            tx_dir = os.path.dirname(tx_out_file)
            tx_ref_file = _setup_reference_files(first, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"]
            cmd.extend(["THREADS=%s" % cores,
                        "TMP_DIR=%s" % tx_dir,
                        "WORKING_DIR=%s" % tx_dir,
                        "OUTPUT=%s" % tx_out_file,
                        "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                        "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                        "BLACKLIST=%s" % blacklist_bed])
            # One INPUT/INPUT_LABEL pair per sample, inputs before background
            for data in inputs + background:
                cmd.append("INPUT=%s" % dd.get_align_bam(data))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(data))
            exports = utils.local_path_export()
            do.run(exports + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, first["config"])
示例#12
0
 def cl_gatk(self, params, tmp_dir, memscale=None):
     """Build the java command line for a GATK run.

     Args:
         params: GATK arguments. When more than one core is configured they
             must contain ``-T`` or ``--analysis_type`` followed by the tool
             name. The caller's list is left unmodified.
         tmp_dir: temporary directory handed to the JVM option helpers.
         memscale: optional ``memory_adjust`` dictionary; when given, JVM
             options come from ``get_gatk_opts`` instead of ``self._jvm_opts``.

     Returns:
         The command as a list of arguments starting with ``java``.

     Raises:
         ValueError: multiple cores are configured and neither ``-T`` nor
             ``--analysis_type`` is present in ``params``.
     """
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     # Work on a copy: extending the caller's list would make a second call
     # with the same list accumulate duplicate flags.
     params = list(params)
     cores = self._config["algorithm"].get("num_cores", 1)
     config = self._config
     if cores and int(cores) > 1:
         atype_index = params.index("-T") if "-T" in params \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             if config["algorithm"].get("memory_adjust") is None:
                 # Copy before editing so the shared configuration is untouched
                 config = utils.deepish_copy(config)
                 config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                         "magnitude": int(cores) // 2}
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
         # params may hold non-string values (they are str()-ed below), so
         # convert before testing the prefix
         if not any(str(x).startswith(("-U", "--unsafe")) for x in params):
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         # New list rather than "+=", so whatever list adjust_opts handed
         # back is never extended in place
         jvm_opts = jvm_opts + get_default_jvm_opts(tmp_dir)
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     return ["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params]
示例#13
0
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.

    Args:
        data: sample dictionary with reference, configuration and algorithm keys.
        region: region to process; converted with ``bamprep.region_to_gatk``.
        bam_files: BAM inputs, written to the input list only when squaring.
        vrn_files: variant file inputs, always written to the input list.
        out_file: output file handed to bcbio-variation-recall; the input
            list is written next to it as ``<out_file base>-inputs.txt``.
        todo: bcbio-variation-recall sub-command; ``"square"`` additionally
            passes the BAM files and ``--caller``.

    Returns:
        ``out_file``.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(set(vrn_files))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(set(bam_files))) + "\n")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        # The joint caller is only needed for squaring. Looking it up here
        # keeps other sub-commands working when no jointcaller is configured
        # (the lookup would return None and ".replace" would fail).
        variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
示例#14
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Evaluate calls against a truth set with rtg vcfeval.

    Results go to ``<base_dir>/rtg``; the evaluation is skipped when that
    directory already holds a ``done`` file, and a stale directory without
    it is removed first.

    Returns a dictionary of output VCF paths with keys ``fp``, ``fn`` and
    ``tp``, plus ``tp-calls`` when a ``tp-baseline.vcf.gz`` file exists.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # With CWL the reference may point at a single file inside the RTG
        # directory; use the containing directory in that case
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # Cores and memory come from the standard configuration, capped at 6 threads
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory_adjust = {"magnitude": threads, "direction": "increase"}
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
            {"algorithm": {"memory_adjust": memory_adjust}})
        # First -Xms option is exported whole; first -Xmx option is exported
        # as a bare size, each with a fallback when absent
        xms_opt = next((opt for opt in memory if opt.startswith("-Xms")), "-Xms500m")
        xmx_opt = next((opt for opt in memory if opt.startswith("-Xmx")), None)
        if xmx_opt is None:
            rtg_mem = "3g"
        else:
            rtg_mem = xmx_opt.replace("-Xmx", "")
        eval_args = ["rtg", "vcfeval", "--threads", str(threads),
                     "-b", rm_file,
                     "--bed-regions", interval_bed,
                     "-c", vrn_file,
                     "-t", rtg_ref,
                     "-o", out_dir]
        eval_args.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), xms_opt, rtg_mem)
        do.run(mem_export + " && " + " ".join(eval_args),
               "Validate calls using rtg vcfeval", data)
    out = {}
    out["fp"] = os.path.join(out_dir, "fp.vcf.gz")
    out["fn"] = os.path.join(out_dir, "fn.vcf.gz")
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
        return out
    out["tp"] = tp_baseline
    out["tp-calls"] = tp_calls
    return out
示例#15
0
 def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
     """Build the command line for a GATK run, via jar or gatk wrapper.

     Args:
         params: GATK arguments; must contain ``-T`` or ``--analysis_type``
             followed by the tool name. The caller's list is left unmodified.
         tmp_dir: temporary directory handed to the JVM option helpers.
         memscale: optional ``memory_adjust`` dictionary; when set (or when
             the tool runs with ``-nct``), JVM options come from
             ``get_gatk_opts``.
         parallel_gc: passed through to the JVM option helpers.

     Returns:
         A space-joined ``java -jar`` command string when a GATK jar is
         used, otherwise whatever ``gatk_cmd`` builds for the wrapper.

     Raises:
         ValueError: no GATK jar or wrapper command was found, or ``params``
             contains neither ``-T`` nor ``--analysis_type``.
     """
     missing_msg = ("GATK processing requested but gatk or older jar install not found: "
                    "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                    "installation.html#gatk-and-mutect-mutect2")
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     if self._has_gatk_conda_wrapper():
         gatk_jar = None
     else:
         gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
         if not gatk_jar:
             raise ValueError(missing_msg)
     is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
     cores = self._config["algorithm"].get("num_cores", 1)
     config = self._config
     # Always work on a copy, so the caller's list is untouched on both the
     # GATK4 and the older code path and repeated calls do not pile up flags.
     params = list(params)
     atype_index = params.index("-T") if "-T" in params \
                     else params.index("--analysis_type")
     prog = params[atype_index + 1]
     # For GATK4 specify command first, so swap params to accomplish
     if is_gatk4:
         params = [prog] + params[:atype_index] + params[atype_index + 2:]
     if cores and int(cores) > 1:
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             # NOTE(review): config is self._config here, so this also writes
             # memory_adjust into the shared configuration -- confirm that is
             # intended.
             memscale = config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                                "magnitude": max(1, int(cores) // 2)}
     # Filters and unsafe specifications not in GATK4
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
         # params may hold non-string values (they are str()-ed below), so
         # convert before testing the prefix
         if not any(str(x).startswith(("-U", "--unsafe")) for x in params):
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False,
                                  parallel_gc=parallel_gc)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         # New list rather than "+=", so whatever list adjust_opts handed
         # back is never extended in place
         jvm_opts = jvm_opts + get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     if gatk_jar:
         return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
     cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
     if not cmd:
         raise ValueError(missing_msg)
     return cmd
示例#16
0
 def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
     """Build the command line for running a GATK tool.

     params is the list of GATK arguments and must contain ``-T`` or
     ``--analysis_type`` followed by the tool name; ``list.index`` raises
     ValueError otherwise. tmp_dir and parallel_gc are handed to the JVM
     option helpers. memscale, when set, is passed to ``get_gatk_opts``; it is
     replaced by a core-based increase for tools listed in ``support_nct``.

     Returns a space-joined ``java ... -jar`` string when a jar was located,
     otherwise the result of ``gatk_cmd`` for the ``gatk`` wrapper.

     Raises ValueError when no jar is found (and there is no conda wrapper), or
     when ``gatk_cmd`` returns an empty value.

     NOTE(review): when GATK4 is turned off, ``params.extend`` below modifies the
     caller's list in place; with GATK4 a copy is made first. The ``-nct`` branch
     also writes ``memory_adjust`` into ``self._config`` since ``config`` is the
     same object, not a copy -- confirm callers do not depend on either.
     """
     # Tools accepting the -nt / -nct multi-core flags; only -nct has an entry
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     if self._has_gatk_conda_wrapper():
         gatk_jar = None
     else:
         gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
         if not gatk_jar:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
     # GATK4 is assumed unless "gatk4" is listed in the configured tools_off
     is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
     cores = self._config["algorithm"].get("num_cores", 1)
     # Same object as self._config: assignments into it below persist on the instance
     config = self._config
     atype_index = params.index("-T") if params.count("-T") > 0 \
                     else params.index("--analysis_type")
     prog = params[atype_index + 1]
     # For GATK4 specify command first, so swap params to accomplish
     if is_gatk4:
         params = params[:]
         # Remove the value before the flag so atype_index stays valid for the second del
         del params[atype_index + 1]
         del params[atype_index]
         params = [prog] + params
     if cores and int(cores) > 1:
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             # Overrides any memscale passed in and records it in the shared configuration
             memscale = config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                                "magnitude": max(1, int(cores) // 2)}
     # Filters and unsafe specifications not in GATK4
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
         # Only add the lenient setting when the caller did not pass its own -U/--unsafe
         if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False,
                                  parallel_gc=parallel_gc)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
     # Key file arguments go in front of all other parameters
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     if gatk_jar:
         return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
     else:
         cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
         if cmd:
             return cmd
         else:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
示例#17
0
 def cl_mutect(self, params, tmp_dir):
     """Assemble the java argument list used to run the MuTect paired caller.

     Returns a list: ``java``, the adjusted JVM memory options, the default JVM
     options for tmp_dir, ``-jar`` with the located jar, then every entry of
     params converted to a string.
     """
     mutect_jar = self._get_jar("muTect", ["mutect"])
     # Request slightly less memory than configured to avoid memory allocation errors
     shrink_spec = {"algorithm": {"memory_adjust": {"magnitude": 1.1, "direction": "decrease"}}}
     memory_args = config_utils.adjust_opts(self._jvm_opts, shrink_spec)
     java_cmd = ["java"] + memory_args + get_default_jvm_opts(tmp_dir)
     java_cmd = java_cmd + ["-jar", mutect_jar]
     return java_cmd + [str(arg) for arg in params]
示例#18
0
    def cl_mutect(self, params, tmp_dir):
        """Assemble the java argument list used to run the MuTect paired caller.

        Memory options are adjusted against a deep copy of the instance
        configuration, and the JVM temporary directory is pointed at tmp_dir.
        """
        mutect_jar = self._get_jar("muTect")
        run_config = copy.deepcopy(self._config)
        tmp_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        java_cmd = ["java"] + config_utils.adjust_opts(self._jvm_opts, run_config)
        java_cmd = java_cmd + tmp_args + ["-jar", mutect_jar]
        return java_cmd + [str(arg) for arg in params]
示例#19
0
 def cl_mutect(self, params, tmp_dir):
     """Return the argument list for a java run of the MuTect paired algorithm.

     The list holds ``java``, adjusted JVM memory options, default JVM options
     for tmp_dir, ``-jar`` plus the jar path, and the stringified params.
     """
     jar_path = self._get_jar("muTect", ["mutect"])
     # Ask for a little less memory than configured to avoid memory allocation errors
     decrease = {"magnitude": 1.1, "direction": "decrease"}
     heap_opts = config_utils.adjust_opts(self._jvm_opts,
                                          {"algorithm": {"memory_adjust": decrease}})
     prefix = ["java"] + heap_opts + get_default_jvm_opts(tmp_dir)
     suffix = ["-jar", jar_path] + [str(p) for p in params]
     return prefix + suffix
示例#20
0
    def cl_mutect(self, params, tmp_dir):
        """Return the argument list for a java run of the MuTect paired algorithm.

        JVM memory options are adjusted using a deep copy of the instance
        configuration; tmp_dir becomes the JVM temporary directory.
        """
        jar_path = self._get_jar("muTect")
        config_copy = copy.deepcopy(self._config)
        tmpdir_opt = "-Djava.io.tmpdir=%s" % tmp_dir
        prefix = ["java"] + config_utils.adjust_opts(self._jvm_opts, config_copy) + [tmpdir_opt]
        suffix = ["-jar", jar_path] + [str(p) for p in params]
        return prefix + suffix
示例#21
0
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
    """Build the JVM options string, ending in ``--tmp-dir``, for an fgbio command.

    Memory options come from the "fgbio" resources (default ``-Xms750m -Xmx4g``).
    When scale_factor is set and the core count exceeds it, memory is increased
    by ``cores // scale_factor``. Returns the options joined by spaces followed
    by `` --tmp-dir <tmpdir>``.
    """
    cores, _ = _get_cores_memory(data)
    resources = config_utils.get_resources("fgbio", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    if scale_factor and cores > scale_factor:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                     {"direction": "increase",
                                                                      "magnitude": cores // scale_factor}}})
    # Work on a copy: without scaling, jvm_opts is the list held by the resources
    # configuration and extending it in place would grow that list on every call
    jvm_opts = list(jvm_opts)
    jvm_opts += broad.get_default_jvm_opts()
    return " ".join(jvm_opts) + " --tmp-dir %s" % tmpdir
示例#22
0
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
    """Return fgbio JVM options as one string, followed by the ``--tmp-dir`` setting.

    Options are read from the "fgbio" resources, defaulting to
    ``-Xms750m -Xmx4g``. If scale_factor is given and there are more cores than
    scale_factor, memory is increased with magnitude ``cores // scale_factor``.
    """
    cores, _ = _get_cores_memory(data)
    resources = config_utils.get_resources("fgbio", data["config"])
    configured_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
    if scale_factor and cores > scale_factor:
        memory_adjust = {"direction": "increase", "magnitude": cores // scale_factor}
        configured_opts = config_utils.adjust_opts(configured_opts,
                                                   {"algorithm": {"memory_adjust": memory_adjust}})
    # Copy before appending so the list stored in the resources configuration
    # is never extended in place when no scaling took place
    jvm_opts = list(configured_opts)
    jvm_opts += broad.get_default_jvm_opts()
    return " ".join(jvm_opts) + " --tmp-dir %s" % tmpdir
示例#23
0
def _get_jvm_opts(out_file, data):
    """Build Java options from the "purple" resources, scaling memory by cores.

    Memory is increased by the core count with a ``30000M`` maximum, and the
    default JVM options for the directory of out_file are appended.
    """
    purple_resources = config_utils.get_resources("purple", data["config"])
    base_opts = purple_resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    scale_by_cores = {"direction": "increase",
                      "maximum": "30000M",
                      "magnitude": dd.get_cores(data)}
    scaled_opts = config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": scale_by_cores}})
    scaled_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
    return scaled_opts
示例#24
0
def _get_varscan_opts(config, tmp_dir):
    """Retrieve common options for running VarScan.
    Handles jvm_opts, setting user and country to English to avoid issues
    with different locales producing non-compliant VCF.

    Returns the options as a single space-separated string: adjusted memory
    settings, the locale properties, then the default JVM options for tmp_dir.
    """
    resources = config_utils.get_resources("varscan", config)
    # Default sets initial (-Xms) and maximum (-Xmx) heap; the previous default
    # passed -Xmx twice, leaving the initial heap size unset
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    # Decrease memory slightly from configuration to avoid memory allocation errors
    jvm_opts = config_utils.adjust_opts(jvm_opts,
                                        {"algorithm": {"memory_adjust":
                                                       {"magnitude": 1.1, "direction": "decrease"}}})
    jvm_opts += ["-Duser.language=en", "-Duser.country=US"]
    jvm_opts += broad.get_default_jvm_opts(tmp_dir)
    return " ".join(jvm_opts)
示例#25
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data, validate_method):
    """Evaluate calls against the truth set with rtg vcfeval.

    Work happens in the ``rtg`` directory under base_dir and is skipped when a
    ``done`` file is already there. Returns a dictionary with ``fp``, ``fn`` and
    ``tp`` VCF paths; when ``tp-baseline.vcf.gz`` exists it is used for ``tp``
    and the called true positives are added as ``tp-calls``.
    """
    out_dir = os.path.join(base_dir, "rtg")
    done_file = os.path.join(out_dir, "done")
    if not utils.file_exists(done_file):
        # start from a clean directory when there is no done file
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # with CWL the reference may point at a single file inside the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # threads and memory follow the standard configuration, using at most 6 threads
        threads = min(dd.get_num_cores(data), 6)
        rtg_resources = config_utils.get_resources("rtg", data["config"])
        scale_spec = {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                      "direction": "increase"}}}
        memory = config_utils.adjust_opts(rtg_resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          scale_spec)
        xms_opts = [opt for opt in memory if opt.startswith("-Xms")]
        xmx_opts = [opt for opt in memory if opt.startswith("-Xmx")]
        jvm_stack = xms_opts[0] if xms_opts else "-Xms500m"
        jvm_mem = xmx_opts[0].replace("-Xmx", "") if xmx_opts else "3g"

        cmd = ["rtg", "vcfeval", "--threads", str(threads)]
        cmd += ["-b", rm_file, "--bed-regions", interval_bed]
        cmd += ["-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        if validate_method == "rtg-squash-ploidy":
            cmd.append("--squash-ploidy")
        rm_samples = vcfutils.get_samples(rm_file)
        # pick our sample out of a multi-sample truth set when it is present there
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd.append("--sample=%s" % dd.get_sample_name(data))
        cmd.append("--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file)))
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (utils.local_path_export(),
                                                                            jvm_stack, jvm_mem)
        do.run(mem_export + " && " + " ".join(cmd), "Validate calls using rtg vcfeval", data)

    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    if not os.path.exists(tp_baseline):
        out["tp"] = tp_calls
        return out
    out["tp"] = tp_baseline
    out["tp-calls"] = tp_calls
    return out
示例#26
0
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.

    Returns a shell string that unsets JAVA_HOME, puts the Java binary path at
    the front of PATH and then calls snpEff with memory options, a temporary
    directory next to out_file, the cmd_name sub-command and ``-dataDir datadir``.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes
    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                 {"direction": "increase",
                                                                  "magnitude": max(2, dd.get_cores(data))}}})
    memory = " ".join(jvm_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    java_args = "-Djava.io.tmpdir=%s" % utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    # Quote $PATH so existing entries containing spaces are not split by the shell
    export = "unset JAVA_HOME && export PATH=%s:\"$PATH\" && " % (utils.get_java_binpath())
    cmd = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    # Name the substituted values explicitly rather than exposing every local
    return cmd.format(export=export, snpeff=snpeff, memory=memory, java_args=java_args,
                      cmd_name=cmd_name, datadir=datadir)
示例#27
0
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.

    The returned shell string first unsets JAVA_HOME and prepends the Java
    binary path to PATH, then runs snpEff with its memory options, a ``tmp``
    directory beside out_file, the cmd_name sub-command and ``-dataDir datadir``.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes
    memory_adjust = {"direction": "increase", "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    # $PATH is quoted so entries with spaces survive shell word splitting
    export = "unset JAVA_HOME && export PATH=%s:\"$PATH\" && " % (utils.get_java_binpath())
    cmd = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    # Pass only the values the template needs instead of all locals
    return cmd.format(export=export, snpeff=snpeff, memory=memory, java_args=java_args,
                      cmd_name=cmd_name, datadir=datadir)
示例#28
0
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None):
    """Collect GATK memory settings and standard arguments.

    Walks names in order and takes the JVM options of the first resource entry
    that defines them, falling back to ``-Xms750m -Xmx2g``. The memory options
    are adjusted by memscale when given, and are returned followed by the
    lenient/read filter arguments and, if tmp_dir is set, the tmpdir property.
    """
    gatk_args = ["-U", "LENIENT_VCF_PROCESSING",
                 "--read_filter", "BadCigar",
                 "--read_filter", "NotPrimaryAlignment"]
    if tmp_dir:
        gatk_args.append("-Djava.io.tmpdir=%s" % tmp_dir)
    memory_opts = ["-Xms750m", "-Xmx2g"]
    for name in names:
        found = config_utils.get_resources(name, config)
        configured = found.get("jvm_opts") if found else None
        if configured:
            # first specification with memory options wins
            memory_opts = configured
            break
    if memscale:
        memory_opts = config_utils.adjust_opts(memory_opts, {"algorithm": {"memory_adjust": memscale}})
    return memory_opts + gatk_args
示例#29
0
def _run_gridss(inputs, background, work_dir):
    """Run GRIDSS on the inputs plus background samples, returning a bgzipped VCF.

    The output is named after the batch, or the sample name, of the first input.
    The GRIDSS run is skipped when the VCF or its ``.gz`` version already exists.
    """
    first = inputs[0]
    base_name = dd.get_batch(first) or dd.get_sample_name(first)
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (base_name))
    have_output = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not have_output:
        with file_transaction(first, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            htsjdk_opts = ["-Dsamjdk.create_index=true",
                           "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true",
                           "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(first)
            resources = config_utils.get_resources("gridss", first["config"])
            # memory is scaled up by the number of cores
            scale_by_cores = {"algorithm": {"memory_adjust": {"direction": "increase",
                                                              "magnitude": cores}}}
            jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]),
                                                scale_by_cores)
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(first, tx_dir)
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"]
            cmd += ["THREADS=%s" % cores,
                    "TMP_DIR=%s" % tx_dir,
                    "WORKING_DIR=%s" % tx_dir,
                    "OUTPUT=%s" % tx_out_file,
                    "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                    "REFERENCE_SEQUENCE=%s" % tx_ref_file,
                    "BLACKLIST=%s" % blacklist_bed]
            # one INPUT/INPUT_LABEL pair for every sample, inputs first
            for data in inputs + background:
                cmd.append("INPUT=%s" % dd.get_align_bam(data))
                cmd.append("INPUT_LABEL=%s" % dd.get_sample_name(data))
            do.run(utils.local_path_export() + " ".join(cmd), "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, first["config"])
示例#30
0
def _run_break_point_inspector(data, variant_file, paired):
    """Run break-point-inspector on variant_file, returning the ``-bpi.vcf.gz`` path.

    Nothing is run when the output already exists. The normal and tumor BAMs
    are passed along when paired is set.
    """
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if utils.file_exists(output_vcf):
        return output_vcf
    with file_transaction(data, output_vcf) as tx_output_vcf:
        cores = dd.get_num_cores(data)
        resources = config_utils.get_resources("break-point-inspector", data["config"])
        # memory is scaled up by the number of cores
        scale_by_cores = {"algorithm": {"memory_adjust": {"magnitude": cores,
                                                          "direction": "increase"}}}
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                          scale_by_cores)
        cmd = ["break-point-inspector"]
        cmd.extend(memory)
        cmd.extend(["-vcf", variant_file])
        if paired:
            cmd.extend(["-ref", paired.normal_bam, "-tumor", paired.tumor_bam])
        cmd.extend(["-output_vcf", tx_output_vcf])
        do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
示例#31
0
def _get_jvm_opts(out_file, data):
    """Return Java options from the "purple" resources with memory scaled by cores.

    The memory increase is limited by a ``30000M`` maximum; default JVM options
    for the directory containing out_file are added at the end.
    """
    resources = config_utils.get_resources("purple", data["config"])
    configured = resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": dd.get_cores(data)}
    opts = config_utils.adjust_opts(configured, {"algorithm": {"memory_adjust": memory_adjust}})
    out_dir = os.path.dirname(out_file)
    opts += broad.get_default_jvm_opts(out_dir)
    return opts
示例#32
0
def _run_break_point_inspector(data, variant_file, paired, work_dir):
    """Run break-point-inspector on variant_file, returning the ``-bpi.vcf.gz`` path.

    Skipped when the output VCF already exists. A ``bpi_tmp`` directory under
    work_dir is used as the JVM temporary directory, and a ``>`` plus the
    ``-bpi_stats.txt`` path are appended to the command arguments.
    """
    base = utils.splitext_plus(variant_file)[0]
    output_vcf = "%s-%s.vcf.gz" % (base, "bpi")
    stats_file = "%s-%s_stats.txt" % (base, "bpi")
    if utils.file_exists(output_vcf):
        return output_vcf
    with file_transaction(data, output_vcf) as tx_output_vcf:
        with file_transaction(data, stats_file) as tx_stats_file:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector", data["config"])
            # memory is scaled up by the number of cores
            scale_by_cores = {"algorithm": {"memory_adjust": {"magnitude": cores,
                                                              "direction": "increase"}}}
            jvm_mem_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                                    scale_by_cores)
            bpi_tmp = utils.safe_makedir(os.path.join(work_dir, "bpi_tmp"))
            jvm_tmp_arg = "-Djava.io.tmpdir=" + bpi_tmp
            cmd = ["break-point-inspector"] + jvm_mem_opts + [jvm_tmp_arg, "-vcf", variant_file]
            if paired:
                cmd.extend(["-ref", paired.normal_bam, "-tumor", paired.tumor_bam])
            cmd.extend(["-output_vcf", tx_output_vcf, ">", tx_stats_file])
            do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
示例#33
0
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None, include_gatk=True, parallel_gc=False):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.

    JVM options come from the first entry of names whose resources define
    ``jvm_opts``, defaulting to ``-Xms750m -Xmx2g``; they are adjusted by
    memscale when given and extended with the default JVM options for tmp_dir.
    The lenient/read filter arguments are appended only when include_gatk is
    true and "gatk4" is listed in the configured tools_off.

    Returns a new list; the configured ``jvm_opts`` list is left unmodified.
    """
    if include_gatk and "gatk4" in dd.get_tools_off({"config": config}):
        opts = ["-U", "LENIENT_VCF_PROCESSING", "--read_filter",
                "BadCigar", "--read_filter", "NotPrimaryAlignment"]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    # Copy first: without memscale, jvm_opts is the list stored in the resources
    # configuration and an in-place += would append the defaults to it on every call
    jvm_opts = list(jvm_opts)
    jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    return jvm_opts + opts
示例#34
0
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Build the base snpEff command line as a shell string.

    The string unsets JAVA_HOME, prepends the Java binary path to PATH and
    calls snpEff with memory options, a ``tmp`` directory beside out_file, the
    cmd_name sub-command and ``-dataDir datadir``.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # Memory scales with cores, using at least 2x the base so single core runs
    # have enough for human genomes. The maximum keeps us under 32Gb to avoid
    # core dumps; snpEff should not need that much memory anyway.
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    memory_adjust = {"direction": "increase",
                     "maximum": "30000M",
                     "magnitude": max(2, dd.get_cores(data))}
    memory = " ".join(config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}}))
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = "unset JAVA_HOME && export PATH=%s:\"$PATH\" && " % (utils.get_java_binpath())
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory, java_args=java_args,
                           cmd_name=cmd_name, datadir=datadir)
示例#35
0
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Build the base snpEff command line as a shell string.

    The string starts with the local path export, followed by snpEff with
    memory options scaled by cores, a ``tmp`` directory beside out_file, the
    cmd_name sub-command and ``-dataDir datadir``.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    base_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    memory_adjust = {"direction": "increase", "magnitude": dd.get_cores(data)}
    scaled_opts = config_utils.adjust_opts(base_opts, {"algorithm": {"memory_adjust": memory_adjust}})
    memory = " ".join(scaled_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    java_args = "-Djava.io.tmpdir=%s" % tmp_dir
    export = utils.local_path_export()
    template = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return template.format(export=export, snpeff=snpeff, memory=memory, java_args=java_args,
                           cmd_name=cmd_name, datadir=datadir)
示例#36
0
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files,
                                  out_file):
    """Run ``bcbio-variation-recall square`` over one region.

    Memory options come from the "bcbio-variation-recall" resources (default
    ``-Xms750m -Xmx2g``), increased by the configured number of cores. The
    command receives out_file, the reference FASTA from data, then the variant
    files followed by the BAM files.

    Returns out_file.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall",
                                           data["config"])
    jvm_opts = config_utils.adjust_opts(
        resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), {
            "algorithm": {
                "memory_adjust": {
                    "direction": "increase",
                    "magnitude": cores
                }
            }
        })
    # Convert the region once and reuse it for the command and the log message
    gatk_region = bamprep.region_to_gatk(region)
    # Keep every command argument a string; the core count may be an integer
    cmd = ["bcbio-variation-recall", "square"] + jvm_opts + \
          ["-c", str(cores), "-r", gatk_region] + \
          [out_file, ref_file] + vrn_files + bam_files
    do.run(cmd, "Squaring off in region: %s" % gatk_region)
    return out_file
示例#37
0
def summary(samples, config):
    """Summarize coverage across samples with the bcbio.coverage jar.

    Returns each sample wrapped in its own list. When the jar cannot be found a
    warning is logged and the samples are returned unchanged; otherwise every
    sample gets a ``coverage`` entry pointing at the summary output file.
    """
    try:
        jar_dir = config_utils.get_program("bcbio_coverage", config, "dir")
        bc_jar = config_utils.get_jar("bcbio.coverage", jar_dir)
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[sample] for sample in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    # adjust memory on a private copy so the caller's configuration is untouched
    config = copy.deepcopy(config)
    algorithm = config["algorithm"]
    algorithm["memory_adjust"] = {"direction": "increase",
                                  "magnitude": algorithm.get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cmd = ["java"] + jvm_opts
            cmd += ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            cmd += ["-jar", bc_jar, "multicompare", config_file, tx_out_file,
                    "-c", str(config["algorithm"].get("num_cores", 1))]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for sample in samples:
        sample["coverage"] = {"summary": out_file}
        out.append([sample])
    return out