Exemplo n.º 1
0
def _run_somatic(paired, ref_file, target, out_file):
    """Call somatic variants with octopus on a tumor BAM, adding the normal when present.

    The tumor BAM is always passed to ``--reads``; when ``paired.normal_bam`` is
    set it is appended together with ``--normal-sample``. Octopus writes to the
    transactional output path, which is handed to ``_produce_compatible_vcf``
    before ``out_file`` is returned.

    TODO:
      - Should we set downsample-above and downsample-target which default to 1000
        and 500? How will these affect high depth panels and runtimes?
    """
    tumor_data = paired.tumor_data
    reads = paired.tumor_bam
    if paired.normal_bam:
        normal_args = " %s --normal-sample %s" % (paired.normal_bam,
                                                  paired.normal_name)
        reads = reads + normal_args
    num_cores = dd.get_num_cores(tumor_data)
    # Configured minimum allele fraction scaled by 1/100 for octopus
    somatic_freq = float(dd.get_min_allele_fraction(tumor_data)) / 100.0
    template = " ".join([
        "octopus --threads {cores} --reference {ref_file} --reads {align_bams}",
        "--regions-file {target}",
        "--min-credible-somatic-frequency {min_af}",
        "-C cancer",
        "--working-directory {tmp_dir}",
        "-o {tx_out_file} --legacy",
    ])
    with file_transaction(tumor_data, out_file) as tx_out_file:
        # Octopus uses the transaction directory as its working directory
        command = template.format(cores=num_cores,
                                  ref_file=ref_file,
                                  align_bams=reads,
                                  target=target,
                                  min_af=somatic_freq,
                                  tmp_dir=os.path.dirname(tx_out_file),
                                  tx_out_file=tx_out_file)
        do.run(command, "Octopus somatic calling")
        _produce_compatible_vcf(tx_out_file, tumor_data)
    return out_file
Exemplo n.º 2
0
def rnaseq_vardict_variant_calling(data):
    """Call variants on an RNA-seq sample with VarDict and record the VCF on ``data``.

    Output goes to ``<work_dir>/variation/<sample>-vardict.vcf.gz``. When that
    file already exists no command is run and the existing path is recorded.

    The shell pipeline runs VarDict over a BED file derived from the sample's
    GTF, then pipes through ``teststrandbias.R``, ``var2vcf_valid.pl``, the
    ambiguous-base and duplicate clean-up commands from ``vcfutils``,
    ``vcfstreamsort`` and ``bgzip``.

    Args:
        data: sample dictionary, read and updated through the ``dd`` accessors.

    Returns:
        The result of ``dd.set_vrn_file(data, out_file)``.
    """
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        # Reuse a previously generated call set
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    # Coerce the configured value to float *before* scaling, so a setting that
    # arrives as a string does not fail on the division.
    # 20 is passed as the second argument -- presumably the default; confirm in dd.
    freq = float(dd.get_min_allele_fraction(data, 20)) / 100.0
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    # Clear R_HOME and put the directory holding Rscript first on PATH
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    # NOTE(review): looks like column indices describing the BED input -- confirm
    # against the VarDict documentation
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        # Placeholders in the template are filled from the local names above
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Exemplo n.º 3
0
def _run_somatic(paired, ref_file, target, out_file):
    """Call somatic variants with octopus for tumor/normal pairs or tumor-only input.

    Parameter choices for low frequency, tumor only and UMI calling follow:
    https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config
    """
    tumor_data = paired.tumor_data
    has_normal = bool(paired.normal_bam)
    reads = paired.tumor_bam
    if has_normal:
        reads = reads + " %s --normal-sample %s" % (paired.normal_bam, paired.normal_name)
    num_cores = dd.get_num_cores(tumor_data)
    # Searching below 0.4% currently leads to long runtimes, so clamp there
    # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979
    expected_freq = max(float(dd.get_min_allele_fraction(tumor_data)) / 100.0, 0.004)
    credible_freq = expected_freq / 4.0
    args = [
        "octopus --threads {cores} --reference {ref_file} --reads {align_bams}",
        "--regions-file {target}",
        "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af}",
        "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20",
        "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05'",
        "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3",
        "--caller cancer",
        "--working-directory {tmp_dir}",
        "-o {tx_out_file} --legacy",
    ]
    if not has_normal:
        # Extra option used only for tumor-only input
        args.append("--tumour-germline-concentration 5")
    if dd.get_umi_type(tumor_data) or _is_umi_consensus_bam(paired.tumor_bam):
        # Extra options and filter expression used for UMI inputs
        args.append("--allow-octopus-duplicates --overlap-masking 0")
        args.append("--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 "
                    "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'")
    template = " ".join(args)
    with file_transaction(tumor_data, out_file) as tx_out_file:
        command = template.format(cores=num_cores,
                                  ref_file=ref_file,
                                  align_bams=reads,
                                  target=target,
                                  min_af_floor=credible_freq,
                                  min_af=expected_freq,
                                  tmp_dir=os.path.dirname(tx_out_file),
                                  tx_out_file=tx_out_file)
        do.run(command, "Octopus somatic calling")
        _produce_compatible_vcf(tx_out_file, tumor_data, is_somatic=True)
    return out_file
Exemplo n.º 4
0
def rnaseq_vardict_variant_calling(data):
    """Call variants on an RNA-seq sample with VarDict and record the VCF on ``data``.

    Output goes to
    ``<work_dir>/variation/rnaseq/vardict/<sample>-vardict.vcf.gz``. When that
    file already exists no command is run and the existing path is recorded.

    VarDict options come from ``vardict._vardict_options_from_config`` using the
    sample's variant regions; a thread option is added when more than one core
    is available. VarDict output is piped through ``teststrandbias.R``,
    ``var2vcf_valid.pl``, the ambiguous-base and duplicate clean-up commands
    from ``vcfutils``, ``vcfstreamsort`` and ``bgzip``, then passed to
    ``vcfutils.bgzip_and_index``.

    Args:
        data: sample dictionary, read and updated through the ``dd`` accessors.

    Returns:
        The result of ``dd.set_vrn_file(data, out_file)``.
    """
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        # Reuse a previously generated call set
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    # Coerce the configured value to float *before* scaling, so a setting that
    # arrives as a string does not fail on the division.
    # 20 is passed as the second argument -- presumably the default; confirm in dd.
    freq = float(dd.get_min_allele_fraction(data, 20)) / 100.0
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    # Only the first element of the returned pair is used here
    opts, _ = vardict._vardict_options_from_config(
        [data],
        data["config"],
        out_file,
        dd.get_variant_regions(data),
        is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        # Placeholders in the template are filled from the local names above
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
Exemplo n.º 5
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call tumor-only somatic variants with pisces.

    The raw VCF is bgzipped and indexed, with the BAM file name rewritten to the
    sample name on the way through.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join(dd.get_sample_name(d) for d in items)))
    tumor_data = paired.tumor_data
    regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(regions, region, out_file,
                                           items=items, do_merge=True)
    min_af = float(dd.get_min_allele_fraction(tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        # Pisces output is collected below as <BAM base name>.vcf
        bam_prefix = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(tumor_data, raw_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            genome_dir = _prep_genome(tx_dir, tumor_data)
            num_cores = dd.get_num_cores(tumor_data)
            template = (
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target} "
                "--maxthreads {cores} --minvf {min_af} --ploidy somatic --gvcf false -o {out_dir}")
            command = template.format(paired=paired,
                                      ref_dir=genome_dir,
                                      target=target,
                                      cores=num_cores,
                                      min_af=min_af,
                                      out_dir=tx_dir)
            do.run(command, "Pisces tumor-only somatic calling")
            pisces_vcf = os.path.join(tx_dir, "%s.vcf" % bam_prefix)
            shutil.move(pisces_vcf, tx_out_file)
        config = tumor_data["config"]
        sample_name = dd.get_sample_name(tumor_data)
        header_cmd = vcfutils.add_contig_to_header_cl(dd.get_ref_file(tumor_data), out_file)
        # Swap the BAM file name for the sample name, then add contig headers
        rename_cmd = "sed 's#%s.bam#%s#' | %s" % (bam_prefix, sample_name, header_cmd)
        vcfutils.bgzip_and_index(raw_file, config, prep_cmd=rename_cmd)
    return vcfutils.bgzip_and_index(out_file, tumor_data["config"])
Exemplo n.º 6
0
def rnaseq_vardict_variant_calling(data):
    """Run VarDict on an RNA-seq sample and store the resulting VCF path on ``data``.

    The call set is written to
    ``<work_dir>/variation/rnaseq/vardict/<sample>-vardict.vcf.gz``; an existing
    file short-circuits the run and is recorded as-is.

    Options for VarDict are built by ``vardict._vardict_options_from_config``
    from the sample's variant regions, with a thread option appended when more
    than one core is available. The VarDict stream is piped through
    ``teststrandbias.R``, ``var2vcf_valid.pl``, the ``vcfutils`` ambiguous-base
    and duplicate clean-up commands, ``vcfstreamsort`` and ``bgzip``, and the
    result is then passed to ``vcfutils.bgzip_and_index``.

    Args:
        data: sample dictionary, read and updated through the ``dd`` accessors.

    Returns:
        The result of ``dd.set_vrn_file(data, out_file)``.
    """
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        # Nothing to do: a previous run already produced the output
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    # Convert the configured value to float first and only then scale it; the
    # previous form divided before converting, which fails for string settings.
    # 20 is passed as the second argument -- presumably the default; confirm in dd.
    freq = float(dd.get_min_allele_fraction(data, 20)) / 100.0
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    # Only the first element of the returned pair is used here
    opts, _ = vardict._vardict_options_from_config([data], data["config"], out_file, dd.get_variant_regions(data),
                                                   is_rnaseq=True)
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        # Placeholders in the template are filled from the local names above
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
Exemplo n.º 7
0
def run(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call tumor-only somatic variants with pisces.

    Variants are emitted down to a tenth of the configured minimum allele
    fraction, with the configured value passed as ``--vffilter``. The raw VCF is
    bgzipped and indexed, with the BAM file name rewritten to the sample name.
    """
    paired = vcfutils.get_paired_bams(align_bams, items)
    assert paired and not paired.normal_bam, (
        "Pisces supports tumor-only variant calling: %s" %
        (",".join(dd.get_sample_name(d) for d in items)))
    tumor_data = paired.tumor_data
    regions = bedutils.population_variant_regions(items)
    target = shared.subset_variant_regions(regions, region, out_file,
                                           items=items, do_merge=True)
    min_af = float(dd.get_min_allele_fraction(tumor_data)) / 100.0
    if not utils.file_exists(out_file):
        # Pisces output is collected below as <BAM base name>.vcf
        bam_prefix = utils.splitext_plus(os.path.basename(paired.tumor_bam))[0]
        raw_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
        with file_transaction(tumor_data, raw_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            genome_dir = _prep_genome(tx_dir, tumor_data)
            num_cores = dd.get_num_cores(tumor_data)
            args = [
                "pisces --bampaths {paired.tumor_bam} --genomepaths {ref_dir} --intervalpaths {target}",
                "--maxthreads {cores} --minvf {emit_min_af} --vffilter {min_af}",
                "--ploidy somatic --gvcf false -o {out_dir}",
                # Recommended filtering for low frequency indels
                # https://github.com/bcbio/bcbio-nextgen/commit/49d0cbb1f6dcbea629c63749e2f9813bd06dcee3#commitcomment-29765373
                "-RMxNFilter 5,9,0.35",
            ]
            # For low frequency UMI tagged variants, set higher variant thresholds
            # https://github.com/Illumina/Pisces/issues/14#issuecomment-399756862
            if min_af < (1.0 / 100.0):
                args.append("--minbasecallquality 30")
            command = " ".join(args).format(paired=paired,
                                            ref_dir=genome_dir,
                                            target=target,
                                            cores=num_cores,
                                            emit_min_af=min_af / 10.0,
                                            min_af=min_af,
                                            out_dir=tx_dir)
            do.run(command, "Pisces tumor-only somatic calling")
            pisces_vcf = os.path.join(tx_dir, "%s.vcf" % bam_prefix)
            shutil.move(pisces_vcf, tx_out_file)
        config = tumor_data["config"]
        sample_name = dd.get_sample_name(tumor_data)
        header_cmd = vcfutils.add_contig_to_header_cl(dd.get_ref_file(tumor_data), out_file)
        # Swap the BAM file name for the sample name, then add contig headers
        rename_cmd = "sed 's#%s.bam#%s#' | %s" % (bam_prefix, sample_name, header_cmd)
        vcfutils.bgzip_and_index(raw_file, config, prep_cmd=rename_cmd)
    return vcfutils.bgzip_and_index(out_file, tumor_data["config"])