def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline that feeds the mpileup command prepared by
    ``samtools.prep_mpileup`` into VarScan ``mpileup2cns`` and redirects the
    VCF output to ``out_file``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        config: Configuration used to locate VarScan and its resources.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Path of the VCF file to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.5.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.5" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.5"):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): the fallback lists -Xmx twice; the first entry was
    # possibly meant to be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                        target_regions=target_regions, want_bcf=False)
        cmd = ("{mpileup} "
               "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
               " --vcf-sample-list {sample_list} --output-vcf --variants "
               "> {out_file}")
        cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                         sample_list=sample_list, out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # The sample list is only needed for this run; remove it even when
        # the command fails so it does not linger next to the output.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline that sends the prepared mpileup command through
    a grep filter and into VarScan ``mpileup2cns``, redirecting the VCF
    output to ``out_file``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        items: Sample items; the configuration is read from
            ``items[0]["config"]``.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Path of the VCF file to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.5.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.5" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.5"):
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                        target_regions=target_regions, want_bcf=False)
        # VarScan fails to generate a header on files that start with
        # zerocoverage calls; strip these with grep, we're not going to
        # call on them
        remove_zerocoverage = "grep -v -P '\t0\t\t$'"
        cmd = ("{mpileup} | {remove_zerocoverage} "
               "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
               " --vcf-sample-list {sample_list} --output-vcf --variants "
               "> {out_file}")
        cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                         jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                         sample_list=sample_list, out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # The sample list is only needed for this run; remove it even when
        # the command fails so it does not linger next to the output.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline that feeds the mpileup command prepared by
    ``samtools.prep_mpileup`` into VarScan ``mpileup2cns`` and redirects the
    VCF output to ``out_file``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        config: Configuration used to locate VarScan and its resources.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Path of the VCF file to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.5.
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.5" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.5"):
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # NOTE(review): the fallback lists -Xmx twice; the first entry was
    # possibly meant to be -Xms750m -- confirm before changing.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xmx750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                        target_regions=target_regions, want_bcf=False)
        cmd = (
            "{mpileup} "
            "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
            " --vcf-sample-list {sample_list} --output-vcf --variants "
            "> {out_file}")
        cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                         sample_list=sample_list, out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # The sample list is only needed for this run; remove it even when
        # the command fails so it does not linger next to the output.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes a filtered mpileup to a temporary file, and when that file is not
    empty runs VarScan ``mpileup2cns`` on it. The uncompressed VCF is written
    next to ``out_file`` (``.vcf.gz`` replaced by ``.vcf``) and is bgzipped
    and indexed afterwards when ``out_file`` ends in ``.gz``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        items: Sample items; the configuration is read from
            ``items[0]["config"]`` and ``items[0]`` is used for the
            temporary directory.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Requested output path (``.vcf`` or ``.vcf.gz``).

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.6" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                        target_regions=target_regions, want_bcf=False)
        # VarScan fails to generate a header on files that start with
        # zerocoverage calls; strip these with grep, we're not going to
        # call on them
        remove_zerocoverage = "grep -v -P '\t0\t\t$'"
        with file_transaction(config, mpfile) as mpfile_tx:
            cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
            do.run(cmd.format(mpileup=mpileup,
                              remove_zerocoverage=remove_zerocoverage,
                              mpfile_tx=mpfile_tx),
                   "mpileup for Varscan")
        if os.path.getsize(mpfile) == 0:
            write_empty_vcf(out_file)
        else:
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                cmd = (
                    "cat {mpfile} "
                    "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                    " --vcf-sample-list {sample_list} --output-vcf --variants "
                    "| {fix_ambig} | vcfuniqalleles > {out_file}")
                do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts,
                                  varscan_jar=varscan_jar, sample_list=sample_list,
                                  fix_ambig=fix_ambig, out_file=out_file),
                       "Varscan", None, [do.file_exists(out_file)])
    finally:
        # Both files are scratch inputs for this run; remove them even when
        # a command fails. The mpileup may not exist if its step failed.
        for tmp_file in (sample_list, mpfile):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)
    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Writes a filtered mpileup to a temporary file, and when that file is not
    empty runs VarScan ``mpileup2cns`` on it. The uncompressed VCF is written
    next to ``out_file`` (``.vcf.gz`` replaced by ``.vcf``) and is bgzipped
    and indexed afterwards when ``out_file`` ends in ``.gz``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        items: Sample items; the configuration is read from
            ``items[0]["config"]`` and ``items[0]`` is used for the
            temporary directory.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Requested output path (``.vcf`` or ``.vcf.gz``).

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
    """
    config = items[0]["config"]
    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.6" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                        target_regions=target_regions, want_bcf=False)
        # VarScan fails to generate a header on files that start with
        # zerocoverage calls; strip these with grep, we're not going to
        # call on them
        remove_zerocoverage = "grep -v -P '\t0\t\t$'"
        with file_transaction(config, mpfile) as mpfile_tx:
            cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
            do.run(cmd.format(mpileup=mpileup,
                              remove_zerocoverage=remove_zerocoverage,
                              mpfile_tx=mpfile_tx),
                   "mpileup for Varscan")
        if os.path.getsize(mpfile) == 0:
            write_empty_vcf(out_file)
        else:
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                cmd = ("cat {mpfile} "
                       "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                       " --vcf-sample-list {sample_list} --output-vcf --variants "
                       "| {fix_ambig} | vcfuniqalleles > {out_file}")
                do.run(cmd.format(mpfile=mpfile, jvm_opts=jvm_opts,
                                  varscan_jar=varscan_jar, sample_list=sample_list,
                                  fix_ambig=fix_ambig, out_file=out_file),
                       "Varscan", None, [do.file_exists(out_file)])
    finally:
        # Both files are scratch inputs for this run; remove them even when
        # a command fails. The mpileup may not exist if its step failed.
        for tmp_file in (sample_list, mpfile):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)
    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a shell pipeline that sends the prepared mpileup command through
    a grep filter and into VarScan ``mpileup2cns``, redirecting the VCF
    output to ``out_file``.

    Args:
        align_bams: BAM files to call; also used to build the sample list.
        ref_file: Reference file forwarded to the mpileup preparation.
        items: Sample items; the configuration is read from
            ``items[0]["config"]``.
        target_regions: Regions forwarded to the mpileup preparation.
        out_file: Path of the VCF file to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # Compare parsed versions; a plain string comparison would order
    # "v2.3.10" before "v2.3.6" and wrongly reject newer releases.
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    jvm_opts = _get_varscan_opts(config)
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth, config,
                                        target_regions=target_regions, want_bcf=False)
        # VarScan fails to generate a header on files that start with
        # zerocoverage calls; strip these with grep, we're not going to
        # call on them
        remove_zerocoverage = "grep -v -P '\t0\t\t$'"
        cmd = (
            "{mpileup} | {remove_zerocoverage} "
            "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
            " --vcf-sample-list {sample_list} --output-vcf --variants "
            "> {out_file}")
        cmd = cmd.format(mpileup=mpileup, remove_zerocoverage=remove_zerocoverage,
                         jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                         sample_list=sample_list, out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # The sample list is only needed for this run; remove it even when
        # the command fails so it does not linger next to the output.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, then tumor), runs VarScan
    ``somatic`` on the pair, fixes the resulting SNP and indel VCFs and
    combines them into the output. Nothing is done when ``out_file`` already
    exists.

    Args:
        align_bams: BAM files from which the tumor/normal pair is selected.
        ref_file: Reference file used for mpileup and for combining VCFs.
        items: Sample items; the configuration is read from
            ``items[0]["config"]``.
        target_regions: Regions forwarded to mpileup and to the combine step.
        out_file: Requested output path (``.vcf`` or ``.vcf.gz``).

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
        ValueError: If no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if file_exists(out_file):
        return

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base, _ = utils.splitext_plus(out_file)
    cleanup_files = []

    def _remove_tmp_files():
        # Delete intermediates plus any compressed/indexed copies of them.
        for extra_file in cleanup_files:
            for suffix in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + suffix):
                    os.remove(extra_file + suffix)

    for fname, mpext in [(paired.normal_bam, "normal"),
                         (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        _remove_tmp_files()
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files
    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        with utils.curdir_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            # Minimum allele fraction: the configured value (default 10) is
            # divided by 100 before being handed to --min-var-freq.
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                           " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                           " --output-vcf --min-coverage 5 --p-value 0.98 "
                           "--strand-filter 1 "
                           "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      base=base, min_af=min_af),
                   "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is handed to do.run before its file is
    # written, so it presumably builds a deferred check rather than
    # returning a bool; test the paths directly instead.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if os.path.exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        _remove_tmp_files()
        return

    # Only merge the files that were actually produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files
    _remove_tmp_files()

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
    _add_reject_flag(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, then tumor), runs VarScan
    ``somatic`` on the pair, fixes the resulting SNP and indel VCFs and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Args:
        align_bams: BAM files from which the tumor/normal pair is selected.
        ref_file: Reference file used for mpileup and for combining VCFs.
        items: Sample items; the configuration is read from
            ``items[0]["config"]``.
        target_regions: Regions forwarded to mpileup and to the combine step.
        out_file: Path of the combined VCF to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
        ValueError: If no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if file_exists(out_file):
        return

    base, _ = os.path.splitext(out_file)
    cleanup_files = []

    def _remove_tmp_files():
        # Delete whichever intermediates exist; VarScan may not have
        # produced both the SNP and the indel file.
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

    for fname, mpext in [(paired.normal_bam, "normal"),
                         (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(out_file)
        _remove_tmp_files()
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files
    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98 "
                   "--strand-filter 1 ")
    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                         normal_tmp_mpileup=normal_tmp_mpileup,
                                         tumor_tmp_mpileup=tumor_tmp_mpileup,
                                         base=base)
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is handed to do.run before its file is
    # written, so it presumably builds a deferred check rather than
    # returning a bool; test the paths directly instead.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if os.path.exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        _remove_tmp_files()
        return

    # Only merge the files that were actually produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files
    _remove_tmp_files()

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, then tumor), runs VarScan
    ``somatic`` on the pair, fixes the resulting SNP and indel VCFs and
    combines them into ``out_file``. Nothing is done when ``out_file``
    already exists.

    Args:
        align_bams: BAM files from which the tumor/normal pair is selected.
        ref_file: Reference file used for mpileup and for combining VCFs.
        items: Sample items; the configuration is read from
            ``items[0]["config"]``.
        target_regions: Regions forwarded to mpileup and to the combine step.
        out_file: Path of the combined VCF to write.

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    # get_paired_bams is unpacked here as (tumor BAM, tumor name,
    # normal BAM, normal name).
    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if file_exists(out_file):
        return

    base, _ = os.path.splitext(out_file)
    cleanup_files = []

    def _remove_tmp_files():
        # Delete whichever intermediates exist; VarScan may not have
        # produced both the SNP and the indel file.
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

    for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file, max_read_depth, config,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(out_file)
        _remove_tmp_files()
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files
    jvm_opts = _get_varscan_opts(config)
    varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                   " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                   " --output-vcf --min-coverage 5 --p-value 0.98")
    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
        varscan_cmd = varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                         normal_tmp_mpileup=normal_tmp_mpileup,
                                         tumor_tmp_mpileup=tumor_tmp_mpileup,
                                         base=base)
        do.run(varscan_cmd, "Varscan", None, None)

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is handed to do.run before its file is
    # written, so it presumably builds a deferred check rather than
    # returning a bool; test the paths directly instead.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if os.path.exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, normal_name, tumor_name)

    if not to_combine:
        write_empty_vcf(out_file)
        _remove_tmp_files()
        return

    # Only merge the files that were actually produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files
    _remove_tmp_files()

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, then tumor), runs VarScan
    ``somatic`` on the pair into transactional SNP/indel outputs, post-
    processes and fixes those VCFs, and combines them into the output.
    Nothing is done when ``out_file`` already exists.

    Args:
        align_bams: BAM files from which the tumor/normal pair is selected.
        ref_file: Reference file used for mpileup and for combining VCFs.
        items: Sample items; the configuration is read from
            ``items[0]["config"]`` and the batch name (for the error
            message) from ``items[0]["metadata"]["batch"]``.
        target_regions: Regions forwarded to mpileup and to the combine step.
        out_file: Requested output path (``.vcf`` or ``.vcf.gz``).

    Raises:
        IOError: If the installed VarScan is older than version 2.3.6.
        ValueError: If no normal BAM is available for the pair.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if file_exists(out_file):
        return

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")
    base, _ = utils.splitext_plus(out_file)
    cleanup_files = []

    def _remove_tmp_files():
        # Delete intermediates plus any compressed/indexed copies of them.
        for extra_file in cleanup_files:
            for suffix in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + suffix):
                    os.remove(extra_file + suffix)

    for fname, mpext in [(paired.normal_bam, "normal"),
                         (paired.tumor_bam, "tumor")]:
        mpfile = "%s-%s.mpileup" % (base, mpext)
        cleanup_files.append(mpfile)
        with file_transaction(config, mpfile) as mpfile_tx:
            mpileup = samtools.prep_mpileup([fname], ref_file, config, max_read_depth,
                                            target_regions=target_regions,
                                            want_bcf=False)
            cmd = "{mpileup} > {mpfile_tx}".format(mpileup=mpileup,
                                                   mpfile_tx=mpfile_tx)
            do.run(cmd, "samtools mpileup", None, [do.file_exists(mpfile_tx)])

    # Sometimes mpileup writes an empty file: in this case we
    # just skip the rest of the analysis (VarScan will hang otherwise)
    if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
        write_empty_vcf(orig_out_file, config)
        _remove_tmp_files()
        return

    # First index is normal, second is tumor
    normal_tmp_mpileup, tumor_tmp_mpileup = cleanup_files
    indel_file = base + ".indel.vcf"
    snp_file = base + ".snp.vcf"
    cleanup_files.append(indel_file)
    cleanup_files.append(snp_file)

    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # VarScan writes to "<tx>-orig"; the follow-up command below reads
            # "<tx>-orig.vcf" and writes the transactional file itself.
            tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
            tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
            # Minimum allele fraction: the configured value (default 10) is
            # divided by 100 before being handed to --min-var-freq.
            min_af = float(utils.get_in(paired.tumor_config,
                                        ("algorithm", "min_allele_fraction"),
                                        10)) / 100.0
            varscan_cmd = (
                "java {jvm_opts} -jar {varscan_jar} somatic"
                " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                " --output-vcf --min-coverage 5 --p-value 0.98 "
                "--strand-filter 1 "
                "--min-var-freq {min_af} ")
            do.run(varscan_cmd.format(jvm_opts=jvm_opts, varscan_jar=varscan_jar,
                                      normal_tmp_mpileup=normal_tmp_mpileup,
                                      tumor_tmp_mpileup=tumor_tmp_mpileup,
                                      tx_snp_in=tx_snp_in,
                                      tx_indel_in=tx_indel_in,
                                      min_af=min_af),
                   "Varscan", None, None)
            for orig_fname, fname in [(tx_snp_in, tx_snp),
                                      (tx_indel_in, tx_indel)]:
                cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                do.run(cmd.format(orig_fname=orig_fname, fix_ambig=fix_ambig,
                                  fname=fname),
                       "Varscan paired fix")

    # VarScan files need to be corrected to match the VCF specification
    # We do this before combining them otherwise merging may fail
    # if there are invalid records
    # NOTE(review): do.file_exists is handed to do.run before its file is
    # written, so it presumably builds a deferred check rather than
    # returning a bool; test the paths directly instead.
    to_combine = []
    for vcf_file in (snp_file, indel_file):
        if os.path.exists(vcf_file):
            to_combine.append(vcf_file)
            _fix_varscan_vcf(vcf_file, paired.normal_name, paired.tumor_name,
                             config)

    if not to_combine:
        write_empty_vcf(orig_out_file, config)
        _remove_tmp_files()
        return

    # Only merge the files that were actually produced.
    out_file = combine_variant_files(to_combine, out_file, ref_file, config,
                                     region=target_regions)

    # Remove cleanup files
    _remove_tmp_files()

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if orig_out_file.endswith(".gz"):
        out_file = bgzip_and_index(out_file, config)
    _add_reject_flag(out_file, config)