def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    Builds a samtools mpileup command for ``align_bams``, pipes it into
    VarScan ``mpileup2cns`` and redirects the resulting VCF to ``out_file``.

    Args:
        align_bams: BAM files handed to ``samtools.prep_mpileup`` and to
            ``_create_sample_list``.
        ref_file: reference handed to ``samtools.prep_mpileup``.
        config: configuration used to look up the VarScan jar, its version
            and the ``varscan`` resources (``jvm_opts``).
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path the shell pipeline redirects its VCF output to.

    Raises:
        IOError: when the reported VarScan version compares below "v2.3.5".
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # NOTE(review): plain string comparison, so ordering is lexicographic
    # (a hypothetical "v2.3.10" would sort below "v2.3.5") -- confirm the
    # format returned by jar_versioner before relying on it for new releases.
    if version < "v2.3.5":
        raise IOError("Please install version 2.3.5 or better of VarScan with support "
                      "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # Default JVM sizing: 750m initial heap, 2g maximum heap.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(align_bams, ref_file, max_read_depth,
                                        config, target_regions=target_regions,
                                        want_bcf=False)
        cmd = ("{mpileup} "
               "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
               "  --vcf-sample-list {sample_list} --output-vcf --variants "
               "> {out_file}")
        # Explicit keywords instead of **locals() so the template's inputs
        # are visible and unrelated locals cannot leak into the command.
        cmd = cmd.format(mpileup=mpileup, jvm_opts=jvm_opts,
                         varscan_jar=varscan_jar, sample_list=sample_list,
                         out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # The sample list is only needed while VarScan runs; remove it even
        # when the pipeline fails so it is not left behind.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.

    A samtools mpileup command is prepared for ``align_bams`` and piped into
    VarScan ``mpileup2cns``; the VCF it prints is redirected to ``out_file``.

    Args:
        align_bams: BAM files given to ``samtools.prep_mpileup`` and
            ``_create_sample_list``.
        ref_file: reference given to ``samtools.prep_mpileup``.
        config: configuration used to resolve the VarScan jar, its version
            and the ``varscan`` resources (``jvm_opts``).
        target_regions: forwarded to ``samtools.prep_mpileup``.
        out_file: path receiving the redirected VCF output.

    Raises:
        IOError: when the reported VarScan version compares below "v2.3.5".
    """
    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    # NOTE(review): string comparison is lexicographic, not numeric; verify
    # the version format if multi-digit components ever appear.
    if version < "v2.3.5":
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))
    resources = config_utils.get_resources("varscan", config)
    # Default JVM sizing: 750m initial heap, 2g maximum heap.
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    sample_list = _create_sample_list(align_bams, out_file)
    try:
        mpileup = samtools.prep_mpileup(
            align_bams, ref_file, max_read_depth, config,
            target_regions=target_regions, want_bcf=False)
        cmd = (
            "{mpileup} "
            "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
            "  --vcf-sample-list {sample_list} --output-vcf --variants "
            "> {out_file}")
        # Name every template field explicitly rather than passing locals().
        cmd = cmd.format(
            mpileup=mpileup,
            jvm_opts=jvm_opts,
            varscan_jar=varscan_jar,
            sample_list=sample_list,
            out_file=out_file)
        do.run(cmd, "Varscan", None, [do.file_exists(out_file)])
    finally:
        # Clean up the temporary sample list whether or not VarScan succeeded.
        if os.path.exists(sample_list):
            os.remove(sample_list)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       dbsnp=None, region=None, out_file=None):
    """Shared driver that indexes inputs and dispatches to a variant caller.

    Every BAM in ``align_bams`` is indexed through the Broad runner first.
    When ``out_file`` does not exist yet, either an empty VCF is written
    (configured variant regions yield no region file, or some BAM has no
    aligned reads in ``region``) or ``call_fn`` is invoked inside a file
    transaction. The output path is returned in every case.
    """
    runner = broad.runner_from_config(config)
    for bam in align_bams:
        runner.run_fn("picard_index", bam)

    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]

    # Nothing to do when a previous run already produced the output.
    if file_exists(out_file):
        return out_file

    message = "Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=os.path.basename(align_bams[0]))
    logger.info(message)

    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Evaluated first so the per-BAM read check below is skipped when the
    # region file is already known to be missing.
    regions_missing = (variant_regions is not None
                       and not os.path.isfile(target_regions))
    if regions_missing or any(not has_aligned_reads(bam, region)
                              for bam in align_bams):
        write_empty_vcf(out_file)
    else:
        with file_transaction(out_file) as tx_out_file:
            call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
def _run_cortex_on_region(region, align_bam, ref_file, out_file_base, config):
    """Run cortex on a specified chromosome start/end region.

    Args:
        region: three-element sequence (the values are joined as
            ``"{0}-{1}-{2}"`` to name the region's working directory).
        align_bam: BAM file reads are extracted from.
        ref_file: reference used to build the region-local reference.
        out_file_base: path whose directory and base name are used to derive
            the per-region output location.
        config: configuration; ``config["program"]`` must provide ``cortex``
            and ``stampy`` entries, ``vcftools`` is read as well.

    Returns:
        Path of the per-region VCF. An empty VCF is written when fewer than
        ``min_reads`` reads are counted or cortex produces no output.

    Raises:
        ValueError: when the cortex or stampy location is not configured.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config["program"].get("cortex")
    stampy_dir = config["program"].get("stampy")
    vcftools_dir = config["program"].get("vcftools")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError("cortex_var requires path to pre-built cortex and stampy")
    # Argument unpacking replaces the deprecated apply() builtin.
    region_str = "{0}-{1}-{2}".format(*region)
    base_dir = safe_makedir(os.path.join(os.path.dirname(out_file_base), region_str))
    out_vcf_base = os.path.join(base_dir, "{0}-{1}".format(
        os.path.splitext(os.path.basename(out_file_base))[0], region_str))
    out_file = "{0}.vcf".format(out_vcf_base)
    if file_exists(out_file):
        return out_file
    fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
    if _count_fastq_reads(fastq, min_reads) < min_reads:
        # Too few reads in the region to attempt assembly.
        write_empty_vcf(out_file)
        return out_file
    local_ref, genome_size = _get_local_ref(region, ref_file, out_vcf_base)
    indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers)
    cortex_out = _run_cortex(fastq, indexes,
                             {"kmers": kmers, "genome_size": genome_size,
                              "sample": _get_sample_name(align_bam)},
                             out_vcf_base,
                             {"cortex": cortex_dir, "stampy": stampy_dir,
                              "vcftools": vcftools_dir},
                             config)
    if cortex_out:
        _remap_cortex_out(cortex_out, region, out_file)
    else:
        write_empty_vcf(out_file)
    return out_file
def run_cortex(align_bams, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Args:
        align_bams: list that must contain exactly one BAM file.
        ref_file: reference passed on to the per-region and combine steps.
        config: configuration; ``config["algorithm"]["variant_regions"]``
            must be set.
        dbsnp: accepted but not used in this function.
        region: optional string; when given, its "." characters are replaced
            by "_" to name a working directory next to ``out_file``.
        out_file: output VCF path; defaults to ``<bam base>-cortex.vcf``.

    Returns:
        Path of the output VCF.

    Raises:
        NotImplementedError: when more than one (or no) BAM is supplied.
        ValueError: when no variant regions are configured.
    """
    if len(align_bams) == 1:
        align_bam = align_bams[0]
    else:
        raise NotImplementedError("Need to add multisample calling for cortex_var")
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(os.path.join(os.path.dirname(out_file),
                                             region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # One cortex run per line of the region file; the first three
            # tab-separated columns identify the region.
            with open(target_regions) as in_handle:
                regional_vcfs = [_run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                                       ref_file, work_dir, out_file, config)
                                 for x in in_handle]
            # Argument unpacking replaces the deprecated apply() builtin:
            # "<base><ext>" becomes "<base>-raw<ext>".
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def run_samtools(align_bam, ref_file, config, dbsnp=None, region=None,
                 out_file=None):
    """Call SNPs and indels using samtools mpileup together with bcftools.

    The BAM is indexed first. If the output does not exist yet, variants are
    called inside a file transaction, unless variant regions are configured
    and no region file is available, in which case an empty VCF is written.
    Returns the output path.
    """
    broad.runner_from_config(config).run_fn("picard_index", align_bam)

    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    bam_name = os.path.basename(align_bam)
    logger.info("Genotyping with samtools: {region} {fname}".format(
        region=region, fname=bam_name))

    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Call variants when no regions are configured or the region file exists.
    can_call = variant_regions is None or os.path.isfile(target_regions)
    if can_call:
        with file_transaction(out_file) as tx_out_file:
            _call_variants_samtools(align_bam, ref_file, config,
                                    target_regions, tx_out_file)
    else:
        write_empty_vcf(out_file)
    return out_file
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Args:
        align_bam: BAM file to call variants on.
        ref_file: reference passed on to the per-region and combine steps.
        config: configuration; ``config["algorithm"]["variant_regions"]``
            must be set.
        dbsnp: accepted but not used in this function.
        region: optional string; when given, its "." characters are replaced
            by "_" to name a working directory next to ``out_file``.
        out_file: output VCF path; defaults to ``<bam base>-cortex.vcf``.

    Returns:
        Path of the output VCF.

    Raises:
        ValueError: when no variant regions are configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if region is not None:
        work_dir = safe_makedir(
            os.path.join(os.path.dirname(out_file), region.replace(".", "_")))
    else:
        work_dir = os.path.dirname(out_file)
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only support regional variant calling with cortex_var: set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # One cortex run per line of the region file, keyed by the first
            # three tab-separated columns.
            with open(target_regions) as in_handle:
                regional_vcfs = [
                    _run_cortex_on_region(x.strip().split("\t")[:3], align_bam,
                                          ref_file, work_dir, out_file, config)
                    for x in in_handle
                ]
            # Argument unpacking replaces the deprecated apply() builtin:
            # "<base><ext>" becomes "<base>-raw<ext>".
            combine_file = "{0}-raw{1}".format(*os.path.splitext(out_file))
            _combine_variants(regional_vcfs, combine_file, ref_file, config)
            _select_final_variants(combine_file, out_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Common preparation (indexing, region handling) around a variant caller.

    All BAMs are indexed via the Broad runner. If ``out_file`` is absent,
    an empty VCF is produced when the configured variant regions resolve to
    a string path that is not a file, or when any BAM has no aligned reads
    in ``region``; otherwise ``call_fn`` runs inside a file transaction.
    ``assoc_files`` is part of the signature but not read here.
    Returns the output path.
    """
    runner = broad.runner_from_config(config)
    for bam in align_bams:
        runner.run_fn("picard_index", bam)

    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    log_line = "Genotyping with {name}: {region} {fname}".format(
        name=name, region=region, fname=os.path.basename(align_bams[0]))
    logger.info(log_line)

    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    # Only a string result is treated as a file path that may be missing.
    region_file_missing = (variant_regions is not None
                           and isinstance(target_regions, basestring)
                           and not os.path.isfile(target_regions))
    # Short-circuit keeps the per-BAM read check from running when the
    # region file is already known to be missing.
    if region_file_missing or any(not realign.has_aligned_reads(bam, region)
                                  for bam in align_bams):
        write_empty_vcf(out_file)
    else:
        with file_transaction(out_file) as tx_out_file:
            call_fn(align_bams, ref_file, config, target_regions, tx_out_file)
    return out_file
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Entry point for regional de-novo variant calling using cortex_var.

    Indexes the BAM, runs cortex once per line of the region file derived
    from the configured variant regions and combines the per-region VCFs
    into ``out_file``. An empty VCF is written when no region file exists.
    Returns the output path.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file

    broad_runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if not variant_regions:
        raise ValueError(
            "Only regional variant calling with cortex_var is supported. Set variant_regions"
        )

    target_regions = subset_variant_regions(variant_regions, region, out_file)
    if not os.path.isfile(target_regions):
        write_empty_vcf(out_file)
        return out_file

    regional_vcfs = []
    with open(target_regions) as in_handle:
        for line in in_handle:
            # First three tab-separated columns identify the region.
            coords = line.strip().split("\t")[:3]
            regional_vcfs.append(
                _run_cortex_on_region(coords, align_bam, ref_file, out_file, config))
    combine_variant_files(regional_vcfs, out_file, ref_file, config)
    return out_file
def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base, config):
    """Run cortex on a specified chromosome start/end region.

    Intermediate files are written below ``<work_dir>/<region string>``,
    which is removed again when the function exits; the resulting VCF is
    placed directly in ``work_dir`` so it survives that cleanup.

    Args:
        region: three-element sequence (joined as ``"{0}-{1}-{2}"``).
        align_bam: BAM file reads are extracted from.
        ref_file: reference used to build the region-local reference.
        work_dir: directory holding the per-region output and scratch space.
        out_file_base: path whose base name prefixes the per-region files.
        config: configuration; ``config["program"]`` must provide ``cortex``
            and ``stampy`` entries, ``vcftools`` is read as well.

    Returns:
        Path of the per-region VCF. An empty VCF is written when fewer than
        ``min_reads`` reads are counted or cortex produces no output.

    Raises:
        ValueError: when the cortex or stampy location is not configured.
    """
    kmers = [31, 51, 71]
    min_reads = 1750
    cortex_dir = config["program"].get("cortex")
    stampy_dir = config["program"].get("stampy")
    vcftools_dir = config["program"].get("vcftools")
    if cortex_dir is None or stampy_dir is None:
        raise ValueError(
            "cortex_var requires path to pre-built cortex and stampy")
    # Argument unpacking replaces the deprecated apply() builtin.
    region_str = "{0}-{1}-{2}".format(*region)
    base_dir = safe_makedir(os.path.join(work_dir, region_str))
    try:
        out_vcf_base = os.path.join(
            base_dir, "{0}-{1}".format(
                os.path.splitext(os.path.basename(out_file_base))[0],
                region_str))
        out_file = os.path.join(
            work_dir, os.path.basename("{0}.vcf".format(out_vcf_base)))
        if not file_exists(out_file):
            fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
            if _count_fastq_reads(fastq, min_reads) < min_reads:
                # Too few reads in the region to attempt assembly.
                write_empty_vcf(out_file)
            else:
                local_ref, genome_size = _get_local_ref(
                    region, ref_file, out_vcf_base)
                indexes = _index_local_ref(local_ref, cortex_dir,
                                           stampy_dir, kmers)
                cortex_out = _run_cortex(
                    fastq, indexes,
                    {
                        "kmers": kmers,
                        "genome_size": genome_size,
                        "sample": _get_sample_name(align_bam)
                    },
                    out_vcf_base,
                    {
                        "cortex": cortex_dir,
                        "stampy": stampy_dir,
                        "vcftools": vcftools_dir
                    },
                    config)
                if cortex_out:
                    _remap_cortex_out(cortex_out, region, out_file)
                else:
                    write_empty_vcf(out_file)
    finally:
        # Scratch directory is dropped on success and on failure alike.
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    return out_file
def run_samtools(align_bam, ref_file, config, dbsnp=None, region=None,
                 out_file=None):
    """Find SNPs and indels via samtools mpileup and bcftools.

    Indexes the BAM, then (only when the output is missing) either writes an
    empty VCF because configured variant regions produced no region file, or
    runs the samtools caller inside a file transaction. Returns the output
    path.
    """
    runner = broad.runner_from_config(config)
    runner.run_fn("picard_index", align_bam)

    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]

    if file_exists(out_file):
        return out_file

    log_msg = "Genotyping with samtools: {region} {fname}".format(
        region=region, fname=os.path.basename(align_bam))
    logger.info(log_msg)

    variant_regions = config["algorithm"].get("variant_regions", None)
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    no_region_file = (variant_regions is not None
                      and not os.path.isfile(target_regions))
    if no_region_file:
        write_empty_vcf(out_file)
        return out_file

    with file_transaction(out_file) as tx_out_file:
        _call_variants_samtools(align_bam, ref_file, config,
                                target_regions, tx_out_file)
    return out_file
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Drive region-by-region de-novo variant calling with cortex_var.

    When the output is missing, the BAM is indexed and cortex is run for each
    line of the region file obtained from the configured variant regions;
    the per-region VCFs are then combined into ``out_file``. Without a region
    file an empty VCF is written. Returns the output path.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        bam_root = os.path.splitext(align_bam)[0]
        out_file = "%s-cortex.vcf" % bam_root
    if file_exists(out_file):
        return out_file

    broad_runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if not variant_regions:
        raise ValueError("Only regional variant calling with cortex_var is supported. Set variant_regions")

    target_regions = subset_variant_regions(variant_regions, region, out_file)
    if os.path.isfile(target_regions):
        per_region = []
        with open(target_regions) as in_handle:
            for row in in_handle:
                # First three tab-separated fields identify the region.
                fields = row.strip().split("\t")[:3]
                per_region.append(_run_cortex_on_region(
                    fields, align_bam, ref_file, out_file, config))
        combine_variant_files(per_region, out_file, ref_file, config)
    else:
        write_empty_vcf(out_file)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Args:
        align_bams: BAM files; the first one names the default output.
        items, ref_file, assoc_files: forwarded to ``_mutect_call_prep``.
        region: region to call in; when it is not a list/tuple, every BAM is
            checked for aligned reads in it before MuTect is run.
        out_file: output VCF path; defaults to
            ``<first bam base>-paired-variants.vcf``.

    Returns:
        Path of the output VCF on every code path, including the cases where
        only an empty VCF is written (no aligned reads, or MuTect failed with
        an exception listed in ``_PASS_EXCEPTIONS``).

    Raises:
        CalledProcessError: re-raised when MuTect fails with an exception
            that is not listed in ``_PASS_EXCEPTIONS``.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(
            align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner, params = \
        _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                          region, out_file)
    if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):
        write_empty_vcf(out_file)
        # Return the path (not None) so callers see the same contract as on
        # the regular path.
        return out_file
    with file_transaction(out_file) as tx_out_file:
        # Rationale: MuTect writes another table to stdout,
        # which we don't need
        params += ["--vcf", tx_out_file, "-o", os.devnull]
        try:
            broad_runner.run_mutect(params)
        except CalledProcessError as error:
            java_exception = _parse_gatk_java_error_string(error.cmd)
            # HACK: Currently MuTect bails out on certain small BAM files
            # Until the issue is fixed by Broad, this specific exception
            # will be ignored. All the other exceptions will be raised
            # correctly.
            if java_exception not in _PASS_EXCEPTIONS:
                raise
            # Leaving the ``with`` block normally lets the transaction
            # finish with the empty VCF in place.
            write_empty_vcf(tx_out_file)
    return out_file