def gen_alt_fq(ref, variant_sets, read_count, dest_prefix="input"):
    """
    Generate a batch of simulated reads independently for the variants in each variant set.
    Each set contains a list of variants and a policy describing the cis / trans configuration.
    :param ref: Path to reference fasta
    :param variant_sets: List of sets of variants to inject into the reference
    :param read_count: Approximate number of reads to simulate per variant set (split across the two haplotypes)
    :param dest_prefix: Prefix for the output fastq file names
    :return: Tuple of (R1 fastq path, R2 fastq path)
    """
    reads1 = dest_prefix + "_R1.fq"
    reads2 = dest_prefix + "_R2.fq"
    read1_fh = open(reads1, "w")
    read2_fh = open(reads2, "w")
    # read_count = int(read_count * 1.25)  # Make sure depth in IGV is about whatever read_count is
    for vset in variant_sets:
        chrom = vset['vars'][0].chrom
        hap1, hap2 = collect_alts(vset)

        # Simulate reads from the first haplotype...
        alt_genome = 'alt_genome' + util.randstr() + '.fa'
        alt_genome_size = gen_alt_genome(chrom, hap1, ref, alt_genome, overwrite=True)
        generate_reads(alt_genome, chrom, alt_genome_size / 2, read_count=read_count / 2,
                       read1_fh=read1_fh, read2_fh=read2_fh)
        os.remove(alt_genome)
        os.remove(alt_genome + ".fai")

        # ...and from the second haplotype.
        alt_genome = 'alt_genome' + util.randstr() + '.fa'
        alt_genome_size = gen_alt_genome(chrom, hap2, ref, alt_genome, overwrite=True)
        generate_reads(alt_genome, chrom, alt_genome_size / 2, read_count=read_count / 2,
                       read1_fh=read1_fh, read2_fh=read2_fh)
        os.remove(alt_genome)
        os.remove(alt_genome + ".fai")

    read1_fh.close()
    read2_fh.close()
    return (reads1, reads2)

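# Hypothetical usage sketch for gen_alt_fq (not part of the original module). It mirrors the
# calls made in process_batch() further below; variant_sets is assumed to be built elsewhere
# (e.g. by create_input_vcf), with each set holding a 'vars' list of pysam variant records and
# a cis/trans policy, and the reference path is a placeholder:
#
#     ref_path = "/data/ref/human_g1k_v37.fa"            # placeholder reference fasta
#     reads = gen_alt_fq(ref_path, variant_sets, 250)    # -> ("input_R1.fq", "input_R2.fq")
#     bam = gen_alt_bam(ref_path, conf, reads)           # align the simulated reads into a BAM
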
def call_variant_varscan(bam, orig_genome_path, bed, conf):
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed

    # Build a pileup with samtools, then call variants from it with VarScan
    cmd = (
        conf.get("main", "samtools_path")
        + " mpileup "
        + " -f " + orig_genome_path
        + " -o " + pre_output
        + " " + bedarg
        + " " + bam
    )
    subprocess.check_call(cmd, shell=True)

    cmd2 = (
        "java -Xmx2g -jar " + conf.get("main", "varscan_path")
        + " mpileup2cns " + pre_output
        + " --variants --output-vcf 1 --output-file " + vcfoutput
    )
    output = subprocess.check_output(cmd2, shell=True)
    with open(vcfoutput, "w") as fh:
        fh.write(output)
    return util.bgz_tabix(vcfoutput, conf)

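def _example_conf():
    """
    Hypothetical sketch (not part of the original code) of the conf object the caller and
    normalizer functions here expect: a ConfigParser with a [main] section mapping tool names
    to executable / resource paths. The key names are those read in this file; the paths are
    placeholders. Note that some alternate caller variants below read 'samtools', 'varscan'
    and 'bcftools' without the _path suffix; add whichever keys your config convention uses.
    """
    try:
        import configparser                  # Python 3
    except ImportError:
        import ConfigParser as configparser  # Python 2, matching the iteritems() usage below
    conf = configparser.ConfigParser()
    conf.add_section("main")
    conf.set("main", "samtools_path", "/usr/local/bin/samtools")      # placeholder paths
    conf.set("main", "varscan_path", "/opt/varscan/VarScan.jar")
    conf.set("main", "bcftools_path", "/usr/local/bin/bcftools")
    conf.set("main", "rtg_jar", "/opt/rtg/RTG.jar")
    conf.set("main", "rtg_ref_sdf", "/data/ref/human_g1k_v37.sdf")
    conf.set("main", "ref_genome", "/data/ref/human_g1k_v37.fa")
    return conf
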
def call_variant_mp_bcf(bam, genome, bed, conf):
    pre_output = "mpileup." + util.randstr() + ".vcf"
    vcfoutput = "output-mp." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed

    # samtools mpileup -> uncompressed VCF (-uv), then genotype calling with bcftools call -mv
    cmd = conf.get('main', 'samtools') + ' mpileup ' + ' -f ' + genome + " -uv " + " -o " + pre_output + " " + bedarg + " " + bam
    subprocess.check_call(cmd, shell=True)

    cmd2 = conf.get('main', 'bcftools') + ' call ' + ' -mv ' + ' -o ' + vcfoutput + " " + pre_output
    subprocess.check_call(cmd2, shell=True)
    return util.bgz_tabix(vcfoutput, conf)

def call_variant_varscan_emit_all(bam, genome, bed, conf):
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed

    cmd = conf.get('main', 'samtools') + ' mpileup ' + ' -f ' + genome + " -o " + pre_output + " " + bedarg + " " + bam
    subprocess.check_call(cmd, shell=True)

    # --p-value 0.5 relaxes VarScan's significance threshold so low-confidence calls are also emitted
    cmd2 = "java -Xmx2g -jar " + conf.get('main', 'varscan') + ' mpileup2cns ' + pre_output + \
           ' --p-value 0.5 --variants --output-vcf 1 --output-file ' + vcfoutput
    subprocess.check_call(cmd2, shell=True, stdout=open(vcfoutput, 'w'))
    return util.bgz_tabix(vcfoutput, conf)

def normalize_bcftools(orig_vcf, conf):
    """
    Use bcftools to normalize the variants in the given VCF.
    :param orig_vcf: Path to the bgzipped input VCF
    :param conf: Configuration providing the bcftools and ref_genome paths
    :return: Path to the bgzipped, tabix-indexed normalized VCF
    """
    norm_orig_vcf = orig_vcf.replace(".vcf.gz", ".norm.bcftools" + util.randstr() + ".vcf")
    norm_orig_cmd = conf.get('main', 'bcftools') + " norm " + " -c w -f " + conf.get('main', 'ref_genome') + " " + orig_vcf + " -o " + norm_orig_vcf
    subprocess.check_call(norm_orig_cmd.split())
    return util.bgz_tabix(norm_orig_vcf, conf)

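# Hypothetical usage (not part of the original code): normalize a caller's bgzipped output
# before comparison. "calls.vcf.gz" is a placeholder path; conf must supply the bcftools and
# ref_genome keys read above.
#
#     normed = normalize_bcftools("calls.vcf.gz", conf)
#     # normed points at a bgzipped, tabix-indexed file named calls.norm.bcftools<random>.vcf.gz
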
def call_variant_mp_bcf(bam, orig_genome_path, bed, conf):
    # Alternate version of the samtools / bcftools caller above; it reads the
    # samtools_path / bcftools_path config keys instead of samtools / bcftools.
    pre_output = "mpileup." + util.randstr() + ".vcf"
    vcfoutput = "output-mp." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed

    cmd = (
        conf.get("main", "samtools_path")
        + " mpileup "
        + " -f " + orig_genome_path
        + " -uv "
        + " -o " + pre_output
        + " " + bedarg
        + " " + bam
    )
    subprocess.check_call(cmd, shell=True)

    cmd2 = conf.get("main", "bcftools_path") + " call " + " -mv " + " -o " + vcfoutput + " " + pre_output
    subprocess.check_call(cmd2, shell=True)
    return util.bgz_tabix(vcfoutput, conf)

def call_variant_rtg(bam, orig_genome_path, bed, conf):
    output_dir = "rtg-output-" + util.randstr()
    vcfoutput = output_dir + "/snps.vcf.gz"
    cmd = [
        "java",
        "-Djava.io.tmpdir=.",
        "-jar",
        conf.get("main", "rtg_jar"),
        "snp",
        "-t",
        conf.get("main", "rtg_ref_sdf"),
        "--bed-regions",
        bed,
        "-o",
        output_dir,
        bam,
    ]
    subprocess.check_output(cmd)
    return vcfoutput

def call_variant_rtg(bam, genome, bed, conf):
    # Variant of the RTG caller above that silences RTG's console output
    output_dir = "rtg-output-" + util.randstr()
    vcfoutput = output_dir + "/snps.vcf.gz"
    cmd = ["java", "-Djava.io.tmpdir=.", "-jar", conf.get('main', 'rtg_jar'),
           "snp", "-t", conf.get('main', 'rtg_ref_sdf'),
           "--bed-regions", bed, "-o", output_dir, bam]
    subprocess.check_call(cmd, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
    return vcfoutput

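def _example_run_all_callers(bam, ref, bed, conf):
    """
    Illustrative sketch (not part of the original code): the caller functions above share a
    (bam, reference, bed, conf) signature and each returns a path to a VCF, so a simple
    name -> function mapping is enough to run every caller over one BAM, roughly what a
    call_variants() step needs to produce.
    """
    callers = {
        "varscan": call_variant_varscan,
        "mpileup-bcftools": call_variant_mp_bcf,
        "rtg": call_variant_rtg,
    }
    # Returns {caller_name: vcf_path}
    return dict((name, caller(bam, ref, bed, conf)) for name, caller in callers.items())
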
def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
    """
    Process the given batch of variants by creating a fake 'genome' with the variants,
    simulating reads from it, aligning the reads to make a bam file, then using different
    callers, variant normalizers, and variant comparison methods to generate results.
    The results are just written to a big text file, which needs to be parsed by a
    separate utility to generate anything readable.
    :param vcf: .vcf file containing variants to simulate
    :param batchname: Name of this batch, used to label the temporary directory
    :param gt_policy: Policy describing whether variants should be simulated as hets or homs
    :param ex_snp: Extra SNP information passed along to create_input_vcf when building the input VCF
    :param keep_tmpdir: If True, never delete the temporary directory
    :param read_depth: Approximate number of reads to simulate per variant set
    :param reads: Previously generated fastq reads; if None, reads are simulated from the variants
    :return:
    """
    raw_vars = list(pysam.VariantFile(vcf))
    tmpdir_del_policy = util.TempDir.DELETE_NO_EXCEPTION
    if keep_tmpdir:
        tmpdir_del_policy = util.TempDir.NEVER_DELETE
    tmp_dirname = batchname + "-" + util.randstr()

    with util.TempDir(dirname=tmp_dirname, del_policy=tmpdir_del_policy):
        ref_path = self.conf.get('main', 'ref_genome')
        var_results = defaultdict(dict)
        orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
        bed = util.vars_to_bed(variant_sets)

        if reads is None:
            reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
        bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

        caller_variants = self.call_variants(bam, bed)
        bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
        var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

        for normalizer_name, normalizer in self.normalizers.iteritems():
            logging.info("--> Running normalizer " + normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)

            for caller in caller_variants:
                normed_caller_vcf = normalizer(caller_variants[caller], self.conf)

                for comparator_name, comparator in self.comparators.iteritems():
                    logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    single_results = split_results(all_results, bed)

                    for region, result in zip(util.read_regions(bed), single_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))

                        result = compare_single_var(result, region, normed_orig_vcf, normed_caller_vcf,
                                                    comparator,
                                                    "/".join(str(i) for i in match_vars[0].samples[0]['GT']),
                                                    self.conf)
                        key = var_key(match_vars)
                        if caller not in var_results[key]:
                            var_results[key][caller] = defaultdict(dict)
                        var_results[key][caller][normalizer_name][comparator_name] = result

        # Iterate over all results and write to standard output. We do this here instead of within
        # the loops above because it keeps results organized by variant, which makes them easier to look at.
        self.reporter.write_output(var_results, var_quals, bam_stats)

def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
    raw_orig_vcf = os.path.abspath(raw_orig_vcf)
    raw_test_vcf = os.path.abspath(raw_test_vcf)
    orig_vars = list(pysam.VariantFile(raw_orig_vcf))
    tmp_dirname = util.strip_extensions(raw_test_vcf, ['gz', 'vcf']) + "-vcomp-" + util.randstr()

    with util.TempDir(dirname=tmp_dirname):
        orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
        test_vcf = util.remove_halfcalls(raw_test_vcf)
        test_vcf = util.bgz_tabix(test_vcf, self.conf)
        caller_name = util.strip_extensions(test_vcf, ['gz', 'vcf'])
        bed = util.vars_to_bed(orig_vars)

        var_results = defaultdict(dict)
        var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
        bamstats = defaultdict(dict)

        for normalizer_name, normalizer in self.normalizers.iteritems():
            logging.info("--> Running normalizer " + normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)
            normed_caller_vcf = normalizer(test_vcf, self.conf)

            for comparator_name, comparator in self.comparators.iteritems():
                logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                single_results = split_results(all_results, bed)

                for region, result in zip(util.read_regions(bed), single_results):
                    match_vars = util.find_matching_var(orig_vcf, region)
                    if not match_vars:
                        raise ValueError('Unable to find original variant from region ' + str(region))

                    result = compare_single_var(result, region, normed_orig_vcf, normed_caller_vcf,
                                                comparator,
                                                "/".join(str(i) for i in match_vars[0].samples[0]['GT']),
                                                self.conf)
                    key = var_key(match_vars)
                    if caller_name not in var_results[key]:
                        var_results[key][caller_name] = defaultdict(dict)
                    var_results[key][caller_name][normalizer_name][comparator_name] = result
                    bamstats[key] = {}

        # Iterate over all results and write to standard output. We do this here instead of within
        # the loops above because it keeps results organized by variant, which makes them easier to look at.
        self.reporter.write_output(var_results, var_quals, bamstats)