def collect_var_quals(self, caller_vars, bed, orig_vcf):
    """
    Look up, for every input variant and every caller, the quality of the
    matching called variant.

    :param caller_vars: Mapping of caller name -> path of that caller's VCF
    :param bed: BED file whose regions are read with util.read_regions
    :param orig_vcf: Original (input) VCF used to find the variant for each region
    :return: Nested dict indexed as [variant key][caller name] -> quality

    Per the original notes, variants a caller missed (ref calls) are expected
    to end up with a quality of MISSING_QUAL; that is presumably handled by
    find_qual, which is not visible here.
    """
    quals_by_variant = defaultdict(dict)
    for interval in util.read_regions(bed):
        input_var = util.find_matching_var(orig_vcf, interval)
        variant_id = var_key(input_var)
        for caller_name, caller_vcf_path in caller_vars.items():
            # The caller VCF is reopened for every region/caller pair
            with pysam.VariantFile(caller_vcf_path) as caller_vcf:
                called = util.find_matching_var(caller_vcf, interval)
                quals_by_variant[variant_id][caller_name] = find_qual(called)
    return quals_by_variant
def collect_bam_stats(self, bam, bed, orig_vcf):
    """
    Gather BAM-level statistics for every region of the BED file.

    :param bam: BAM handed to bam_simulation.gen_bam_stats
    :param bed: BED file whose regions are read with util.read_regions
    :param orig_vcf: Original (input) VCF used to find the variant for each region
    :return: Dict mapping the var_key of each region's original variant to
             the statistics generated for that region
    """
    stats_by_variant = defaultdict(dict)
    for interval in util.read_regions(bed):
        input_var = util.find_matching_var(orig_vcf, interval)
        # Resolve the key before generating stats, as the original did
        variant_id = var_key(input_var)
        region_stats = bam_simulation.gen_bam_stats(bam, interval)
        stats_by_variant[variant_id] = region_stats
    return stats_by_variant
def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
    """
    Compare an existing test VCF against an original (input) VCF using every
    configured normalizer / comparator combination, then report the results.

    No reads are simulated and no callers are run here: the test VCF plays
    the role of a single caller whose name is the test VCF's file name with
    the 'gz' / 'vcf' extensions stripped.

    :param raw_orig_vcf: Path to the VCF holding the original variants
    :param raw_test_vcf: Path to the VCF holding the calls to evaluate
    :return: None; results are passed to self.reporter.write_output
    :raises ValueError: If no original variant can be found for a BED region
    """
    raw_orig_vcf = os.path.abspath(raw_orig_vcf)
    raw_test_vcf = os.path.abspath(raw_test_vcf)

    # Read the records with a context manager so the handle is closed deterministically
    with pysam.VariantFile(raw_orig_vcf) as orig_handle:
        orig_vars = list(orig_handle)

    tmp_dirname = util.strip_extensions(raw_test_vcf, ['gz', 'vcf']) + "-vcomp-" + util.randstr()
    with util.TempDir(dirname=tmp_dirname):
        orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
        test_vcf = util.remove_halfcalls(raw_test_vcf)
        test_vcf = util.bgz_tabix(test_vcf, self.conf)
        caller_name = util.strip_extensions(test_vcf, ['gz', 'vcf'])
        bed = util.vars_to_bed(orig_vars)

        var_results = defaultdict(dict)
        var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
        bamstats = defaultdict(dict)

        # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only)
        for normalizer_name, normalizer in self.normalizers.items():
            logging.info("--> Running normalizer %s", normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)
            normed_caller_vcf = normalizer(test_vcf, self.conf)

            for comparator_name, comparator in self.comparators.items():
                logging.info("--> Running comparator %s (normalizer %s)", comparator_name, normalizer_name)
                all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                single_results = split_results(all_results, bed)

                for region, result in zip(util.read_regions(bed), single_results):
                    match_vars = util.find_matching_var(orig_vcf, region)
                    if not match_vars:
                        raise ValueError('Unable to find original variant from region ' + str(region))

                    # Genotype of the first matching variant's first sample, e.g. "0/1"
                    genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                    result = compare_single_var(result,
                                                region,
                                                normed_orig_vcf,
                                                normed_caller_vcf,
                                                comparator,
                                                genotype,
                                                self.conf)

                    key = var_key(match_vars)
                    if caller_name not in var_results[key]:
                        var_results[key][caller_name] = defaultdict(dict)
                    var_results[key][caller_name][normalizer_name][comparator_name] = result
                    # No BAM statistics are computed in this path; record an empty entry per variant
                    bamstats[key] = {}

        # Iterate over all results and write to standard output. We do this here instead of within the
        # loops above because it keeps results organized by variant, which makes them easier to look at.
        # NOTE(review): reporting is done inside the temp-dir context -- confirm this is the intended scope.
        self.reporter.write_output(var_results, var_quals, bamstats)
def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
    """
    Process the given batch of variants by creating a fake 'genome' with the variants, simulating reads
    from it, aligning the reads to make a bam file, then using different callers, variant normalizers,
    and variant comparison methods to generate results. The results are handed to the reporter, whose
    output needs to be parsed by a separate utility to generate anything readable.

    :param vcf: .vcf file containing variants to simulate
    :param batchname: Prefix of the temporary working directory name (a random suffix is appended)
    :param gt_policy: Genotype policy forwarded to self.create_input_vcf
    :param ex_snp: Optional value forwarded to self.create_input_vcf (presumably an extra SNP to
                   add alongside each variant -- confirm against create_input_vcf)
    :param keep_tmpdir: If True the temporary directory is never deleted; otherwise it is deleted
                        only when no exception occurred
    :param read_depth: Depth passed to bam_simulation.gen_alt_fq when reads are simulated
    :param reads: Pre-existing reads to align; when None, reads are simulated from the variants
    :return: None; results are passed to self.reporter.write_output
    :raises ValueError: If no original variant can be found for a BED region
    """
    # Read the records with a context manager so the handle is closed deterministically
    with pysam.VariantFile(vcf) as vcf_handle:
        raw_vars = list(vcf_handle)

    tmpdir_del_policy = util.TempDir.NEVER_DELETE if keep_tmpdir else util.TempDir.DELETE_NO_EXCEPTION
    tmp_dirname = batchname + "-" + util.randstr()

    with util.TempDir(dirname=tmp_dirname, del_policy=tmpdir_del_policy):
        ref_path = self.conf.get('main', 'ref_genome')
        var_results = defaultdict(dict)

        orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
        bed = util.vars_to_bed(variant_sets)

        # Only simulate reads when the caller did not supply any
        if reads is None:
            reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
        bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

        caller_variants = self.call_variants(bam, bed)
        bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
        var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

        # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only)
        for normalizer_name, normalizer in self.normalizers.items():
            logging.info("--> Running normalizer %s", normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)

            for caller in caller_variants:
                normed_caller_vcf = normalizer(caller_variants[caller], self.conf)

                for comparator_name, comparator in self.comparators.items():
                    logging.info("--> Running comparator %s (normalizer %s)", comparator_name, normalizer_name)
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    single_results = split_results(all_results, bed)

                    for region, result in zip(util.read_regions(bed), single_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))

                        # Genotype of the first matching variant's first sample, e.g. "0/1"
                        genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                        result = compare_single_var(result,
                                                    region,
                                                    normed_orig_vcf,
                                                    normed_caller_vcf,
                                                    comparator,
                                                    genotype,
                                                    self.conf)

                        key = var_key(match_vars)
                        if caller not in var_results[key]:
                            var_results[key][caller] = defaultdict(dict)
                        var_results[key][caller][normalizer_name][comparator_name] = result

        # Iterate over all results and write to standard output. We do this here instead of within the
        # loops above because it keeps results organized by variant, which makes them easier to look at.
        # NOTE(review): reporting is done inside the temp-dir context -- confirm this is the intended scope.
        self.reporter.write_output(var_results, var_quals, bam_stats)