def main(args):
    """
    Respond to command line args, check for basic config errors, and perform analyses.

    Exactly one of three modes is run, checked in this order:

    1. ``args.generate_fqs``: generate reads for the single VCF in ``args.vcf`` and return.
    2. ``args.test_vcf``: compare ``args.test_vcf`` against the single VCF in ``args.vcf`` and return.
    3. Otherwise: run ``process_vcf`` on every VCF in ``args.vcf``.

    :param args: Parsed command line arguments. Attributes read here: conf, output, seed,
        het, hom, addsnp, trans, snphom, generate_fqs, vcf, readdepth, test_vcf, callers,
        fqs, batch, keep.
    :raises ValueError: If both --het and --hom are given, or if more than one VCF is
        supplied to a mode that handles only one.
    """
    conf = cp.SafeConfigParser()
    conf.read(args.conf)

    # Validate before opening the output file so bad arguments never truncate it.
    if args.het and args.hom:
        raise ValueError('Specify just one of --het or --hom')

    output = sys.stdout
    owns_output = isinstance(args.output, str)
    if owns_output:
        output = open(args.output, "w")

    try:
        if args.seed is not None:
            random.seed(args.seed)

        gt_default = None
        if args.het:
            gt_default = bam_simulation.ALL_HETS
        if args.hom:
            gt_default = bam_simulation.ALL_HOMS

        snp_inf = None
        if args.addsnp:
            if args.trans:
                snp_policy = bam_simulation.TRANS
            else:
                snp_policy = bam_simulation.CIS
            # --snphom takes precedence over the cis/trans choice above.
            if args.snphom:
                snp_policy = bam_simulation.ALL_HOMS
            snp_inf = ExSNPInfo(policy=snp_policy, dist=-4)

        if args.generate_fqs:
            if len(args.vcf) > 1:
                raise ValueError('Only one VCF supported for now')
            vcf = args.vcf[0]
            fastq_prefix = util.strip_extensions(vcf, ['vcf', 'gz'])
            logging.info("Generating reads for vcf file " + vcf)
            # Encode read depth and genotype policy in the output file names.
            suffix = ".d" + str(args.readdepth)
            if gt_default == bam_simulation.ALL_HETS:
                suffix = suffix + ".het"
            elif gt_default == bam_simulation.ALL_HOMS:
                suffix = suffix + ".hom"
            fastq_prefix += suffix
            gen_reads(vcf, fastq_prefix + ".truth.vcf", fastq_prefix, snp_inf, gt_default, args.readdepth, conf)
            return

        if args.test_vcf:
            if len(args.vcf) > 1:
                raise ValueError('Only one VCF supported for now')
            process_test_vcf(args.vcf[0], args.test_vcf, output, conf)
            return

        for vcf in args.vcf:
            logging.info("Processing vcf file " + vcf)
            process_vcf(vcf, gt_default, conf, output, args.callers,
                        fqs=args.fqs,
                        snp_info=snp_inf,
                        single_batch=args.batch,
                        keep_tmpdir=args.keep,
                        read_depth=args.readdepth)
    finally:
        # Only close what we opened; never close sys.stdout.
        if owns_output:
            output.close()
def normalize_nothing(orig_vcf, conf):
    """
    Just copy the original vcf to a new, identical vcf file.

    The copy is written next to the input, named by stripping the 'vcf' extension
    from ``orig_vcf`` and appending '.nonorm.vcf'.

    :param orig_vcf: Path to the VCF file to copy
    :param conf: Configuration object, passed through to util.bgz_tabix
    :return: The result of util.bgz_tabix on the copied file
    :raises subprocess.CalledProcessError: If the copy command exits non-zero
    """
    new_vcf = util.strip_extensions(orig_vcf, ['vcf']) + '.nonorm.vcf'
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are copied verbatim.
    subprocess.check_call(['cp', orig_vcf, new_vcf])
    return util.bgz_tabix(new_vcf, conf)
def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
    """
    Compare the variants in a test VCF against those in an original VCF using every
    configured normalizer / comparator combination, then pass all results to the reporter.

    :param raw_orig_vcf: Path to the VCF containing the original variants
    :param raw_test_vcf: Path to the VCF to compare against the original
    :raises ValueError: If no original variant can be found for one of the regions
    """
    # Resolve both paths before entering the temp dir context.
    raw_orig_vcf = os.path.abspath(raw_orig_vcf)
    raw_test_vcf = os.path.abspath(raw_test_vcf)
    orig_vars = list(pysam.VariantFile(raw_orig_vcf))

    test_stem = util.strip_extensions(raw_test_vcf, ['gz','vcf'])
    tmp_dirname = test_stem + "-vcomp-" + util.randstr()

    with util.TempDir(dirname=tmp_dirname):
        orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
        no_halfcalls_vcf = util.remove_halfcalls(raw_test_vcf)
        test_vcf = util.bgz_tabix(no_halfcalls_vcf, self.conf)
        # The test VCF's name (minus extensions) stands in for the caller name.
        caller_name = util.strip_extensions(test_vcf, ['gz','vcf'])
        bed = util.vars_to_bed(orig_vars)

        var_results = defaultdict(dict)
        var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
        bamstats = defaultdict(dict)

        for norm_name, normalize in self.normalizers.iteritems():
            logging.info("--> Running normalizer " + norm_name)
            normed_orig_vcf = normalize(orig_vcf, self.conf)
            normed_caller_vcf = normalize(test_vcf, self.conf)

            for comp_name, compare in self.comparators.iteritems():
                logging.info("--> Running comparator " + comp_name + " (normalizer " + norm_name + ")")
                all_results = compare(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                single_results = split_results(all_results, bed)

                for region, region_result in zip(util.read_regions(bed), single_results):
                    match_vars = util.find_matching_var(orig_vcf, region)
                    if not match_vars:
                        raise ValueError('Unable to find original variant from region ' + str(region))

                    # Genotype of the first sample of the first matching variant, e.g. "0/1"
                    genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                    outcome = compare_single_var(region_result,
                                                 region,
                                                 normed_orig_vcf,
                                                 normed_caller_vcf,
                                                 compare,
                                                 genotype,
                                                 self.conf)

                    key = var_key(match_vars)
                    per_caller = var_results[key].setdefault(caller_name, defaultdict(dict))
                    per_caller[norm_name][comp_name] = outcome
                    bamstats[key] = {}

        # Report everything in one go once all loops have finished, rather than from
        # inside them: this keeps results organized by variant, which makes them
        # easier to look at.
        self.reporter.write_output(var_results, var_quals, bamstats)
def process_vcf(vcf, gt_default, conf, output, callers, fqs=None, snp_info=None, single_batch=False, keep_tmpdir=False, read_depth=250):
    """
    Perform analyses for each variant in the VCF file.

    :param vcf: Path to vcf file containing variants to process
    :param gt_default: Genotype policy, passed through to the processor for each batch
    :param conf: Configuration object
    :param output: Destination handed to the JsonReporter that writes the results
    :param callers: Names of the variant callers to run; None or empty means use all available callers
    :param fqs: Optional fastq paths to use as reads; converted to absolute paths
    :param snp_info: Optional extra-SNP info, passed to the processor as ex_snp
    :param single_batch: Assume all variants in VCF are part of one batch and process them all simultaneously
    :param keep_tmpdir: Preserve tmpdirs created (otherwise delete them, unless they are flagged)
    :param read_depth: Read depth, passed through to the processor for each batch
    :raises KeyError: If a requested caller name is not among the available callers
    """
    variant_callers = core_callers.get_callers()
    variant_callers.update(load_components(conf, 'callers', 'get_callers'))
    normalizers = core_norms.get_normalizers()
    normalizers.update(load_components(conf, 'normalizers', 'get_normalizers'))
    comparators = core_comps.get_comparators()
    comparators.update(load_components(conf, 'comparators', 'get_comparators'))

    # Restrict to the explicitly requested callers, if any were given.
    if callers:
        callers_to_use = {}
        for caller in callers:
            if caller not in variant_callers:
                raise KeyError('No variant caller ' + caller + ' found in callers')
            callers_to_use[caller] = variant_callers[caller]
        variant_callers = callers_to_use

    if fqs is not None:
        fqs = [os.path.abspath(fq) for fq in fqs]

    processor = bp.VariantProcessor(variant_callers, normalizers, comparators, JsonReporter(output), conf)
    logging.info("Processing variants in file " + vcf)

    # Temp dir names are derived from the input file name without directory or extensions.
    vcf_stem = util.strip_extensions(os.path.basename(vcf), ['vcf', 'gz'])

    if single_batch:
        logging.info("Processing all variants as one batch")
        tmp_dir = '{}-tmp'.format(vcf_stem)
        processor.process_batch(vcf, tmp_dir, gt_default,
                                ex_snp=snp_info,
                                keep_tmpdir=keep_tmpdir,
                                read_depth=read_depth,
                                reads=fqs)
    else:
        batches = util.batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000)
        for batchnum, batch_vcf in enumerate(batches, 1):
            logging.info('Processing batch #{} of {}'.format(batchnum, len(batches)))
            tmp_dir = '{}-batch{:03d}-tmp'.format(vcf_stem, batchnum)
            try:
                processor.process_batch(batch_vcf, tmp_dir, gt_default,
                                        ex_snp=snp_info,
                                        keep_tmpdir=keep_tmpdir,
                                        read_depth=read_depth,
                                        reads=fqs)
            finally:
                # Remove the batch file even when processing fails, so it is not left behind.
                os.remove(batch_vcf)