Пример #1
0
def main(args):
    """
    Respond to command line args, check for basic config errors, and perform analyses.

    Argument combinations are validated before any file is opened, so an invalid invocation
    cannot truncate an existing output file. An output file opened here is always closed again.

    :param args: Parsed command line arguments. Attributes read here: conf, output, seed, het, hom,
        addsnp, trans, snphom, generate_fqs, vcf, readdepth, test_vcf, callers, fqs, batch, keep.
    :raises ValueError: If both --het and --hom are given, or if more than one VCF is supplied
        together with generate_fqs or test_vcf.
    """
    # Fail fast: these checks need nothing but the parsed arguments.
    if args.het and args.hom:
        raise ValueError('Specify just one of --het or --hom')
    if (args.generate_fqs or args.test_vcf) and len(args.vcf) > 1:
        raise ValueError('Only one VCF supported for now')

    conf = cp.SafeConfigParser()
    conf.read(args.conf)

    if args.seed is not None:
        random.seed(args.seed)

    # het and hom are mutually exclusive at this point, so at most one branch applies.
    gt_default = None
    if args.het:
        gt_default = bam_simulation.ALL_HETS
    elif args.hom:
        gt_default = bam_simulation.ALL_HOMS

    snp_inf = None
    if args.addsnp:
        # snphom takes precedence over the cis/trans choice.
        if args.snphom:
            snp_policy = bam_simulation.ALL_HOMS
        elif args.trans:
            snp_policy = bam_simulation.TRANS
        else:
            snp_policy = bam_simulation.CIS
        snp_inf = ExSNPInfo(policy=snp_policy, dist=-4)

    # Only a handle opened here is ours to close; sys.stdout must stay open.
    owns_output = isinstance(args.output, str)
    output = open(args.output, "w") if owns_output else sys.stdout
    try:
        if args.generate_fqs:
            vcf = args.vcf[0]
            logging.info("Generating reads for vcf file " + vcf)
            suffix = ".d" + str(args.readdepth)
            if gt_default == bam_simulation.ALL_HETS:
                suffix = suffix + ".het"
            elif gt_default == bam_simulation.ALL_HOMS:
                suffix = suffix + ".hom"
            fastq_prefix = util.strip_extensions(vcf, ['vcf','gz']) + suffix
            gen_reads(vcf, fastq_prefix + ".truth.vcf", fastq_prefix, snp_inf, gt_default, args.readdepth, conf)
            return

        if args.test_vcf:
            process_test_vcf(args.vcf[0], args.test_vcf, output, conf)
            return

        for vcf in args.vcf:
            logging.info("Processing vcf file " + vcf)
            process_vcf(vcf, gt_default, conf, output, args.callers, fqs=args.fqs, snp_info=snp_inf,
                        single_batch=args.batch, keep_tmpdir=args.keep, read_depth=args.readdepth)
    finally:
        if owns_output:
            output.close()
Пример #2
0
def normalize_nothing(orig_vcf, conf):
    """
    Just copy the original vcf to a new, identical vcf file.

    :param orig_vcf: Path to the vcf file to copy
    :param conf: Configuration object, passed through to util.bgz_tabix
    :return: The result of util.bgz_tabix for the copied file
    :raises subprocess.CalledProcessError: If the cp command exits with a non-zero status
    """
    new_vcf = util.strip_extensions(orig_vcf, ['vcf']) + '.nonorm.vcf'
    # An argument list (no shell) copies paths containing spaces or shell
    # metacharacters verbatim instead of having the shell interpret them.
    subprocess.check_call(['cp', orig_vcf, new_vcf])
    return util.bgz_tabix(new_vcf, conf)
Пример #3
0
    def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
        """
        Compare the calls in an existing test VCF against the variants of the original VCF and report.

        Every normalizer in self.normalizers is applied to both files, every comparator in
        self.comparators is run on each normalized pair, and the per-variant results are passed
        to self.reporter.write_output.

        :param raw_orig_vcf: Path to the VCF containing the original variants
        :param raw_test_vcf: Path to the VCF containing the calls to evaluate
        :raises ValueError: If no original variant can be found for one of the regions
        """
        # Resolve paths before entering the temp dir (presumably TempDir changes the
        # working directory -- TODO confirm against util.TempDir).
        original_path = os.path.abspath(raw_orig_vcf)
        test_path = os.path.abspath(raw_test_vcf)
        original_variants = list(pysam.VariantFile(original_path))
        workdir_stem = util.strip_extensions(test_path, ['gz','vcf'])
        workdir = workdir_stem + "-vcomp-" + util.randstr()

        per_variant = defaultdict(dict)
        bam_stats = defaultdict(dict)

        with util.TempDir(dirname=workdir):
            orig_vcf = util.bgz_tabix(original_path, self.conf)
            test_vcf = util.bgz_tabix(util.remove_halfcalls(test_path), self.conf)
            caller_name = util.strip_extensions(test_vcf, ['gz','vcf'])
            bed = util.vars_to_bed(original_variants)
            var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)

            for norm_name, normalize in self.normalizers.iteritems():
                logging.info("--> Running normalizer " + norm_name)
                normed_orig = normalize(orig_vcf, self.conf)
                normed_test = normalize(test_vcf, self.conf)

                for comp_name, compare in self.comparators.iteritems():
                    logging.info("--> Running comparator " + comp_name + " (normalizer " + norm_name + ")")
                    combined = compare(normed_orig, normed_test, None, self.conf)
                    per_region = split_results(combined, bed)

                    for region, region_result in zip(util.read_regions(bed), per_region):
                        matches = util.find_matching_var(orig_vcf, region)
                        if not matches:
                            raise ValueError('Unable to find original variant from region ' + str(region))

                        # Genotype of the first sample of the first matching variant, e.g. "0/1"
                        genotype = "/".join(str(allele) for allele in matches[0].samples[0]['GT'])
                        outcome = compare_single_var(region_result, region, normed_orig, normed_test,
                                                     compare, genotype, self.conf)

                        key = var_key(matches)
                        by_normalizer = per_variant[key].setdefault(caller_name, defaultdict(dict))
                        by_normalizer[norm_name][comp_name] = outcome
                        bam_stats[key] = {}

        # Reporting happens once, after all loops, so that the output stays grouped by variant,
        # which makes it easier to read.
        self.reporter.write_output(per_variant, var_quals, bam_stats)
Пример #4
0
def process_vcf(vcf, gt_default, conf, output, callers, fqs=None, snp_info=None, single_batch=False, keep_tmpdir=False, read_depth=250):
    """
    Perform analyses for each variant in the VCF file.

    :param vcf: Path to vcf file containing variants to process
    :param gt_default: Default genotype setting, passed through to the processor (may be None)
    :param conf: Configuration object
    :param output: Output object handed to JsonReporter
    :param callers: Names of the variant callers to run; None or empty to run every available caller
    :param fqs: Optional fastq paths; made absolute and passed to the processor as reads
    :param snp_info: Optional extra-SNP description, passed to the processor as ex_snp
    :param single_batch: Assume all variants in VCF are part of one batch and process them all simultaneously
    :param keep_tmpdir: Preserve tmpdirs created (otherwise delete them, unless they are flagged)
    :param read_depth: Read depth passed through to the processor
    :raises KeyError: If a requested caller is not among the available callers
    """

    variant_callers = core_callers.get_callers()
    variant_callers.update(load_components(conf, 'callers', 'get_callers'))

    normalizers = core_norms.get_normalizers()
    normalizers.update(load_components(conf, 'normalizers', 'get_normalizers'))

    comparators = core_comps.get_comparators()
    comparators.update(load_components(conf, 'comparators', 'get_comparators'))

    if callers:
        # Restrict the run to the explicitly requested callers.
        callers_to_use = {}
        for caller in callers:
            if caller not in variant_callers:
                raise KeyError('No variant caller ' + caller + ' found in callers')
            callers_to_use[caller] = variant_callers[caller]
        variant_callers = callers_to_use

    if fqs is not None:
        fqs = [os.path.abspath(fq) for fq in fqs]

    processor = bp.VariantProcessor(variant_callers, normalizers, comparators, JsonReporter(output), conf)
    logging.info("Processing variants in file " + vcf)
    tmp_prefix = util.strip_extensions(os.path.basename(vcf), ['vcf','gz'])
    if single_batch:
        logging.info("Processing all variants as one batch")
        tmp_dir = '{}-tmp'.format(tmp_prefix)
        processor.process_batch(vcf, tmp_dir, gt_default, ex_snp=snp_info, keep_tmpdir=keep_tmpdir, read_depth=read_depth, reads=fqs)
    else:
        batches = util.batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000)
        try:
            for batchnum, batch_vcf in enumerate(batches, 1):
                logging.info('Processing batch #{} of {}'.format(batchnum, len(batches)))
                tmp_dir = '{}-batch{:03d}-tmp'.format(tmp_prefix, batchnum)
                processor.process_batch(batch_vcf, tmp_dir, gt_default, ex_snp=snp_info, keep_tmpdir=keep_tmpdir, read_depth=read_depth, reads=fqs)
                os.remove(batch_vcf)
        finally:
            # If a batch failed, the failing and all not-yet-processed batch files are
            # still on disk; remove them so an aborted run leaves no temporary VCFs behind.
            for leftover in batches:
                if os.path.exists(leftover):
                    os.remove(leftover)