示例#1
0
def gen_alt_fq(ref, variant_sets, read_count, dest_prefix="input"):
    """
    Generate a batch of simulated paired reads, independently for the variants in each variant set.

    Each set contains a list of variants and a policy describing cis / trans configuration.
    For every set, collect_alts() yields two haplotypes; a temporary alternate genome is built
    for each haplotype and half of read_count is requested from each. All reads are appended to
    the same pair of fastq files.

    :param ref: Path to reference fasta
    :param variant_sets: List of sets of variants to inject into reference. Each set is indexed
        with 'vars', and the chromosome is taken from the first variant of the set.
    :param read_count: Read count handed to generate_reads(), split in half between the two haplotypes
    :param dest_prefix: Prefix of the output files, which are named <prefix>_R1.fq and <prefix>_R2.fq
    :return: Tuple (reads1, reads2) with the paths of the two fastq files that were written
    """
    reads1 = dest_prefix + "_R1.fq"
    reads2 = dest_prefix + "_R2.fq"
    # NOTE: scaling read_count by 1.25 (so that depth in IGV is about read_count) was tried here
    # previously and is currently disabled.

    # The context manager closes both fastq handles even if read generation fails part-way.
    with open(reads1, "w") as read1_fh, open(reads2, "w") as read2_fh:
        for vset in variant_sets:
            chrom = vset['vars'][0].chrom
            hap1, hap2 = collect_alts(vset)

            for haplotype in (hap1, hap2):
                alt_genome = 'alt_genome' + util.randstr() + '.fa'
                try:
                    alt_genome_size = gen_alt_genome(chrom, haplotype, ref, alt_genome, overwrite=True)
                    generate_reads(alt_genome, chrom, alt_genome_size / 2, read_count=read_count / 2,
                                   read1_fh=read1_fh, read2_fh=read2_fh)
                finally:
                    # Always clean up the temporary genome and its index; the existence check
                    # keeps a failure above from being masked by a secondary OSError here.
                    for tmp_path in (alt_genome, alt_genome + ".fai"):
                        if os.path.exists(tmp_path):
                            os.remove(tmp_path)

    return (reads1, reads2)
示例#2
0
def call_variant_varscan(bam, orig_genome_path, bed, conf):
    """
    Run samtools mpileup on the bam, then VarScan mpileup2cns on the resulting pileup.

    :param bam: Path to the input bam file
    :param orig_genome_path: Path to the reference fasta, passed to samtools via -f
    :param bed: Optional path to a bed file of regions (passed via -l); None to omit it
    :param conf: Configuration providing 'samtools_path' and 'varscan_path' in section 'main'
    :return: Whatever util.bgz_tabix() returns for the VCF that was written
    """
    pileup_path = "varscan." + util.randstr() + ".mpileup"
    vcf_path = "output-vs." + util.randstr() + ".vcf"
    region_opt = "" if bed is None else " -l " + bed

    # Both commands are single strings executed through the shell.
    pileup_cmd = "".join([
        conf.get("main", "samtools_path"),
        " mpileup ",
        " -f ",
        orig_genome_path,
        " -o ",
        pileup_path,
        " ",
        region_opt,
        " ",
        bam,
    ])
    subprocess.check_call(pileup_cmd, shell=True)

    varscan_cmd = "".join([
        "java -Xmx2g -jar ",
        conf.get("main", "varscan_path"),
        " mpileup2cns ",
        pileup_path,
        " --variants --output-vcf 1 --output-file ",
        vcf_path,
    ])
    # The VCF text is taken from VarScan's standard output and written out here.
    vcf_text = subprocess.check_output(varscan_cmd, shell=True)
    with open(vcf_path, "w") as vcf_fh:
        vcf_fh.write(vcf_text)
    return util.bgz_tabix(vcf_path, conf)
示例#3
0
def call_variant_mp_bcf(bam, genome, bed, conf):
    """
    Call variants by running samtools mpileup followed by bcftools call.

    The commands are passed to subprocess as argument lists: a single command string without
    shell=True is treated on POSIX as the name of the program to execute, so it cannot be run.

    :param bam: Path to the input bam file
    :param genome: Path to the reference fasta, passed to samtools via -f
    :param bed: Optional path to a bed file of regions (passed via -l); None to omit it
    :param conf: Configuration providing 'samtools' and 'bcftools' in section 'main'
    :return: Whatever util.bgz_tabix() returns for the called VCF
    :raises subprocess.CalledProcessError: if either tool exits with a non-zero status
    """
    pre_output = "mpileup." + util.randstr() + ".vcf"
    vcfoutput = "output-mp." + util.randstr() + ".vcf"

    mpileup_cmd = [conf.get('main', 'samtools'), "mpileup", "-f", genome, "-uv", "-o", pre_output]
    if bed is not None:
        mpileup_cmd += ["-l", bed]
    mpileup_cmd.append(bam)
    subprocess.check_call(mpileup_cmd)

    call_cmd = [conf.get('main', 'bcftools'), "call", "-mv", "-o", vcfoutput, pre_output]
    subprocess.check_call(call_cmd)
    return util.bgz_tabix(vcfoutput, conf)
示例#4
0
def call_variant_varscan_emit_all(bam, genome, bed, conf):
    """
    Run samtools mpileup, then VarScan mpileup2cns with --p-value 0.5, and compress/index the VCF.

    The commands are passed to subprocess as argument lists: a single command string without
    shell=True is treated on POSIX as the name of the program to execute, so it cannot be run.

    :param bam: Path to the input bam file
    :param genome: Path to the reference fasta, passed to samtools via -f
    :param bed: Optional path to a bed file of regions (passed via -l); None to omit it
    :param conf: Configuration providing 'samtools' and 'varscan' in section 'main'
    :return: Whatever util.bgz_tabix() returns for the VCF that was written
    :raises subprocess.CalledProcessError: if either command exits with a non-zero status
    """
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"

    mpileup_cmd = [conf.get('main', 'samtools'), "mpileup", "-f", genome, "-o", pre_output]
    if bed is not None:
        mpileup_cmd += ["-l", bed]
    mpileup_cmd.append(bam)
    subprocess.check_call(mpileup_cmd)

    varscan_cmd = [
        "java", "-Xmx2g", "-jar", conf.get('main', 'varscan'),
        "mpileup2cns", pre_output,
        "--p-value", "0.5", "--variants", "--output-vcf", "1", "--output-file", vcfoutput,
    ]
    # Standard output of VarScan is redirected into the VCF file; the context manager makes sure
    # the handle is flushed and closed before the file is compressed and indexed.
    with open(vcfoutput, 'w') as vcf_fh:
        subprocess.check_call(varscan_cmd, stdout=vcf_fh)
    return util.bgz_tabix(vcfoutput, conf)
示例#5
0
def normalize_bcftools(orig_vcf, conf):
    """
    Normalize a VCF with 'bcftools norm'.

    :param orig_vcf: Path of the VCF to normalize; a '.vcf.gz' occurrence in it is replaced to
        derive the output file name
    :param conf: Configuration providing 'bcftools' and 'ref_genome' in section 'main'
    :return: Whatever util.bgz_tabix() returns for the normalized VCF
    """
    out_suffix = ".norm.bcftools" + util.randstr() + ".vcf"
    normalized_vcf = orig_vcf.replace(".vcf.gz", out_suffix)

    command_line = "".join([
        conf.get('main', 'bcftools'),
        " norm ",
        " -c w -f ",
        conf.get('main', 'ref_genome'),
        " ",
        orig_vcf,
        " -o ",
        normalized_vcf,
    ])
    # The command line is tokenised on whitespace before being executed without a shell.
    argv = command_line.split()
    subprocess.check_call(argv)
    return util.bgz_tabix(normalized_vcf, conf)
示例#6
0
def call_variant_mp_bcf(bam, orig_genome_path, bed, conf):
    """
    Call variants by running samtools mpileup followed by bcftools call, both through the shell.

    :param bam: Path to the input bam file
    :param orig_genome_path: Path to the reference fasta, passed to samtools via -f
    :param bed: Optional path to a bed file of regions (passed via -l); None to omit it
    :param conf: Configuration providing 'samtools_path' and 'bcftools_path' in section 'main'
    :return: Whatever util.bgz_tabix() returns for the called VCF
    """
    pileup_vcf = "mpileup." + util.randstr() + ".vcf"
    called_vcf = "output-mp." + util.randstr() + ".vcf"
    region_opt = "" if bed is None else " -l " + bed

    pileup_cmd = "".join([
        conf.get("main", "samtools_path"),
        " mpileup ",
        " -f ",
        orig_genome_path,
        " -uv ",
        " -o ",
        pileup_vcf,
        " ",
        region_opt,
        " ",
        bam,
    ])
    subprocess.check_call(pileup_cmd, shell=True)

    call_cmd = "".join([
        conf.get("main", "bcftools_path"),
        " call ",
        " -mv ",
        " -o ",
        called_vcf,
        " ",
        pileup_vcf,
    ])
    subprocess.check_call(call_cmd, shell=True)
    return util.bgz_tabix(called_vcf, conf)
示例#7
0
def call_variant_rtg(bam, orig_genome_path, bed, conf):
    """
    Call variants with the RTG 'snp' command.

    :param bam: Path to the input bam file
    :param orig_genome_path: Not used here; the reference comes from 'rtg_ref_sdf' in conf
    :param bed: Optional path to a bed file restricting the regions called (--bed-regions).
        None omits the option, matching how the other callers treat a missing bed file.
    :param conf: Configuration providing 'rtg_jar' and 'rtg_ref_sdf' in section 'main'
    :return: Path '<output_dir>/snps.vcf.gz' inside the freshly named RTG output directory
    :raises subprocess.CalledProcessError: if the command exits with a non-zero status
    """
    output_dir = "rtg-output-" + util.randstr()
    vcfoutput = output_dir + "/snps.vcf.gz"
    cmd = [
        "java",
        "-Djava.io.tmpdir=.",
        "-jar",
        conf.get("main", "rtg_jar"),
        "snp",
        "-t",
        conf.get("main", "rtg_ref_sdf"),
    ]
    # A None entry in the argument list would make subprocess raise TypeError, so the
    # region restriction is only added when a bed file was actually supplied.
    if bed is not None:
        cmd += ["--bed-regions", bed]
    cmd += ["-o", output_dir, bam]
    # Output of the command is captured and discarded.
    subprocess.check_output(cmd)
    return vcfoutput
示例#8
0
def call_variant_rtg(bam, genome, bed, conf):
    """
    Call variants with the RTG 'snp' command, discarding the tool's console output.

    :param bam: Path to the input bam file
    :param genome: Not used here; the reference comes from 'rtg_ref_sdf' in conf
    :param bed: Optional path to a bed file restricting the regions called (--bed-regions).
        None omits the option, matching how the other callers treat a missing bed file.
    :param conf: Configuration providing 'rtg_jar' and 'rtg_ref_sdf' in section 'main'
    :return: Path '<output_dir>/snps.vcf.gz' inside the freshly named RTG output directory
    :raises subprocess.CalledProcessError: if the command exits with a non-zero status
    """
    output_dir = "rtg-output-" + util.randstr()
    vcfoutput = output_dir + "/snps.vcf.gz"
    cmd = ["java", "-Djava.io.tmpdir=.", "-jar", conf.get('main', 'rtg_jar'), "snp", "-t", conf.get('main', 'rtg_ref_sdf')]
    # A None entry in the argument list would make subprocess raise TypeError, so the
    # region restriction is only added when a bed file was actually supplied.
    if bed is not None:
        cmd += ["--bed-regions", bed]
    cmd += ["-o", output_dir, bam]
    # /dev/null must be opened for writing to serve as the child's stdout/stderr, and the
    # handle is closed again once the command has finished.
    with open('/dev/null', 'w') as devnull:
        subprocess.check_call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
    return vcfoutput
示例#9
0
    def process_batch(self, vcf, batchname, gt_policy, ex_snp=None, keep_tmpdir=False, read_depth=250, reads=None):
        """
        Process the given batch of variants: build the input VCF and variant sets, simulate reads (unless
        reads are supplied), align them into a bam file, run the variant callers, then run every
        normalizer / comparator combination on each caller's output. The collected results are handed to
        self.reporter at the end, organized by variant.

        :param vcf: .vcf file containing variants to simulate
        :param batchname: Used as the prefix of the temporary working directory name
        :param gt_policy: Passed through to create_input_vcf together with ex_snp
        :param ex_snp: Passed through to create_input_vcf (defaults to None)
        :param keep_tmpdir: If true the temporary directory is created with the NEVER_DELETE policy,
            otherwise with DELETE_NO_EXCEPTION
        :param read_depth: Passed to bam_simulation.gen_alt_fq as its read count when reads are simulated
        :param reads: Existing reads to hand to gen_alt_bam; when None, reads are simulated
        :return: None; results are written through self.reporter.write_output
        :raises ValueError: if no original variant can be found for one of the bed regions
        """
        raw_vars = list(pysam.VariantFile(vcf))

        if keep_tmpdir:
            del_policy = util.TempDir.NEVER_DELETE
        else:
            del_policy = util.TempDir.DELETE_NO_EXCEPTION

        workdir_name = batchname + "-" + util.randstr()
        with util.TempDir(dirname=workdir_name, del_policy=del_policy):
            ref_path = self.conf.get('main', 'ref_genome')
            var_results = defaultdict(dict)

            orig_vcf, variant_sets = self.create_input_vcf(raw_vars, ex_snp, gt_policy)
            bed = util.vars_to_bed(variant_sets)
            if reads is None:
                reads = bam_simulation.gen_alt_fq(ref_path, variant_sets, read_depth)
            bam = bam_simulation.gen_alt_bam(ref_path, self.conf, reads)

            caller_variants = self.call_variants(bam, bed)
            bam_stats = self.collect_bam_stats(bam, bed, orig_vcf)
            var_quals = self.collect_var_quals(caller_variants, bed, orig_vcf)

            for norm_name, normalize in self.normalizers.iteritems():
                logging.info("--> Running normalizer " + norm_name)
                normed_orig_vcf = normalize(orig_vcf, self.conf)

                for caller in caller_variants:
                    raw_caller_vcf = caller_variants[caller]
                    normed_caller_vcf = normalize(raw_caller_vcf, self.conf)

                    for cmp_name, compare in self.comparators.iteritems():
                        logging.info("--> Running comparator " + cmp_name + " (normalizer " + norm_name + ")")
                        all_results = compare(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                        per_region_results = split_results(all_results, bed)
                        # Regions from the bed file are paired positionally with the split results
                        for region, region_result in zip(util.read_regions(bed), per_region_results):
                            match_vars = util.find_matching_var(orig_vcf, region)
                            if not match_vars:
                                raise ValueError('Unable to find original variant from region ' + str(region))
                            # Genotype of the first matching variant's first sample, e.g. joined as "0/1"
                            genotype = "/".join(str(allele) for allele in match_vars[0].samples[0]['GT'])
                            outcome = compare_single_var(region_result,
                                                         region,
                                                         normed_orig_vcf,
                                                         normed_caller_vcf,
                                                         compare,
                                                         genotype,
                                                         self.conf)
                            key = var_key(match_vars)
                            results_for_var = var_results[key]
                            if caller not in results_for_var:
                                results_for_var[caller] = defaultdict(dict)
                            results_for_var[caller][norm_name][cmp_name] = outcome
            # Reporting happens once, after all loops have finished, rather than inside them:
            # this keeps results organized by variant, which makes them easier to look at
            self.reporter.write_output(var_results, var_quals, bam_stats)
示例#10
0
    def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
        """
        Compare an existing test VCF against an original VCF using every configured normalizer and
        comparator, then report the per-variant results through self.reporter.

        :param raw_orig_vcf: Path to the VCF holding the original (expected) variants
        :param raw_test_vcf: Path to the VCF to evaluate; its name, stripped of 'gz' / 'vcf'
            extensions, also names the temporary directory and (after processing) the caller
        :return: None; results are written through self.reporter.write_output
        :raises ValueError: if no original variant can be found for one of the bed regions
        """
        # Both paths are made absolute before the temporary directory is entered
        # (presumably util.TempDir changes the working directory -- TODO confirm)
        raw_orig_vcf = os.path.abspath(raw_orig_vcf)
        raw_test_vcf = os.path.abspath(raw_test_vcf)
        orig_vars = list(pysam.VariantFile(raw_orig_vcf))
        workdir_name = util.strip_extensions(raw_test_vcf, ['gz', 'vcf']) + "-vcomp-" + util.randstr()

        with util.TempDir(dirname=workdir_name):
            orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
            cleaned_test_vcf = util.remove_halfcalls(raw_test_vcf)
            test_vcf = util.bgz_tabix(cleaned_test_vcf, self.conf)
            caller_name = util.strip_extensions(test_vcf, ['gz', 'vcf'])
            bed = util.vars_to_bed(orig_vars)
            var_results = defaultdict(dict)
            var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
            bamstats = defaultdict(dict)

            for norm_name, normalize in self.normalizers.iteritems():
                logging.info("--> Running normalizer " + norm_name)
                normed_orig_vcf = normalize(orig_vcf, self.conf)
                normed_caller_vcf = normalize(test_vcf, self.conf)

                for cmp_name, compare in self.comparators.iteritems():
                    logging.info("--> Running comparator " + cmp_name + " (normalizer " + norm_name + ")")
                    all_results = compare(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    per_region_results = split_results(all_results, bed)
                    # Regions from the bed file are paired positionally with the split results
                    for region, region_result in zip(util.read_regions(bed), per_region_results):
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))
                        # Genotype of the first matching variant's first sample, joined with "/"
                        genotype = "/".join(str(allele) for allele in match_vars[0].samples[0]['GT'])
                        outcome = compare_single_var(region_result,
                                                     region,
                                                     normed_orig_vcf,
                                                     normed_caller_vcf,
                                                     compare,
                                                     genotype,
                                                     self.conf)
                        key = var_key(match_vars)
                        results_for_var = var_results[key]
                        if caller_name not in results_for_var:
                            results_for_var[caller_name] = defaultdict(dict)
                        results_for_var[caller_name][norm_name][cmp_name] = outcome
                        # No bam is involved here, so every variant gets an empty stats entry
                        bamstats[key] = {}

        # Reporting happens once, after all loops have finished, rather than inside them:
        # this keeps results organized by variant, which makes them easier to look at
        self.reporter.write_output(var_results, var_quals, bamstats)