Пример #1
0
    def test_002_check_split(self):
        """Check that splitting the merged VCF recovers the haploid variants.

        The outputs of `split_variants` for `self.vcf_merged` are appended to
        `self.split_fps`, then paired in order with `self.vcf1` and
        `self.vcf2`. Every variant of each original VCF (bar a known
        exclusion list) must be found exactly once in the matching split
        output with identical chrom, pos, ref and alt.
        """
        self.split_fps.extend(split_variants(self.vcf_merged))

        # 1-based positions to leave out of the comparison. These are loci
        # where two variants have been merged and cannot be easily separated
        # without alignment, or where an indel has been rolled forwards by
        # the merge-then-split round trip.
        skip_hap1 = {675, 677, 1582, 1734, 1775}
        skip_hap2 = {370, 1194}
        cases = zip(
            [self.vcf1, self.vcf2], self.split_fps, [skip_hap1, skip_hap2])

        compared_fields = ('chrom', 'pos', 'ref', 'alt')
        for truth_path, split_path, skipped in cases:
            truth_reader = VCFReader(truth_path)
            split_reader = VCFReader(split_path)

            for want in truth_reader.fetch():
                # The exclusion sets are 1-based, hence the +1 on `pos`.
                one_based = want.pos + 1
                if one_based in skipped:
                    continue

                # Look up the split output over the span of the reference
                # allele of the expected variant.
                window_end = want.pos + len(want.ref) + 1
                hits = list(
                    split_reader.fetch(want.chrom, want.pos, window_end))
                self.assertEqual(
                    len(hits), 1,
                    'Could not find split variant for {}:{}.'.format(
                        want.chrom, one_based))

                found = hits[0]
                for field in compared_fields:
                    want_value = getattr(want, field)
                    found_value = getattr(found, field)
                    self.assertEqual(
                        want_value, found_value,
                        'Splitting failed for {}:{} {}.'.format(
                            want.chrom, one_based, field))
Пример #2
0
def merge_haploid_vcfs(vcf1, vcf2, vcf_out):
    """Merge SNPs from two haploid VCFs into an unphased diploid vcf.

    Every position present in either input is visited once per chromosome
    (positions in sorted order) and a single diploid record is written:

    * variant on one haplotype only (the other is absent or has alt
      ``['.']``): genotype ``0/1`` with that haplotype's alt;
    * same alt on both haplotypes: genotype ``1/1``;
    * different alts: genotype ``1/2`` with both alts concatenated.

    :param vcf1: VCF for the first haplotype, handed to `VCFReader`.
    :param vcf2: VCF for the second haplotype, handed to `VCFReader`.
    :param vcf_out: destination handed to `VCFWriter` (written as VCF 4.1).

    :raises AssertionError: if a position does not resolve to exactly one
        record per haplotype once the single-haplotype cases are ruled out.
    """
    # The helpers only depend on their arguments so they are defined once
    # here rather than being rebuilt for every locus.

    # The QC is -10*log10(1-p(label)) where p(label) is the medaka consensus
    # probability. To combine these, we probably want to multiply the
    # (1-p(label)) values, i.e. add the QC scores. However, in the case of a
    # heterozygous SNP where one of the haplotypes is the reference, we
    # won't have the QC value of the reference haplotype (no variant was
    # called).
    # Hence if we want a common scale we need to assume we can approximate
    # the missing QC score for the reference haplotype as being equal to the
    # non-reference haplotype, so we set the overall score to double the
    # latter.
    def get_gq(v1, v2):
        if len(v1) == 1 and len(v2) == 1:
            return (float(v1[0].sample_dict['GQ'])
                    + float(v2[0].sample_dict['GQ']))
        v = v1[0] if len(v1) == 1 else v2[0]
        return 2 * float(v.sample_dict['GQ'])

    def get_ref(v1, v2):
        return v1[0].ref if len(v1) == 1 else v2[0].ref

    loci_by_chrom = defaultdict(set)

    vcf1 = VCFReader(vcf1)
    vcf2 = VCFReader(vcf2)

    # First pass: collect the union of variant positions per chromosome.
    for v in chain(vcf1.fetch(), vcf2.fetch()):
        loci_by_chrom[v.chrom].add(v.pos)

    with VCFWriter(vcf_out, 'w', version='4.1') as vcf_writer:
        for chrom, loci in loci_by_chrom.items():
            for pos in sorted(loci):
                v1 = list(vcf1.fetch(ref_name=chrom, start=pos, end=pos+1))
                v2 = list(vcf2.fetch(ref_name=chrom, start=pos, end=pos+1))

                # Note we output unphased GTs as we might have multiple phased
                # regions and the phase can switch between regions

                # heterozygous on v1:
                if len(v1) == 1 and (len(v2) == 0 or v2[0].alt == ['.']):
                    alt = v1[0].alt
                    gt = '0/1'  # not 1/0 by convention since this is unphased
                # heterozygous on v2
                elif (len(v1) == 0 or v1[0].alt == ['.']) and len(v2) == 1:
                    alt = v2[0].alt
                    gt = '0/1'
                elif len(v1) != 1 or len(v2) != 1:
                    # Raised explicitly (not via `assert`) so the check is
                    # still enforced under `python -O`; the exception type
                    # is the one the original `assert` produced.
                    raise AssertionError(
                        'Expected exactly one variant per haplotype at '
                        '{}:{}, found {} and {}.'.format(
                            chrom, pos, len(v1), len(v2)))
                elif v1[0].alt == v2[0].alt:  # homozygous snp
                    alt = v1[0].alt
                    gt = '1/1'
                else:  # heterozygous snp, two different alts
                    alt = v1[0].alt + v2[0].alt
                    gt = '1/2'

                gq = get_gq(v1, v2)
                v = Variant(
                    chrom, pos, get_ref(v1, v2), alt=alt, qual=gq,
                    sample_dict={'GT': gt, 'GQ': gq})
                vcf_writer.write_variant(v)
Пример #3
0
    def test_vcf_annotate(self):
        """Check the records produced by `annotate_vcf_n_reads`.

        The annotation is run with the fixture inputs held on `self` and a
        temporary file as `vcfout`; that file is then read back and compared
        record by record against the expected variants. The expected set is
        three variants on 'MN908947.3' followed by copies of the same three
        with chrom set to "Duplicate".
        """
        expected_first = [
                Variant('MN908947.3', 29748, 'ACGATCGAGTG', alt=['A'],
                    ident='.', qual=243.965, filt='PASS',
                    info='AR=0,0;DP=200;DPS=100,100;DPSP=199;SC=19484,20327,22036,23215;SR=1,2,98,98',
                    genotype_data=OrderedDict([('GT','1'), ('GQ', '244')])),
                Variant('MN908947.3', 29764, 'TGAACAATGCT',
                    alt=['A'], ident='.', qual=243.965, filt='PASS',
                    info='AR=0,0;DP=200;DPS=100,100;DPSP=199;SC=19970,21140,15773,16751;SR=99,100,0,0',
                    genotype_data=OrderedDict([('GT','1'), ('GQ', '244')])),
                Variant('MN908947.3', 29788, 'TATATGGAAGA',
                     alt=['A'], ident='.', qual=243.965, filt='PASS',
                    info='AR=0,0;DP=199;DPS=99,100;DPSP=197;SC=26174,28129,19085,20315;SR=96,100,1,0',
                    genotype_data=OrderedDict([('GT', '1'), ('GQ','244')]))]
        # Same records again under a second contig name; built from the list
        # itself so no hard-coded indices are needed.
        expected_duplicate = deepcopy(expected_first)
        for variant in expected_duplicate:
            variant.chrom = "Duplicate"
        variants_annotated = expected_first + expected_duplicate

        with tempfile.NamedTemporaryFile() as vcfout:
            # Annotate vcf
            args = Namespace(RG=self.rg, vcf=self.vcf,ref_fasta=self.ref_fasta,
                            bam=self.bam, vcfout=vcfout.name,
                             chunk_size=100000, pad=25, dpsp=True)
            annotate_vcf_n_reads(args)

            # Read in output variants and compare with expected annotated variants
            vcf_reader = VCFReader(vcfout.name)
            got = list(vcf_reader.fetch())

            # Without this check an empty or truncated output would pass,
            # since the loop below would simply have nothing to compare.
            self.assertEqual(
                len(got), len(variants_annotated),
                'Expected {} annotated variants, found {}.'.format(
                    len(variants_annotated), len(got)))
            for i, (v, expected) in enumerate(zip(got, variants_annotated)):
                self.assertEqual(v, expected,
                                 'Annotation failed for variant {}: {} {}.'.format(i, v.chrom, v.pos))