示例#1
0
def deepvariant_header(contigs, sample_names):
    """Builds the VcfHeader that accompanies DeepVariant VCF output.

    The FILTER, INFO, FORMAT and extra header entries are fixed by this
    function. Only `contigs` and `sample_names` depend on the input data, so
    the caller has to provide them.

    Args:
      contigs: list(ContigInfo). The contigs on which variants were called.
      sample_names: list(str). The samples present in the run.

    Returns:
      A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
      together with the given contigs and sample names.
    """
    reserved_format = vcf_constants.reserved_format_field

    # Extra header line recording which DeepVariant version wrote the file.
    version_extra = variants_pb2.VcfExtra(key='DeepVariant_version',
                                          value=DEEP_VARIANT_VERSION)

    # FILTER entries: the reserved pass filter plus two DeepVariant filters.
    reference_site_filter = variants_pb2.VcfFilterInfo(
        id=DEEP_VARIANT_REF_FILTER,
        description='Genotyping model thinks this site is reference.')
    below_threshold_filter = variants_pb2.VcfFilterInfo(
        id=DEEP_VARIANT_QUAL_FILTER,
        description=('Confidence in this variant being real is below '
                     'calling threshold.'))
    header_filters = [
        vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
        reference_site_filter,
        below_threshold_filter,
    ]

    # INFO entries: only the reserved END field.
    header_infos = [vcf_constants.reserved_info_field('END')]

    # FORMAT entries: reserved fields interleaved with two custom fields.
    # The list order is the order handed to the header, so keep it stable.
    min_depth_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_MIN_DP_FORMAT,
        number='1',
        type='Integer',
        description='Minimum DP observed within the GVCF block.')
    allele_fraction_format = variants_pb2.VcfFormatInfo(
        id=DEEP_VARIANT_VAF_FORMAT,
        number='A',
        type='Float',
        description='Variant allele fractions.')
    header_formats = [
        reserved_format('GT'),
        reserved_format('GQ'),
        reserved_format('DP'),
        min_depth_format,
        reserved_format('AD'),
        allele_fraction_format,
        reserved_format('PL'),
    ]

    return variants_pb2.VcfHeader(
        fileformat='VCFv4.2',
        filters=header_filters,
        infos=header_infos,
        formats=header_formats,
        contigs=contigs,
        sample_names=sample_names,
        extras=[version_extra])
示例#2
0
    reference_pb2.ContigInfo(name='chr15', pos_in_fasta=14, n_bases=101991189),
    reference_pb2.ContigInfo(name='chr16', pos_in_fasta=15, n_bases=90338345),
    reference_pb2.ContigInfo(name='chr17', pos_in_fasta=16, n_bases=83257441),
    reference_pb2.ContigInfo(name='chr18', pos_in_fasta=17, n_bases=80373285),
    reference_pb2.ContigInfo(name='chr19', pos_in_fasta=18, n_bases=58617616),
    reference_pb2.ContigInfo(name='chr20', pos_in_fasta=19, n_bases=64444167),
    reference_pb2.ContigInfo(name='chr21', pos_in_fasta=20, n_bases=46709983),
    reference_pb2.ContigInfo(name='chr22', pos_in_fasta=21, n_bases=50818468),
    reference_pb2.ContigInfo(name='chrX', pos_in_fasta=22, n_bases=156040895),
    reference_pb2.ContigInfo(name='chrY', pos_in_fasta=23, n_bases=57227415),
    reference_pb2.ContigInfo(name='chrM', pos_in_fasta=24, n_bases=16569),
]

# pylint: disable=line-too-long
expected_samples_filters = [
    variants_pb2.VcfFilterInfo(id='PASS', description='All filters passed'),
    variants_pb2.VcfFilterInfo(id='LowQual', description='Low	quality'),
    variants_pb2.VcfFilterInfo(
        id='VQSRTrancheINDEL95.00to96.00',
        description=
        'Truth	sensitivity	tranche	level	for	INDEL	model	at	VQS	Lod:	0.9364	<=	x	<	1.0415'
    ),
    variants_pb2.VcfFilterInfo(
        id='VQSRTrancheINDEL96.00to97.00',
        description=
        'Truth	sensitivity	tranche	level	for	INDEL	model	at	VQS	Lod:	0.8135	<=	x	<	0.9364'
    ),
    variants_pb2.VcfFilterInfo(
        id='VQSRTrancheINDEL97.00to99.00',
        description=
        'Truth	sensitivity	tranche	level	for	INDEL	model	at	VQS	Lod:	0.323	<=	x	<	0.8135'
示例#3
0
    def test_writing_canned_variants(self):
        """Writes the first five canned variants and checks the VCF text.

        The variants are read from the golden TFRecord test file, written
        through VcfWriter with a hand-built header, and the resulting file is
        compared line by line against the expected header and record lines.
        """
        # The golden variants are stored in the TF record format.
        golden_tfrecord = test_utils.genomics_core_testdata(
            'test_samples.vcf.golden.tfrecord')

        options = variants_pb2.VcfWriterOptions()

        # (name, n_bases) for every contig declared in the header.
        contig_table = (
            ('chr1', 248956422),
            ('chr2', 242193529),
            ('chr3', 198295559),
            ('chrX', 156040895),
        )
        header_contigs = [
            reference_pb2.ContigInfo(name=contig_name, n_bases=contig_length)
            for contig_name, contig_length in contig_table
        ]

        # Filters declared by id only, with no description passed.
        id_only_filters = (
            'VQSRTrancheINDEL95.00to96.00',
            'VQSRTrancheINDEL96.00to97.00',
            'VQSRTrancheINDEL97.00to99.00',
            'VQSRTrancheINDEL99.00to99.50',
            'VQSRTrancheINDEL99.50to99.90',
            'VQSRTrancheINDEL99.90to99.95',
            'VQSRTrancheINDEL99.95to100.00+',
            'VQSRTrancheINDEL99.95to100.00',
            'VQSRTrancheSNP99.50to99.60',
            'VQSRTrancheSNP99.60to99.80',
            'VQSRTrancheSNP99.80to99.90',
            'VQSRTrancheSNP99.90to99.95',
            'VQSRTrancheSNP99.95to100.00+',
            'VQSRTrancheSNP99.95to100.00',
        )
        header_filters = [
            variants_pb2.VcfFilterInfo(id='PASS',
                                       description='All filters passed'),
            variants_pb2.VcfFilterInfo(id='LowQual', description=''),
        ]
        header_filters.extend(
            variants_pb2.VcfFilterInfo(id=filter_id)
            for filter_id in id_only_filters)

        header_infos = [
            variants_pb2.VcfInfo(
                id='END',
                number='1',
                type='Integer',
                description='Stop position of the interval')
        ]

        # (id, number, type, description) for every FORMAT field, in order.
        format_table = (
            ('GT', '1', 'String', 'Genotype'),
            ('GQ', '1', 'Integer', 'Genotype Quality'),
            ('DP', '1', 'Integer', 'Read depth of all passing filters reads.'),
            ('MIN_DP', '1', 'Integer',
             'Minimum DP observed within the GVCF block.'),
            ('AD', 'R', 'Integer',
             'Read depth of all passing filters reads for each allele.'),
            ('VAF', 'A', 'Float', 'Variant allele fractions.'),
            ('PL', 'G', 'Integer', 'Genotype likelihoods, Phred encoded'),
        )
        header_formats = [
            variants_pb2.VcfFormatInfo(id=format_id,
                                       number=format_number,
                                       type=format_type,
                                       description=format_description)
            for format_id, format_number, format_type, format_description
            in format_table
        ]

        vcf_header = variants_pb2.VcfHeader(
            contigs=header_contigs,
            sample_names=['NA12878_18_99'],
            filters=header_filters,
            infos=header_infos,
            formats=header_formats,
        )

        all_variants = list(
            io_utils.read_tfrecords(golden_tfrecord,
                                    proto=variants_pb2.Variant))
        # Only the first five records are written and checked.
        variants_to_write = all_variants[:5]

        output_path = test_utils.test_tmpfile('output.vcf')
        with vcf_writer.VcfWriter.to_file(output_path, vcf_header,
                                          options) as vcf_out:
            for variant in variants_to_write:
                vcf_out.write(variant)

        # Expected file text: header lines first, then the five records.
        # pylint: disable=line-too-long
        expected_lines = [
            '##fileformat=VCFv4.2\n',
            '##FILTER=<ID=PASS,Description="All filters passed">\n',
            '##FILTER=<ID=LowQual,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
            '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
            'the interval">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
            'passing filters reads.">\n',
            '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
            'observed within the GVCF block.">\n',
            '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
            'passing filters reads for each allele.">\n',
            '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
            'fractions.">\n',
            '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
            'likelihoods, Phred encoded">\n',
            '##contig=<ID=chr1,length=248956422>\n',
            '##contig=<ID=chr2,length=242193529>\n',
            '##contig=<ID=chr3,length=198295559>\n',
            '##contig=<ID=chrX,length=156040895>\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
            'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
            'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
            'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
            'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
            'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
        ]
        # pylint: enable=line-too-long

        with gfile.GFile(output_path, 'r') as written_vcf:
            written_lines = written_vcf.readlines()
        self.assertEqual(written_lines, expected_lines)
示例#4
0
# Older symbolic ALT allele, similar in meaning to the gVCF ALT allele.
SYMBOLIC_ALT_ALLELE = '<NON_REF>'

# Placeholder used in place of a field whose data is missing.
MISSING_FIELD = '.'

# Valid types for INFO and FORMAT fields, as per the VCF 4.3 spec. These are
# the values passed as the `type` of the field definitions in this module.
CHARACTER_TYPE = 'Character'
FLAG_TYPE = 'Flag'
FLOAT_TYPE = 'Float'
INTEGER_TYPE = 'Integer'
STRING_TYPE = 'String'

# Reserved FILTER field definitions. PASS is the only entry in this list.
RESERVED_FILTER_FIELDS = [
    variants_pb2.VcfFilterInfo(id='PASS', description='All filters passed'),
]

# Reserved INFO field definitions, as per the VCF 4.3 spec.
RESERVED_INFO_FIELDS = [
    variants_pb2.VcfInfo(id='AA',
                         number='1',
                         type=STRING_TYPE,
                         description='Ancestral allele'),
    variants_pb2.VcfInfo(id='AC',
                         number='A',
                         type=INTEGER_TYPE,
                         description='Allele count in genotypes, for each ALT '
                         'allele, in the same order as listed'),
    variants_pb2.VcfInfo(id='AD',
                         number='R',
示例#5
0
def deepvariant_header(contigs,
                       sample_names,
                       add_info_candidates=False,
                       include_med_dp=True):
  """Builds the VcfHeader that accompanies DeepVariant VCF output.

  The FILTER, INFO, FORMAT and extra header entries are fixed by this function,
  apart from the two optional entries controlled by the flags below. The
  `contigs` and `sample_names` values depend on the input data, so the caller
  has to provide them.

  Args:
    contigs: list(ContigInfo). The contigs on which variants were called.
    sample_names: list(str). The samples present in the run.
    add_info_candidates: If true, a 'CANDIDATES' INFO field (intended for
      debugging) is declared after END.
    include_med_dp: boolean. If true, the MED_DP FORMAT field is declared after
      all other FORMAT fields.

  Returns:
    A nucleus.genomics.v1.VcfHeader proto holding the fixed header entries
    together with the given contigs and sample names.
  """
  reserved_format = vcf_constants.reserved_format_field

  # Extra header line recording which DeepVariant version wrote the file.
  version_extra = variants_pb2.VcfExtra(
      key='DeepVariant_version', value=DEEP_VARIANT_VERSION)

  # INFO entries: END always, CANDIDATES only on request.
  header_infos = [vcf_constants.reserved_info_field('END')]
  if add_info_candidates:
    header_infos += [
        variants_pb2.VcfInfo(
            id='CANDIDATES',
            number='1',
            type=vcf_constants.STRING_TYPE,
            description='pipe-delimited candidate alleles.'),
    ]

  # FORMAT entries: reserved fields interleaved with the custom ones. The list
  # order is the order handed to the header, so keep it stable.
  min_depth_format = variants_pb2.VcfFormatInfo(
      id=DEEP_VARIANT_MIN_DP_FORMAT,
      number='1',
      type='Integer',
      description='Minimum DP observed within the GVCF block.')
  allele_fraction_format = variants_pb2.VcfFormatInfo(
      id=DEEP_VARIANT_VAF_FORMAT,
      number='A',
      type='Float',
      description='Variant allele fractions.')
  header_formats = [
      reserved_format('GT'),
      reserved_format('GQ'),
      reserved_format('DP'),
      min_depth_format,
      reserved_format('AD'),
      allele_fraction_format,
      reserved_format('PL'),
  ]
  if include_med_dp:
    header_formats += [
        variants_pb2.VcfFormatInfo(
            id=DEEP_VARIANT_MED_DP_FORMAT,
            number='1',
            type='Integer',
            description=('Median DP observed within the GVCF block '
                         'rounded to the nearest integer.')),
    ]

  # FILTER entries: the reserved pass filter plus three DeepVariant filters.
  header_filters = [
      vcf_constants.reserved_filter_field(DEEP_VARIANT_PASS),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_REF_FILTER,
          description='Genotyping model thinks this site is reference.'),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_QUAL_FILTER,
          description=('Confidence in this variant being real is below '
                       'calling threshold.')),
      variants_pb2.VcfFilterInfo(
          id=DEEP_VARIANT_NO_CALL,
          description='Site has depth=0 resulting in no call.'),
  ]

  return variants_pb2.VcfHeader(
      fileformat='VCFv4.2',
      filters=header_filters,
      infos=header_infos,
      formats=header_formats,
      contigs=contigs,
      sample_names=sample_names,
      extras=[version_extra])