def test_unicode_fields(self):
     """Check that unicode and UTF-8 encoded strings give unicode row values.

     The variant mixes a unicode string and its UTF-8 encoding in the
     filters and in two info fields; the expected row holds only the
     unicode form.
     """
     unicode_text = u'\xc3\xb6'
     utf8_text = unicode_text.encode('utf-8')
     info = {
         'AS1': vcfio.VariantInfo(utf8_text, '1'),
         'AS2': vcfio.VariantInfo([unicode_text, utf8_text], '2'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], filters=[unicode_text, utf8_text], info=info)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: [unicode_text, unicode_text],
         ColumnKeyConstants.CALLS: [],
         'AS1': unicode_text,
         'AS2': [unicode_text, unicode_text],
     }
     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
 def test_nonstandard_float_values(self):
     """Check the row produced for inf, -inf and nan info values.

     The expected row maps inf to sys.maxint, -inf to -sys.maxint, a nan
     inside a repeated field to -sys.maxint and a scalar nan to None.
     """
     max_int = sys.maxint
     info = {
         'F1': vcfio.VariantInfo(float('inf'), '1'),
         'F2': vcfio.VariantInfo([float('-inf'), float('nan'), 1.2], '3'),
         'F3': vcfio.VariantInfo(float('nan'), '1'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], filters=[], info=info)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.CALLS: [],
         'F1': max_int,
         # The nan element of a repeated field is expected as -sys.maxint.
         'F2': [-max_int, -max_int, 1.2],
         'F3': None,
     }
     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
 def test_no_alternate_bases(self):
     """Check the row produced for a variant with an empty alternate list."""
     info = {
         'A1': vcfio.VariantInfo('some data', '1'),
         'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], filters=['q10'], info=info)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['q10'],
         ColumnKeyConstants.CALLS: [],
         'A1': 'some data',
         'A2': ['data1', 'data2'],
     }
     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
 def test_null_repeated_fields(self):
     """Check the replacement values expected for None in repeated fields.

     Per the expected row: None becomes -sys.maxint in the integer and
     float lists, False in the boolean list and '.' in the string list.
     """
     info = {
         'AI': vcfio.VariantInfo([0, 1, None], '3'),
         'AB': vcfio.VariantInfo([True, None, False], '3'),
         'AF': vcfio.VariantInfo([0.1, 0.2, None, 0.4], '4'),
         'AS': vcfio.VariantInfo([None, 'data1', 'data2'], '3'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], filters=['q10'], info=info)
     null_number = -sys.maxint
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['q10'],
         ColumnKeyConstants.CALLS: [],
         'AI': [0, 1, null_number],
         'AB': [True, False, False],
         'AF': [0.1, 0.2, null_number, 0.4],
         'AS': ['.', 'data1', 'data2'],
     }
     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
 def test_schema_conflict_in_info_field_number(self):
     """Check row values when info data does not match the schema's count.

     The variant supplies lists where the expected row has scalars and
     scalars where the expected row has one-element lists; rows are
     generated with allow_incompatible_records=True.
     """
     info = {
         'IB': vcfio.VariantInfo(data=[1, 2], field_count='2'),
         'IBR': vcfio.VariantInfo(data=1, field_count='1'),
         'II': vcfio.VariantInfo(data=[10, 20], field_count='2'),
         'IF': vcfio.VariantInfo(data=1.1, field_count='1'),
         'IS': vcfio.VariantInfo(data='foo', field_count='1'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], filters=[], info=info)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.CALLS: [],
         'IB': True,
         'IBR': [True],
         'II': 10,
         'IF': [1.1],
         'IS': ['foo'],
     }
     actual_rows = self._get_row_list_from_variant(
         variant, self._schema_descriptor, allow_incompatible_records=True)
     self.assertEqual([expected_row], actual_rows)
 def _get_sample_variant(self):
     """Return a sample variant with two alternates, two names and two calls."""
     info = {
         'A1': vcfio.VariantInfo('some data', '1'),
         'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
     }
     calls = [
         vcfio.VariantCall(
             name='Sample1', genotype=[0, 1],
             info={'GQ': 20, 'HQ': [10, 20]}),
         vcfio.VariantCall(
             name='Sample2', genotype=[1, 0],
             info={'GQ': 10, 'FLAG1': True}),
     ]
     return vcfio.Variant(
         reference_name='19', start=11, end=12, reference_bases='C',
         alternate_bases=['A', 'TT'], names=['rs1', 'rs2'], quality=2,
         filters=['PASS'], info=info, calls=calls)
Пример #7
0
def _get_sample_variant_1():
    """Return the first sample variant together with its VCF text line.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A (variant, vcf_line) tuple.
    """
    vcf_line = ('20	1234	rs123;rs2	C	A,T	50	PASS	AF=0.5,0.1;NS=1	'
                'GT:GQ	0/0:48	1/0:20\n')
    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(
        reference_name='20', start=1233, end=1234, reference_bases='C',
        alternate_bases=['A', 'T'], names=['rs123', 'rs2'], quality=50,
        filters=['PASS'], info=info)
    sample_calls = (
        vcfio.VariantCall(name='Sample1', genotype=[0, 0], info={'GQ': 48}),
        vcfio.VariantCall(name='Sample2', genotype=[1, 0], info={'GQ': 20}),
    )
    for call in sample_calls:
        variant.calls.append(call)
    return variant, vcf_line
Пример #8
0
    def test_get_merged_variants_move_info_to_calls(self):
        """Check that info keys matching '^A1$' are copied into each call.

        Covers merging a single variant and merging all sample variants; in
        the latter case only 'A2' and 'A3' are expected to remain in the
        merged variant's info.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            info_keys_to_move_to_calls_regex='^A1$',
            copy_quality_to_calls=False,
            copy_filter_to_calls=False)
        variants = self._get_sample_variants()

        def expected_calls_of_first_variant():
            # Fresh objects on every call so the two assertions share nothing.
            return [
                vcfio.VariantCall(
                    name='Sample1', genotype=[0, 1],
                    info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'}),
                vcfio.VariantCall(
                    name='Sample2', genotype=[1, 0],
                    info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'}),
            ]

        # Test single variant merge.
        first_only = [variants[0]]
        single_merged_variant = list(strategy.get_merged_variants(first_only))[0]
        self.assertEqual(expected_calls_of_first_variant(),
                         single_merged_variant.calls)

        # Test multiple variant merge.
        merged_variant = list(strategy.get_merged_variants(variants))[0]
        self._assert_common_expected_merged_fields(merged_variant)
        expected_calls_of_second_variant = [
            vcfio.VariantCall(
                name='Sample3', genotype=[1, 1], info={'A1': 'some data2'}),
            vcfio.VariantCall(
                name='Sample4', genotype=[1, 0],
                info={'GQ': 20, 'A1': 'some data2'}),
        ]
        self.assertEqual(
            expected_calls_of_first_variant() + expected_calls_of_second_variant,
            merged_variant.calls)
        self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys())
        expected_a2 = vcfio.VariantInfo(['data1', 'data2'], '2')
        expected_a3 = vcfio.VariantInfo(['data3', 'data4'], '2')
        self.assertEqual(expected_a2, merged_variant.info['A2'])
        self.assertEqual(expected_a3, merged_variant.info['A3'])
    def test_schema_conflict_in_info_field_type(self):
        """Check row values when info data types differ from the expected ones.

        First part: with allow_incompatible_records=True the numeric values
        are expected in the row as shown in `expected_row` (e.g. 1 -> True,
        1.1 -> 1, [1, 2] -> [1.0, 2.0], [1.0, 2.0] -> ['1.0', '2.0']).

        Second part: string data '1.1' for the 'II' field must make row
        generation raise ValueError.
        """
        variant = vcfio.Variant(reference_name='chr19',
                                start=11,
                                end=12,
                                reference_bases='CT',
                                alternate_bases=[],
                                filters=[],
                                info={
                                    'IB':
                                    vcfio.VariantInfo(data=1, field_count='1'),
                                    'II':
                                    vcfio.VariantInfo(data=1.1,
                                                      field_count='1'),
                                    'IF':
                                    vcfio.VariantInfo(data=[1, 2],
                                                      field_count='2'),
                                    'IS':
                                    vcfio.VariantInfo(data=[1.0, 2.0],
                                                      field_count='2')
                                })
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.CALLS: [],
            'IB': True,
            'II': 1,
            'IF': [1.0, 2.0],
            'IS': ['1.0', '2.0']
        }
        self.assertEqual([expected_row],
                         self._get_row_list_from_variant(
                             variant,
                             self._schema_descriptor,
                             allow_incompatible_records=True))

        # Build the fixture outside the assertRaises block: a ValueError
        # raised while constructing the variant must not satisfy the
        # assertion, only one raised by the row conversion itself.
        invalid_variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='CT',
            alternate_bases=[],
            filters=[],
            # String cannot be cast to integer.
            info={
                'II': vcfio.VariantInfo(data='1.1', field_count='1'),
            })
        with self.assertRaises(ValueError):
            self._get_row_list_from_variant(invalid_variant,
                                            self._schema_descriptor,
                                            allow_incompatible_records=True)
            # Only reached when no exception was raised. The resulting
            # AssertionError is not a ValueError, so it propagates out of
            # the context manager and reports this message.
            self.fail(
                'String data for an integer schema must cause an exception')
Пример #10
0
 def _get_sample_variants(self):
     """Return two sample variants as a list.

     The first is on reference '19' with calls Sample1 and Sample2; the
     second is on reference '20' with calls Sample3 and Sample4. Both have
     an 'A1' info key; 'A2' and 'A3' are specific to one variant each.
     """
     first_info = {
         'A1': vcfio.VariantInfo('some data', '1'),
         'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
     }
     first_calls = [
         vcfio.VariantCall(
             name='Sample1', genotype=[0, 1], phaseset='*',
             info={'GQ': 20, 'HQ': [10, 20]}),
         vcfio.VariantCall(
             name='Sample2', genotype=[1, 0],
             info={'GQ': 10, 'FLAG1': True}),
     ]
     first = vcfio.Variant(
         reference_name='19', start=11, end=12, reference_bases='C',
         alternate_bases=['A', 'TT'], names=['rs1'], quality=2,
         filters=['PASS'], info=first_info, calls=first_calls)

     second_info = {
         'A1': vcfio.VariantInfo('some data2', '2'),
         'A3': vcfio.VariantInfo(['data3', 'data4'], '2'),
     }
     second_calls = [
         vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
         vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
     ]
     second = vcfio.Variant(
         reference_name='20', start=11, end=12, reference_bases='C',
         alternate_bases=['A', 'TT'], names=['rs1'], quality=20,
         filters=['q10'], info=second_info, calls=second_calls)
     return [first, second]
 def _get_sample_variant_1(self):
   """Return a sample variant with 'IS', 'IF', 'IA' info and one call."""
   info = {
       'IS': vcfio.VariantInfo('some data', '1'),
       'IF': vcfio.VariantInfo(True, '0'),
       'IA': vcfio.VariantInfo([0.1, 0.2], '2'),
   }
   sample1 = vcfio.VariantCall(
       name='Sample1', genotype=[0, 1], phaseset='*',
       info={'FI': 20, 'FU': [10.0, 20.0]})
   return vcfio.Variant(
       reference_name='chr19', start=11, end=12, reference_bases='C',
       alternate_bases=['A', 'TT'], names=['rs1', 'rs2'], quality=2,
       filters=['PASS'], info=info, calls=[sample1])
 def _get_sample_variant_with_incompatible_records(self):
     """Return a variant whose data types differ from its paired row.

     Returns:
       A (variant, row) tuple. The variant holds string data for 'IFR',
       integer data for 'IS'/'ISR' and float data for the call's 'FIR';
       the row holds floats, strings and integers respectively.
     """
     info = {
         'IFR': vcfio.VariantInfo(['0.1', '0.2'], '2'),
         'IS': vcfio.VariantInfo(1, '1'),
         'ISR': vcfio.VariantInfo(1, '1'),
     }
     sample1 = vcfio.VariantCall(
         name='Sample1', genotype=[0, 1], phaseset='*',
         info={'GQ': 20, 'FIR': [10.0, 20.0]})
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='C',
         alternate_bases=[], filters=['PASS'], info=info, calls=[sample1])

     call_row = {
         ColumnKeyConstants.CALLS_NAME: 'Sample1',
         ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
         ColumnKeyConstants.CALLS_PHASESET: '*',
         'GQ': 20,
         'FIR': [10, 20],
     }
     row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'C',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['PASS'],
         ColumnKeyConstants.CALLS: [call_row],
         'IFR': [0.1, 0.2],
         'IS': '1',
         'ISR': ['1'],
     }
     return variant, row
Пример #13
0
def _get_sample_variant_3():
    """Return the third sample variant together with its VCF text line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A (variant, vcf_line) tuple.
    """
    vcf_line = ('19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	GT:PS:GQ	0|1:1:45	'
                '.:.:.\n')
    info = {'AF': vcfio.VariantInfo(data=[0.5], field_count='A')}
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        alternate_bases=['<SYMBOLIC>'], quality=49, filters=['q10'],
        info=info)
    phased_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], phaseset='1', info={'GQ': 45})
    variant.calls.append(phased_call)
    missing_call = vcfio.VariantCall(
        name='Sample2', genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    variant.calls.append(missing_call)
    return variant, vcf_line
Пример #14
0
def _get_sample_variant_2():
    """Return the second sample variant together with its VCF text line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A (variant, vcf_line) tuple.
    """
    vcf_line = ('19	123	rs1234	GTC	.	40	q10;s50	NS=2	GT:GQ	1|0:48	0/1:.\n')
    info = {'NS': vcfio.VariantInfo(data=2, field_count='1')}
    variant = vcfio.Variant(
        reference_name='19', start=122, end=125, reference_bases='GTC',
        alternate_bases=[], names=['rs1234'], quality=40,
        filters=['q10', 's50'], info=info)
    phased_call = vcfio.VariantCall(
        name='Sample1', genotype=[1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE, info={'GQ': 48})
    variant.calls.append(phased_call)
    unphased_call = vcfio.VariantCall(
        name='Sample2', genotype=[0, 1], info={'GQ': None})
    variant.calls.append(unphased_call)
    return variant, vcf_line
 def _get_sample_variant_with_empty_calls(self):
     """Return a variant with one empty call and its paired row.

     Returns:
       A (variant, row) tuple. The variant's only call has an empty
       genotype and empty info; the row's calls list is empty.
     """
     empty_call = vcfio.VariantCall(
         name='EmptySample', genotype=[], phaseset='*', info={})
     variant = vcfio.Variant(
         reference_name='20', start=123, end=125, reference_bases='CT',
         alternate_bases=[], filters=['q10', 's10'],
         info={'II': vcfio.VariantInfo(1234, '1')},
         calls=[empty_call])
     row = {
         ColumnKeyConstants.REFERENCE_NAME: '20',
         ColumnKeyConstants.START_POSITION: 123,
         ColumnKeyConstants.END_POSITION: 125,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['q10', 's10'],
         ColumnKeyConstants.CALLS: [],
         'II': 1234,
     }
     return variant, row
Пример #16
0
 def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
   """Return a sample variant and its paired row.

   Args:
     split_alternate_allele_info_fields: When true, the 'AF' and 'AF2'
       values are placed inside each alternate-bases record of the row;
       otherwise they are top-level list fields of the row.

   Returns:
     A (variant, row) tuple.
   """
   info = {
       'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
       'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
       'A1': vcfio.VariantInfo('some data', '1'),
       'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
   }
   calls = [
       vcfio.VariantCall(
           name='Sample1', genotype=[0, 1], phaseset='*',
           info={'GQ': 20, 'HQ': [10, 20]}),
       vcfio.VariantCall(
           name='Sample2', genotype=[1, 0],
           info={'GQ': 10, 'FLAG1': True}),
   ]
   variant = vcfio.Variant(
       reference_name='chr19', start=11, end=12, reference_bases='C',
       alternate_bases=['A', 'TT'], names=['rs1', 'rs2'], quality=2,
       filters=['PASS'], info=info, calls=calls)

   call_rows = [
       {ColumnKeyConstants.CALLS_NAME: 'Sample1',
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'HQ': [10, 20]},
       {ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
        'FLAG1': True},
   ]
   row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
       ColumnKeyConstants.QUALITY: 2,
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: call_rows,
       'A1': 'some data',
       'A2': ['data1', 'data2'],
   }

   alt_a = {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A'}
   alt_tt = {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT'}
   if split_alternate_allele_info_fields:
     # Per-alternate values live inside the alternate-bases records.
     alt_a['AF'] = 0.1
     alt_a['AF2'] = 0.2
     alt_tt['AF'] = 0.2
     alt_tt['AF2'] = 0.3
   row[ColumnKeyConstants.ALTERNATE_BASES] = [alt_a, alt_tt]
   if not split_alternate_allele_info_fields:
     row['AF'] = [0.1, 0.2]
     row['AF2'] = [0.2, 0.3]
   return variant, row
 def test_create_processed_variant_mismatched_annotation_alt(self):
     """Check handling of a CSQ annotation whose alt matches no alternate.

     This is like `test_create_processed_variant_move_alt_info_and_annotation`
     with the difference that it has an extra alt annotation ('ATAT') which
     does not match any alts. The test expects two matching annotations to
     be counted and one mismatch, with the mismatching entry absent from
     both alternates' info.
     """
     variant, header_fields = self._get_sample_variant_and_header_with_csq()
     variant.info['CSQ'] = vcfio.VariantInfo(
         data=[
             'A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3',
             'ATAT|C3|I3|S3|G3'
         ],
         field_count='.')
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('A')
     alt1._info = {
         'A2': 'data1',
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'A',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }, {
             processed_variant._ANNOTATION_ALT: 'A',
             'Consequence': 'C3',
             'IMPACT': 'I3',
             'SYMBOL': 'S3',
             'Gene': 'G3'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('TT')
     alt2._info = {
         'A2': 'data2',
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'TT',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # Membership test instead of the deprecated dict.has_key(), which no
     # longer exists in Python 3. Assumes non_alt_info supports `in` like a
     # dict -- confirm against processed_variant.
     self.assertNotIn('A2', proc_var.non_alt_info)
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
 def _get_sample_variant_2(self):
   """Return a sample variant with no alternates, 'IS_2' info and one call."""
   sample1 = vcfio.VariantCall(
       name='Sample1', genotype=[0, 1], phaseset='*', info={'FI_2': 20})
   info = {'IS_2': vcfio.VariantInfo('some data', '1')}
   return vcfio.Variant(
       reference_name='20', start=123, end=125, reference_bases='CT',
       alternate_bases=[], filters=['q10', 's10'], info=info,
       calls=[sample1])
 def test_create_processed_variant_annotation_alt_prefix_but_ref(self):
     """Check CSQ annotations are matched to alternates 'AA' and 'AAA'.

     Both annotations are expected to be attached to the alternate whose
     bases equal the annotation's alt, giving two matches, no mismatches
     and no minimal matches in the counters.
     """
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['AA', 'AAA'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         info={
             'CSQ':
             vcfio.VariantInfo(data=['AA|C1|I1|S1|G1', 'AAA|C2|I2|S2|G2'],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('AA')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'AA',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('AAA')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'AAA',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # Membership test instead of the deprecated dict.has_key(), which no
     # longer exists in Python 3. Assumes non_alt_info supports `in` like a
     # dict -- confirm against processed_variant.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
 def test_nonstandard_fields_names(self):
     """Check the row keys expected for info names 'A-1' and '_A'.

     The expected row uses 'A_1' and 'field__A' for those two fields.
     """
     info = {
         'A-1': vcfio.VariantInfo('data1', '1'),
         '_A': vcfio.VariantInfo('data2', '2'),
     }
     variant = vcfio.Variant(
         reference_name='chr19', start=11, end=12, reference_bases='CT',
         alternate_bases=[], info=info)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.CALLS: [],
         'A_1': 'data1',
         'field__A': 'data2',
     }
     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
Пример #21
0
 def _get_sample_variant_2(self):
   """Return a call-less sample variant with 'INTINFO' and its paired row.

   Returns:
     A (variant, row) tuple.
   """
   info = {'INTINFO': vcfio.VariantInfo(1234, '1')}
   variant = vcfio.Variant(
       reference_name='20', start=123, end=125, reference_bases='CT',
       alternate_bases=[], filters=['q10', 's10'], info=info)
   row = {
       ColumnKeyConstants.REFERENCE_NAME: '20',
       ColumnKeyConstants.START_POSITION: 123,
       ColumnKeyConstants.END_POSITION: 125,
       ColumnKeyConstants.REFERENCE_BASES: 'CT',
       ColumnKeyConstants.ALTERNATE_BASES: [],
       ColumnKeyConstants.FILTER: ['q10', 's10'],
       ColumnKeyConstants.CALLS: [],
       'INTINFO': 1234,
   }
   return variant, row
Пример #22
0
 def _get_sample_variant_and_header_with_csq(self):
     """Return the sample variant with a 'CSQ' info field and a matching header.

     Returns:
       A (variant, header_fields) tuple, where header_fields is a
       VcfHeader whose only info entry is 'CSQ'.
     """
     csq_values = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']
     variant = self._get_sample_variant()
     variant.info['CSQ'] = vcfio.VariantInfo(data=csq_values, field_count='.')
     csq_header_info = parser._Info(
         id=None,
         num='.',
         type=None,
         desc='some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
         source=None,
         version=None)
     infos = {'CSQ': csq_header_info}
     return variant, vcf_header_io.VcfHeader(infos=infos)
    def test_get_merged_variants_no_custom_options(self):
        """Check merging with no info keys, quality or filter moved to calls.

        A single variant is expected back unchanged; merging all sample
        variants is expected to keep the calls as they were and to retain
        'A1', 'A2' and 'A3' in the merged info.
        """
        strategy = move_to_calls_strategy.MoveToCallsStrategy(
            info_keys_to_move_to_calls_regex=None,
            copy_quality_to_calls=False,
            copy_filter_to_calls=False)
        variants = self._get_sample_variants()

        # Test single variant merge.
        first_variant = variants[0]
        self.assertEqual([first_variant],
                         strategy.get_merged_variants([first_variant]))

        # Test multiple variant merge.
        merged_variant = strategy.get_merged_variants(variants)[0]
        self._assert_common_expected_merged_fields(merged_variant)
        expected_calls = [
            vcfio.VariantCall(
                name='Sample1', genotype=[0, 1],
                info={'GQ': 20, 'HQ': [10, 20]}),
            vcfio.VariantCall(
                name='Sample2', genotype=[1, 0],
                info={'GQ': 10, 'FLAG1': True}),
            vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
            vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
        ]
        self.assertEqual(expected_calls, merged_variant.calls)
        self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
        # 'A1' exists in both inputs; either value is accepted.
        possible_a1_data = ('some data', 'some data2')
        self.assertTrue(merged_variant.info['A1'].data in possible_a1_data)
        expected_a2 = vcfio.VariantInfo(['data1', 'data2'], '2')
        expected_a3 = vcfio.VariantInfo(['data3', 'data4'], '2')
        self.assertEqual(expected_a2, merged_variant.info['A2'])
        self.assertEqual(expected_a3, merged_variant.info['A3'])
 def test_all_fields(self):
     """Checks the row produced for a variant that sets every field.

     The expected row places the 'A'-count INFO fields (AF, AF2) inside the
     per-alternate records, keeps the other INFO fields (I1, I2) at the top
     level, and lists a phaseset of None for calls created without one.
     """
     info_fields = {
         'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
         'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
         'I1': vcfio.VariantInfo('some data', '1'),
         'I2': vcfio.VariantInfo(['data1', 'data2'], '2')
     }
     sample_calls = [
         vcfio.VariantCall(
             name='Sample1',
             genotype=[0, 1],
             phaseset='*',
             info={'GQ': 20, 'HQ': [10, 20]}),
         vcfio.VariantCall(
             name='Sample2',
             genotype=[1, 0],
             info={'GQ': 10, 'FLAG1': True}),
         vcfio.VariantCall(
             name='Sample3',
             genotype=[vcfio.MISSING_GENOTYPE_VALUE])
     ]
     variant = vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=info_fields,
         calls=sample_calls)

     expected_alternates = [
         {
             ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
             'AF': 0.1,
             'AF2': 0.2
         },
         {
             ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
             'AF': 0.2,
             'AF2': 0.3
         },
     ]
     expected_calls = [
         {
             ColumnKeyConstants.CALLS_NAME: 'Sample1',
             ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
             ColumnKeyConstants.CALLS_PHASESET: '*',
             'GQ': 20,
             'HQ': [10, 20]
         },
         {
             ColumnKeyConstants.CALLS_NAME: 'Sample2',
             ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
             ColumnKeyConstants.CALLS_PHASESET: None,
             'GQ': 10,
             'FLAG1': True
         },
         {
             ColumnKeyConstants.CALLS_NAME: 'Sample3',
             ColumnKeyConstants.CALLS_GENOTYPE: [vcfio.MISSING_GENOTYPE_VALUE],
             ColumnKeyConstants.CALLS_PHASESET: None
         },
     ]
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'C',
         ColumnKeyConstants.ALTERNATE_BASES: expected_alternates,
         ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
         ColumnKeyConstants.QUALITY: 2,
         ColumnKeyConstants.FILTER: ['PASS'],
         ColumnKeyConstants.CALLS: expected_calls,
         'I1': 'some data',
         'I2': ['data1', 'data2']
     }

     actual_rows = self._get_row_list_from_variant(variant)
     self.assertEqual([expected_row], actual_rows)
 def test_create_processed_variant_annotation_alt_allele_num(self):
     """Checks annotation-to-ALT matching when ALLELE_NUM is used.

     Six 'CSQ' annotations all name 'T' as their allele. The entries with
     ALLELE_NUM 1 and 2 are expected on the first and second alternate,
     respectively; the other four (0, 3, 'TEST' and empty) are expected to
     be counted as incorrect ALLELE_NUMs and attached to nothing.
     """
     csq_info = parser._Info(
         id=None,
         num='.',
         type=None,
         desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
         source=None,
         version=None)
     header_fields = vcf_header_parser.HeaderFields(infos={'CSQ': csq_info},
                                                    formats={})
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         # The following represent a SNV and an insertion, resp.
         alternate_bases=['T', 'CT'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
         # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
         # But because there is ALLELE_NUM there should be no ambiguity.
         # The last four annotations have incorrect ALLELE_NUMs.
         info={
             'CSQ':
             vcfio.VariantInfo(data=[
                 'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                 'T|C5|I5|TEST', 'T|C6|I6|'
             ],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         use_allele_num=True,
         minimal_match=True,  # This should be ignored by the factory method.
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('T')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'ALLELE_NUM': '1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CT')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'ALLELE_NUM': '2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     # 'CSQ' must have been moved out of the non-ALT info entirely.
     # assertNotIn replaces dict.has_key(), which no longer exists in Python 3.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
 def test_create_processed_variant_annotation_alt_minimal(self):
     """Checks annotation-to-ALT matching with `minimal_match=True`.

     The 'T' annotation is expected on the 'CCT' alternate and flagged as
     ambiguous, the '-' annotation on the 'C' alternate as unambiguous, and
     the 'CT' alternate is expected to receive no annotation at all.
     """
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='CC',
         # The following represent a SNV, an insertion, and a deletion, resp.
         alternate_bases=['CT', 'CCT', 'C'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode, 'T' is an ambiguous annotation ALT
         # because it can map to either the 'CT' SNV or the 'CCT' insertion.
         # It is not ambiguous in the non-minimal mode (it only maps to `CT`).
         info={
             'CSQ':
             vcfio.VariantInfo(data=['T|C1|I1|S1|G1', '-|C2|I2|S2|G2'],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         minimal_match=True,
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('CT')
     alt1._info = {}
     alt2 = processed_variant.AlternateBaseData('CCT')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             processed_variant._ANNOTATION_ALT_AMBIGUOUS: True,
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('C')
     alt3._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: '-',
             processed_variant._ANNOTATION_ALT_AMBIGUOUS: False,
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     # 'CSQ' must have been moved out of the non-ALT info entirely.
     # assertNotIn replaces dict.has_key(), which no longer exists in Python 3.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1)
 def test_create_processed_variant_symbolic_and_breakend_annotation_alt(
         self):
     """Checks annotation matching for symbolic and breakend alternates.

     Three of the four 'CSQ' annotations are expected to attach to the
     '<SYMBOLIC>', '[13:123457[.' and 'C[10:10357[.' alternates; the fourth
     ('C[1') matches none of them and is expected to be counted as a
     mismatch.
     """
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['<SYMBOLIC>', '[13:123457[.', 'C[10:10357[.'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         info={
             'CSQ':
             vcfio.VariantInfo(
                 data=[
                     'SYMBOLIC|C1|I1|S1|G1', '[13|C2|I2|S2|G2',
                     'C[10|C3|I3|S3|G3', 'C[1|C3|I3|S3|G3'
                 ],  # The last one does not match any alts.
                 field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('<SYMBOLIC>')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'SYMBOLIC',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('[13:123457[.')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: '[13',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('C[10:10357[.')
     alt3._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'C[10',
             'Consequence': 'C3',
             'IMPACT': 'I3',
             'SYMBOL': 'S3',
             'Gene': 'G3'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     # 'CSQ' must have been moved out of the non-ALT info entirely.
     # assertNotIn replaces dict.has_key(), which no longer exists in Python 3.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
    def test_sharded_rows(self):
        """Checks that a variant's calls are split over several rows.

        The module-level row size limit is temporarily lowered to the JSON
        size of the first expected row plus 10 bytes. The expected output then
        holds Sample1 and Sample2 in one row and Sample3 in a second row, with
        the variant-level fields repeated in both. The limit is restored in a
        `finally` clause.
        """
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=['A', 'TT'],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info={
                'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
                'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
                'I1': vcfio.VariantInfo('some data', '1'),
            },
            calls=[
                vcfio.VariantCall(
                    name='Sample1',
                    genotype=[0, 1],
                    phaseset='*',
                    info={'GQ': 20, 'HQ': [10, 20]}),
                vcfio.VariantCall(
                    name='Sample2',
                    genotype=[1, 0],
                    info={'GQ': 10, 'FLAG1': True}),
                vcfio.VariantCall(
                    name='Sample3',
                    genotype=[1, 0],
                    info={'GQ': 30, 'FLAG1': True})
            ])

        def build_expected_row(call_records):
            # Both expected rows share the variant-level fields and differ only
            # in their calls; a fresh dict is built on every invocation.
            return {
                ColumnKeyConstants.REFERENCE_NAME: 'chr19',
                ColumnKeyConstants.START_POSITION: 11,
                ColumnKeyConstants.END_POSITION: 12,
                ColumnKeyConstants.REFERENCE_BASES: 'C',
                ColumnKeyConstants.ALTERNATE_BASES: [
                    {
                        ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
                        'AF': 0.1,
                        'AF2': 0.2
                    },
                    {
                        ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
                        'AF': 0.2,
                        'AF2': 0.3
                    },
                ],
                ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
                ColumnKeyConstants.QUALITY: 2,
                ColumnKeyConstants.FILTER: ['PASS'],
                ColumnKeyConstants.CALLS: call_records,
                'I1': 'some data'
            }

        first_shard_calls = [
            {
                ColumnKeyConstants.CALLS_NAME: 'Sample1',
                ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
                ColumnKeyConstants.CALLS_PHASESET: '*',
                'GQ': 20,
                'HQ': [10, 20]
            },
            {
                ColumnKeyConstants.CALLS_NAME: 'Sample2',
                ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
                ColumnKeyConstants.CALLS_PHASESET: None,
                'GQ': 10,
                'FLAG1': True
            },
        ]
        second_shard_calls = [
            {
                ColumnKeyConstants.CALLS_NAME: 'Sample3',
                ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
                ColumnKeyConstants.CALLS_PHASESET: None,
                'GQ': 30,
                'FLAG1': True
            },
        ]
        expected_rows = [
            build_expected_row(first_shard_calls),
            build_expected_row(second_shard_calls),
        ]

        saved_row_size_limit = bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES
        try:
            # Leave room for the first expected row plus 10 spare bytes.
            first_row_json_size = len(json.dumps(expected_rows[0]))
            bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES = (
                first_row_json_size + 10)
            actual_rows = self._get_row_list_from_variant(variant)
            self.assertEqual(expected_rows, actual_rows)
        finally:
            # Always restore the module-level limit for the other tests.
            bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES = saved_row_size_limit
    def test_get_merged_variants_move_quality_and_filter_to_calls(self):
        """Checks merging when quality and filter are copied into each call.

        With `copy_quality_to_calls` and `copy_filter_to_calls` enabled, every
        merged call is expected to carry the quality and filter values of the
        variant it came from, in addition to its own per-call info. The INFO
        fields A1-A3 are still expected on the merged variant.
        """
        strategy = move_to_calls_strategy.MoveToCallsStrategy(
            info_keys_to_move_to_calls_regex='',
            copy_quality_to_calls=True,
            copy_filter_to_calls=True)
        variants = self._get_sample_variants()

        # Test single variant merge.
        single_merged_variant = strategy.get_merged_variants([variants[0]])[0]
        self.assertEqual([
            vcfio.VariantCall(name='Sample1',
                              genotype=[0, 1],
                              info={
                                  'GQ': 20,
                                  'HQ': [10, 20],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(name='Sample2',
                              genotype=[1, 0],
                              info={
                                  'GQ': 10,
                                  'FLAG1': True,
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              })
        ], single_merged_variant.calls)

        # Test multiple variant merge.
        merged_variant = strategy.get_merged_variants(variants)[0]
        self._assert_common_expected_merged_fields(merged_variant)
        self.assertEqual([
            vcfio.VariantCall(name='Sample1',
                              genotype=[0, 1],
                              info={
                                  'GQ': 20,
                                  'HQ': [10, 20],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(name='Sample2',
                              genotype=[1, 0],
                              info={
                                  'GQ': 10,
                                  'FLAG1': True,
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(name='Sample3',
                              genotype=[1, 1],
                              info={
                                  ColumnKeyConstants.QUALITY: 20,
                                  ColumnKeyConstants.FILTER: ['q10']
                              }),
            vcfio.VariantCall(name='Sample4',
                              genotype=[1, 0],
                              info={
                                  'GQ': 20,
                                  ColumnKeyConstants.QUALITY: 20,
                                  ColumnKeyConstants.FILTER: ['q10']
                              })
        ], merged_variant.calls)
        self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
        # Either value is accepted for A1. assertIn (rather than
        # assertTrue(x in ...)) reports the offending value when the check fails.
        self.assertIn(merged_variant.info['A1'].data,
                      ('some data', 'some data2'))
        self.assertEqual(vcfio.VariantInfo(['data1', 'data2'], '2'),
                         merged_variant.info['A2'])
        self.assertEqual(vcfio.VariantInfo(['data3', 'data4'], '2'),
                         merged_variant.info['A3'])