示例#1
0
    def test_merge_many_different_alternates(self):
        """Checks that same-site variants with different alternates stay apart.

        Three variants share reference name, position and reference base, but
        each has its own alternate base and its own single call. The sorted
        output of `get_merged_variants` is expected to equal the sorted input.
        """
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        inputs = []
        for alt_base, sample_name in (('C', 'Sample1'),
                                      ('G', 'Sample2'),
                                      ('T', 'Sample3')):
            record = vcfio.Variant(reference_name='1',
                                   start=1,
                                   end=2,
                                   reference_bases='A',
                                   alternate_bases=[alt_base])
            record.calls.append(
                vcfio.VariantCall(name=sample_name, genotype=[1, 0]))
            inputs.append(record)

        merged = list(merger.get_merged_variants(inputs))
        self.assertEqual(sorted(merged), sorted(inputs))
示例#2
0
 def _get_variants(self):
     """Returns two variants at the same site with differing annotations.

     Both variants share reference name, start/end, reference bases,
     alternate bases and names; they differ in quality, filters and info.
     """
     def build(quality, filters, info):
         # Every call creates fresh list objects, so the two variants never
         # share mutable state.
         return vcfio.Variant(reference_name='19',
                              start=11,
                              end=12,
                              reference_bases='C',
                              alternate_bases=['A', 'TT'],
                              names=['rs1'],
                              quality=quality,
                              filters=filters,
                              info=info)

     first = build(2, ['PASS'],
                   {'A1': 'some data', 'A2': ['data1', 'data2']})
     second = build(20, ['q10'],
                    {'A1': 'some data2', 'A3': ['data3', 'data4']})
     return [first, second]
示例#3
0
    def test_densify_variants_pipeline(self):
        """Runs DensifyVariants over two variants with partially shared calls.

        The first variant holds the calls for samples 1 and 2, the second the
        calls for samples 2 and 3. The transform output is checked with
        `asserts.has_sample_ids` against all three sample ids.
        """
        sample_ids = [
            hash_name(sample_name)
            for sample_name in ('sample1', 'sample2', 'sample3')
        ]
        first_call, second_call, third_call = [
            vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
        ]
        variants = [
            vcfio.Variant(calls=[first_call, second_call]),
            vcfio.Variant(calls=[second_call, third_call]),
        ]

        pipeline = TestPipeline()
        created = pipeline | Create(variants)
        densified_variants = (
            created
            | 'DensifyVariants' >> densify_variants.DensifyVariants(sample_ids))
        assert_that(densified_variants, asserts.has_sample_ids(sample_ids))

        pipeline.run()
示例#4
0
 def _get_sample_variants(self):
   """Returns two sample variants, each carrying two calls.

   The first variant is on reference '19' with calls for Sample1 and Sample2;
   the second is on reference '20' with calls for Sample3 and Sample4.
   """
   first_calls = [
       vcfio.VariantCall(
           name='Sample1', genotype=[0, 1], phaseset='*',
           info={'GQ': 20, 'HQ': [10, 20]}),
       vcfio.VariantCall(
           name='Sample2', genotype=[1, 0],
           info={'GQ': 10, 'FLAG1': True}),
   ]
   first = vcfio.Variant(
       reference_name='19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1'],
       quality=2,
       filters=['PASS'],
       info={'A1': 'some data', 'A2': ['data1', 'data2']},
       calls=first_calls)

   second_calls = [
       vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
       vcfio.VariantCall(
           name='Sample4', genotype=[1, 0],
           info={'GQ': 20}),
   ]
   second = vcfio.Variant(
       reference_name='20',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1'],
       quality=20,
       filters=['q10'],
       info={'A1': 'some data2', 'A3': ['data3', 'data4']},
       calls=second_calls)
   return [first, second]
 def _get_sample_unmerged_variants(self):
     """Returns two single-call variants meant to stay unmerged.

     Per the original notes, the first differs from the merged variants in
     start/end and the second in the ordering of its alternate bases.
     """
     # Start/end are different from merged variants.
     shifted_call = vcfio.VariantCall(sample_id=hash_name('Unmerged1'),
                                      genotype=[0, 1])
     shifted = vcfio.Variant(reference_name='19',
                             start=123,
                             end=125,
                             reference_bases='C',
                             alternate_bases=['A', 'TT'],
                             names=['rs2'],
                             calls=[shifted_call])

     # Ordering of alternate_bases is different from merged variants.
     reordered_call = vcfio.VariantCall(sample_id=hash_name('Unmerged2'),
                                        genotype=[0, 1])
     reordered = vcfio.Variant(reference_name='19',
                               start=11,
                               end=12,
                               reference_bases='C',
                               alternate_bases=['TT', 'A'],
                               names=['rs3'],
                               calls=[reordered_call])
     return [shifted, reordered]
示例#6
0
  def test_schema_conflict_in_info_field_type(self):
    """Checks row generation when info values disagree with the schema type.

    With `allow_incompatible_records=True`, the first variant's info values
    (1, 1.1, [1, 2], [1.0, 2.0]) are expected to come back as
    True, 1, [1.0, 2.0] and ['1.0', '2.0'] respectively. A second variant
    whose 'II' value is the string '1.1' is expected to raise ValueError.
    """
    convertible_info = {
        'IB': 1,
        'II': 1.1,
        'IFR': [1, 2],
        'ISR': [1.0, 2.0],
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        info=convertible_info)
    header_num_dict = {'IB': '1', 'II': '1', 'IFR': '2', 'ISR': '2'}
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'IB': True,
        'II': 1,
        'IFR': [1.0, 2.0],
        'ISR': ['1.0', '2.0'],
    }
    actual_rows = self._get_row_list_from_variant(
        variant, header_num_dict, allow_incompatible_records=True)
    self.assertEqual([expected_row], actual_rows)

    with self.assertRaises(ValueError):
      # String cannot be cast to integer.
      bad_info = {'II': '1.1'}
      variant = vcfio.Variant(
          reference_name='chr19',
          start=11,
          end=12,
          reference_bases='CT',
          alternate_bases=[],
          filters=[],
          info=bad_info)
      header_num_dict = {'II': '1'}
      self._get_row_list_from_variant(
          variant, header_num_dict, allow_incompatible_records=True)
      # Only reached when no exception was raised above.
      self.fail('String data for an integer schema must cause an exception')
示例#7
0
    def test_non_variant_split_by_snp(self):
        """Checks that a SNP inside a non-variant block splits the block.

        The non-variant spans [0, 10) and the SNP sits at [5, 6). The expected
        output is three records: [0, 5) and [6, 10) carrying only the
        non-variant's call, and the SNP at [5, 6) carrying both calls.
        """
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)
        ref_call = vcfio.VariantCall(name='1', genotype=[0, 0])
        snp_call = vcfio.VariantCall(name='2', genotype=[1, 0])

        def make_snp():
            return vcfio.Variant(reference_name='1',
                                 start=5,
                                 end=6,
                                 reference_bases='C',
                                 alternate_bases=['A'])

        ref_block = vcfio.Variant(reference_name='1', start=0, end=10)
        ref_block.calls.append(ref_call)
        snp = make_snp()
        snp.calls.append(snp_call)

        before_snp = vcfio.Variant(reference_name='1', start=0, end=5)
        before_snp.calls.append(ref_call)
        at_snp = make_snp()
        for call in (ref_call, snp_call):
            at_snp.calls.append(call)
        after_snp = vcfio.Variant(reference_name='1', start=6, end=10)
        after_snp.calls.append(ref_call)

        actual = list(merger.get_merged_variants([ref_block, snp]))
        expected = [before_snp, at_snp, after_snp]
        self.assertEqual(sorted(actual), sorted(expected))
示例#8
0
    def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
        """Runs SampleIdsCombiner with `preserve_sample_order=True`.

        Two variants carry the same three calls in the same order
        (sample2, sample1, sample3). The combined output, collected with
        `combiners.ToList`, is expected to equal that ordered id list.
        """
        sample_ids = [
            hash_name(sample_name)
            for sample_name in ('sample2', 'sample1', 'sample3')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
        ]
        # Each variant gets its own list object holding the same call objects.
        variants = [
            vcfio.Variant(calls=list(variant_calls)),
            vcfio.Variant(calls=list(variant_calls)),
        ]

        pipeline = TestPipeline()
        created = pipeline | transforms.Create(variants)
        combined = created | (
            'CombineSampleIds' >>
            combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True))
        combined_sample_ids = combined | combiners.ToList()
        assert_that(combined_sample_ids, equal_to([sample_ids]))
        pipeline.run()
示例#9
0
  def test_merge_snp_with_non_variant(self):
    """Merges a SNP with an overlapping <NON_REF> record.

    The non-variant spans [0, 10) and the SNP sits at [5, 6). The expected
    output has the non-variant pieces [0, 5) and [6, 10), each carrying only
    the non-variant's call, plus the SNP at [5, 6) carrying both calls.
    """
    merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    def make_snp():
      return vcfio.Variant(
          reference_name='1', start=5, end=6, reference_bases='A',
          alternate_bases=['C'], names=['v'], filters=['vf'], quality=1)

    def make_non_variant(start, end, **extra_fields):
      return vcfio.Variant(
          reference_name='1', start=start, end=end,
          alternate_bases=['<NON_REF>'], names=['nv'], filters=['nvf'],
          quality=2, **extra_fields)

    snp_call = vcfio.VariantCall(name='1', genotype=[1, 0])
    ref_call = vcfio.VariantCall(name='2', genotype=[0, 0])

    snp = make_snp()
    snp.calls.append(snp_call)
    ref_block = make_non_variant(0, 10, reference_bases='G')
    ref_block.calls.append(ref_call)

    # The expected non-variant pieces are built without reference bases.
    left_piece = make_non_variant(0, 5)
    left_piece.calls.append(ref_call)
    right_piece = make_non_variant(6, 10)
    right_piece.calls.append(ref_call)
    merged_snp = make_snp()
    for call in (snp_call, ref_call):
      merged_snp.calls.append(call)

    actual = list(merger.get_merged_variants([snp, ref_block]))
    expected = [left_piece, right_piece, merged_snp]
    self.assertEqual(sorted(actual), sorted(expected))
示例#10
0
    def test_get_snp_merge_keys(self):
        """Checks the first merge key produced for two single-base variants.

        The strategy is built with a fourth positional argument of 2
        (presumably the window size -- confirm against the strategy class).
        """
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None, 2)

        cases = (
            (vcfio.Variant(reference_name='1', start=3, end=4), '1:2'),
            (vcfio.Variant(reference_name='2', start=4, end=5), '2:4'),
        )
        for variant, expected_first_key in cases:
            self.assertEqual(next(merger.get_merge_keys(variant)),
                             expected_first_key)
示例#11
0
  def test_merge_2_non_variants(self):
    """Merges two overlapping <NON_REF> records.

    The inputs span [0, 10) and [5, 15). The expected output is [0, 5) with
    the first record's fields and call, [10, 15) with the second record's
    fields and call, and [5, 10) with the combined names and filters,
    quality 1 and both calls.
    """
    merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    def make_non_variant(start, end, names, filters, quality):
      return vcfio.Variant(
          reference_name='1', start=start, end=end,
          alternate_bases=['<NON_REF>'], names=names, filters=filters,
          quality=quality)

    call_1 = vcfio.VariantCall(name='1', genotype=[0, 0])
    call_2 = vcfio.VariantCall(name='2', genotype=[0, 0])

    non_variant_1 = make_non_variant(
        0, 10, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    non_variant_1.calls.append(call_1)
    non_variant_2 = make_non_variant(
        5, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    non_variant_2.calls.append(call_2)

    only_first = make_non_variant(
        0, 5, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    only_first.calls.append(call_1)
    only_second = make_non_variant(
        10, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    only_second.calls.append(call_2)
    overlap = make_non_variant(
        5, 10, ['nonv1', 'nonv2', 'nonv3'], ['f1', 'f2', 'f3'], 1)
    for call in (call_1, call_2):
      overlap.calls.append(call)

    actual = list(merger.get_merged_variants([non_variant_1, non_variant_2]))
    expected = [only_first, only_second, overlap]

    self.assertEqual(sorted(actual), sorted(expected))
 def _get_sample_variants(self):
   """Returns four minimal variants.

   The second and fourth have identical fields but are distinct objects; the
   third has None for start, end and reference bases.
   """
   # (reference_name, start, end, reference_bases) for each variant, in order.
   field_rows = (
       ('chr19', 11, 12, 'C'),
       ('20', 123, 125, 'CT'),
       ('20', None, None, None),
       ('20', 123, 125, 'CT'),
   )
   return [
       vcfio.Variant(
           reference_name=name, start=start, end=end, reference_bases=bases)
       for name, start, end, bases in field_rows
   ]
def _get_sample_variant_1(is_for_nucleus=False):
    """Get first sample variant.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      is_for_nucleus: Selects the alternative VCF line and info dict, which
        use AF=0.5,0.25 and omit SVTYPE.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # 0.1 -> 0.25 float precision loss due to binary floating point conversion.
        vcf_line = ('20	1234	rs123;rs2	C	A,T	50	'
                    'PASS	AF=0.5,0.25;NS=1	GT:GQ	0/0:48	1/0:20\n')
        info = {
            'AF': [0.5, 0.25],
            'NS': 1
        }
    else:
        vcf_line = ('20	1234	rs123;rs2	C	A,T	50	'
                    'PASS	AF=0.5,0.1;NS=1;SVTYPE=BÑD	GT:GQ	0/0:48	1/0:20\n')
        info = {
            'AF': [0.5, 0.1],
            'NS': 1,
            'SVTYPE': ['BÑD']
        }

    # Everything except the info dict is identical in both modes.
    variant = vcfio.Variant(reference_name='20',
                            start=1233,
                            end=1234,
                            reference_bases='C',
                            alternate_bases=['A', 'T'],
                            names=['rs123', 'rs2'],
                            quality=50,
                            filters=['PASS'],
                            info=info)
    for sample_name, genotype, genotype_quality in (('Sample1', [0, 0], 48),
                                                    ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample_name,
                              genotype=genotype,
                              info={'GQ': genotype_quality}))

    return variant, vcf_line
def _get_sample_variant_3(is_for_nucleus=False):
    """Get third sample variant.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      is_for_nucleus: Selects the alternative form, which uses a 'PASS'
        filter and a two-element missing genotype with empty info for
        Sample2.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    missing = vcfio.MISSING_GENOTYPE_VALUE
    if is_for_nucleus:
        # '.:.:.' -> './.:.:.' due to Nucleus handling of VariantCall.genotype.
        vcf_line = ('19	12	.	C	<SYMBOLIC>	49	PASS	'
                    'AF=0.5	GT:PS:GQ	0|1:1:45	./.:.:.\n')
        filters = ['PASS']
        sample2_call = vcfio.VariantCall(name='Sample2',
                                         genotype=[missing, missing],
                                         info={})
    else:
        vcf_line = ('19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	'
                    'GT:PS:GQ	0|1:1:45	.:.:.\n')
        filters = ['q10']
        sample2_call = vcfio.VariantCall(name='Sample2',
                                         genotype=[missing],
                                         info={'GQ': None})

    variant = vcfio.Variant(reference_name='19',
                            start=11,
                            end=12,
                            reference_bases='C',
                            alternate_bases=['<SYMBOLIC>'],
                            quality=49,
                            filters=filters,
                            info={'AF': [0.5]})
    # The Sample1 call is the same in both modes.
    variant.calls.append(
        vcfio.VariantCall(name='Sample1',
                          genotype=[0, 1],
                          phaseset='1',
                          info={'GQ': 45}))
    variant.calls.append(sample2_call)
    return variant, vcf_line
def _get_sample_variant_2(is_for_nucleus=False):
    """Get second sample variant.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      is_for_nucleus: Selects the alternative form, which uses a single
        'PASS' filter and an empty info dict for Sample2.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # 'q10;s50' -> 'PASS' due to missing header fields.
        vcf_line = ('19	123	rs1234	GTC	.	40	PASS	NS=2	' 'GT:GQ	1|0:48	0/1:.\n')
        filters = ['PASS']
        sample2_info = {}
    else:
        vcf_line = ('19	123	rs1234	GTC	.	40	q10;s50	NS=2	'
                    'GT:GQ	1|0:48	0/1:.\n')
        filters = ['q10', 's50']
        sample2_info = {'GQ': None}

    variant = vcfio.Variant(reference_name='19',
                            start=122,
                            end=125,
                            reference_bases='GTC',
                            alternate_bases=[],
                            names=['rs1234'],
                            quality=40,
                            filters=filters,
                            info={'NS': 2})
    # The Sample1 call is the same in both modes.
    variant.calls.append(
        vcfio.VariantCall(name='Sample1',
                          genotype=[1, 0],
                          phaseset=vcfio.DEFAULT_PHASESET_VALUE,
                          info={'GQ': 48}))
    variant.calls.append(
        vcfio.VariantCall(name='Sample2',
                          genotype=[0, 1],
                          info=sample2_info))
    return variant, vcf_line
示例#16
0
    def test_overlapping_three_non_variants(self):
        """Merges three overlapping non-variant blocks.

        Inputs span [0, 10), [3, 5) and [4, 9), each with one call. The
        expected output is five consecutive pieces, each carrying the calls
        listed for it in the table below.
        """
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)
        call_1 = vcfio.VariantCall('1', [0, 0])
        call_2 = vcfio.VariantCall('2', [0, 0])
        call_3 = vcfio.VariantCall('3', [0, 0])

        def make_block(start, end, calls):
            block = vcfio.Variant(reference_name='1', start=start, end=end)
            for call in calls:
                block.calls.append(call)
            return block

        inputs = [
            make_block(0, 10, [call_1]),
            make_block(3, 5, [call_2]),
            make_block(4, 9, [call_3]),
        ]
        expected = [
            make_block(0, 3, [call_1]),
            make_block(3, 4, [call_1, call_2]),
            make_block(4, 5, [call_1, call_2, call_3]),
            make_block(5, 9, [call_1, call_3]),
            make_block(9, 10, [call_1]),
        ]

        actual = list(merger.get_merged_variants(inputs))
        self.assertEqual(sorted(actual), sorted(expected))
 def test_nonstandard_float_values(self):
     """Checks how inf, -inf and nan info values appear in generated rows.

     Expected row values: a scalar inf becomes the largest integer, -inf in a
     list becomes its negation, nan inside a list becomes the null
     replacement value, and a scalar nan becomes None.
     """
     # `sys.maxint` exists only on Python 2. Fall back to `sys.maxsize` so
     # the test does not raise AttributeError on Python 3, while keeping the
     # exact original value wherever `sys.maxint` is available.
     max_int = getattr(sys, 'maxint', sys.maxsize)
     variant = vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='CT',
         alternate_bases=[],
         filters=[],
         info={
             'F1': vcfio.VariantInfo(float('inf'), '1'),
             'F2': vcfio.VariantInfo(
                 [float('-inf'), float('nan'), 1.2], '3'),
             'F3': vcfio.VariantInfo(float('nan'), '1'),
         })
     null_replacement_value = -max_int
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.CALLS: [],
         'F1': max_int,
         'F2': [-max_int, null_replacement_value, 1.2],
         'F3': None
     }
     self.assertEqual([expected_row],
                      self._get_row_list_from_variant(variant))
示例#18
0
 def _get_sample_variant_with_empty_calls(self):
     """Returns a variant whose only call has an empty genotype.

     Returns:
       A `(variant, row, header_num_dict)` tuple. The row lists no calls.
     """
     empty_call = vcfio.VariantCall(name='EmptySample',
                                    genotype=[],
                                    phaseset='*',
                                    info={})
     variant = vcfio.Variant(reference_name='20',
                             start=123,
                             end=125,
                             reference_bases='CT',
                             alternate_bases=[],
                             filters=['q10', 's10'],
                             info={'II': 1234},
                             calls=[empty_call])
     header_num_dict = {'II': '1'}
     row = {
         ColumnKeyConstants.REFERENCE_NAME: '20',
         ColumnKeyConstants.START_POSITION: 123,
         ColumnKeyConstants.END_POSITION: 125,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['q10', 's10'],
         ColumnKeyConstants.CALLS: [],
         'II': 1234,
     }
     return variant, row, header_num_dict
示例#19
0
def _get_sample_variant_2():
    """Get second sample variant.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    phased_call = vcfio.VariantCall(name='Sample1',
                                    genotype=[1, 0],
                                    phaseset=vcfio.DEFAULT_PHASESET_VALUE,
                                    info={'GQ': 48})
    # Sample2 has no GQ value in the VCF line ('0/1:.').
    unphased_call = vcfio.VariantCall(name='Sample2',
                                      genotype=[0, 1],
                                      info={'GQ': None})

    variant = vcfio.Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        info={'NS': vcfio.VariantInfo(data=2, field_count='1')})
    for call in (phased_call, unphased_call):
        variant.calls.append(call)

    vcf_line = ('19	123	rs1234	GTC	.	40	q10;s50	NS=2	GT:GQ	1|0:48	0/1:.\n')
    return variant, vcf_line
示例#20
0
def _get_sample_variant_1():
    """Get first sample variant.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(reference_name='20',
                            start=1233,
                            end=1234,
                            reference_bases='C',
                            alternate_bases=['A', 'T'],
                            names=['rs123', 'rs2'],
                            quality=50,
                            filters=['PASS'],
                            info=info)
    for sample_name, genotype, genotype_quality in (('Sample1', [0, 0], 48),
                                                    ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample_name,
                              genotype=genotype,
                              info={'GQ': genotype_quality}))

    vcf_line = ('20	1234	rs123;rs2	C	A,T	50	PASS	AF=0.5,0.1;NS=1	'
                'GT:GQ	0/0:48	1/0:20\n')
    return variant, vcf_line
示例#21
0
def _get_sample_variant_3():
    """Get third sample variant.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    phased_call = vcfio.VariantCall(name='Sample1',
                                    genotype=[0, 1],
                                    phaseset='1',
                                    info={'GQ': 45})
    # Sample2 is entirely missing in the VCF line ('.:.:.').
    missing_call = vcfio.VariantCall(name='Sample2',
                                     genotype=[vcfio.MISSING_GENOTYPE_VALUE],
                                     info={'GQ': None})

    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info={'AF': vcfio.VariantInfo(data=[0.5], field_count='A')})
    for call in (phased_call, missing_call):
        variant.calls.append(call)

    vcf_line = ('19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	GT:PS:GQ	0|1:1:45	'
                '.:.:.\n')
    return variant, vcf_line
示例#22
0
 def _get_sample_variant_1(self):
     """Returns a variant with mixed-type info fields and one phased call.

     The info dict holds string, numeric-looking string, float, boolean and
     list values; the single call belongs to the hashed name 'Sample1'.
     """
     variant_info = {
         'IS': 'some data',
         'ISI': '1',
         'ISF': '1.0',
         'IF': 1.0,
         'IB': True,
         'IA': [1, 2],
     }
     sample_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                     genotype=[0, 1],
                                     phaseset='*',
                                     info={
                                         'FI': 20,
                                         'FU': [10.0, 20.0],
                                     })
     return vcfio.Variant(reference_name='chr19',
                          start=11,
                          end=12,
                          reference_bases='C',
                          alternate_bases=['A', 'TT'],
                          names=['rs1', 'rs2'],
                          quality=2,
                          filters=['PASS'],
                          info=variant_info,
                          calls=[sample_call])
示例#23
0
 def test_unicode_fields(self):
     """Checks that unicode and UTF-8 encoded values yield the same row text.

     Filters and info values are supplied as a mix of a unicode string and
     its UTF-8 encoding; every one of them is expected to appear in the row
     as the unicode string.
     """
     unicode_text = u'\xc3\xb6'
     utf8_text = unicode_text.encode('utf-8')
     variant_info = {
         'IS': utf8_text,
         'ISR': [unicode_text, utf8_text],
     }
     variant = vcfio.Variant(reference_name='chr19',
                             start=11,
                             end=12,
                             reference_bases='CT',
                             alternate_bases=[],
                             filters=[unicode_text, utf8_text],
                             info=variant_info)
     header_num_dict = {'IS': '1', 'ISR': '2'}
     proc_variant = _get_processed_variant(variant, header_num_dict)
     expected_row = {
         ColumnKeyConstants.REFERENCE_NAME: 'chr19',
         ColumnKeyConstants.START_POSITION: 11,
         ColumnKeyConstants.END_POSITION: 12,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: [unicode_text, unicode_text],
         ColumnKeyConstants.CALLS: [],
         'IS': unicode_text,
         'ISR': [unicode_text, unicode_text],
     }
     actual_rows = list(self._row_generator.get_rows(proc_variant))
     self.assertEqual([expected_row], actual_rows)
示例#24
0
    def test_no_alternate_bases(self):
        """Checks row generation for a variant with no alternate bases.

        A single row is expected, with empty alternate bases and calls and
        the filter and info values carried over unchanged.
        """
        variant_info = {
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }
        variant = vcfio.Variant(reference_name='chr19',
                                start=11,
                                end=12,
                                reference_bases='CT',
                                alternate_bases=[],
                                filters=['q10'],
                                info=variant_info)
        header_num_dict = {'IS': '1', 'ISR': '2'}
        proc_variant = _get_processed_variant(variant, header_num_dict)
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.FILTER: ['q10'],
            ColumnKeyConstants.CALLS: [],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }

        actual_rows = list(self._row_generator.get_rows(proc_variant))
        self.assertEqual([expected_row], actual_rows)
示例#25
0
 def test_create_processed_variant_annotation_alt_prefix(self):
     """Checks CSQ annotation matching when alternates share a leading base.

     The variant has reference 'C' and alternates 'CT', 'CC' and 'CCC',
     while the CSQ entries name their alternates as 'T', 'C' and 'CC'. Each
     alternate is expected to receive exactly one CSQ annotation, 'CSQ' is
     expected to be absent from the non-alt info, and the counters are
     expected to report 1 variant, 3 alt matches and 0 alt mismatches.
     """
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['CT', 'CC', 'CCC'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         info={'CSQ': ['T|C1|I1|S1|G1', 'C|C2|I2|S2|G2', 'CC|C3|I3|S3|G3']})
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('CT')
     alt1._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'T',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CC')
     alt2._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'C',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('CCC')
     alt3._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'CC',
             'Consequence': 'C3',
             'IMPACT': 'I3',
             'SYMBOL': 'S3',
             'Gene': 'G3'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     # `dict.has_key` was removed in Python 3; a membership assertion works
     # on both major versions and reports the container on failure.
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
示例#26
0
def _get_sample_variant_1(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
  """Builds the first sample variant (reference '20', alternates A and T).

  The variant carries two alternate bases, two names, a non-ASCII `SVTYPE`
  info value, and calls that are created without a phaseset.

  Args:
    file_name: Forwarded to `_get_hashing_function` to obtain the function
      that produces sample ids.
    use_1_based_coordinate: Added to the base start position 1233, so a true
      value yields a start of 1234.
    use_hashing: Forwarded to `_get_hashing_function`.
    move_hom_ref_calls: When true, Sample1 (genotype [0, 0]) is reported in
      `hom_ref_calls` as a (name, sample id) pair and is not appended to
      `calls`. When false, `hom_ref_calls` is None and Sample1 is a regular
      call.

  Returns:
    The constructed `vcfio.Variant`.
  """
  to_sample_id = _get_hashing_function(file_name, use_hashing)

  if move_hom_ref_calls:
    hom_ref_calls = [('Sample1', to_sample_id('Sample1'))]
  else:
    hom_ref_calls = None

  sample_variant = vcfio.Variant(
      reference_name='20',
      start=1233 + use_1_based_coordinate,
      end=1234,
      reference_bases='C',
      alternate_bases=['A', 'T'],
      names=['rs123', 'rs2'],
      quality=50,
      filters=['PASS'],
      hom_ref_calls=hom_ref_calls,
      info={'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']})

  # (sample name, genotype, GQ) for every call attached to the variant.
  call_specs = [('Sample2', [1, 0], 20)]
  if not move_hom_ref_calls:
    # Sample1 stays a regular call and must precede Sample2.
    call_specs.insert(0, ('Sample1', [0, 0], 48))

  for sample_name, genotype, genotype_quality in call_specs:
    sample_variant.calls.append(
        vcfio.VariantCall(sample_id=to_sample_id(sample_name),
                          name=sample_name,
                          genotype=genotype,
                          info={'GQ': genotype_quality}))

  return sample_variant
示例#27
0
def _get_sample_variant_2(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
  """Builds the second sample variant (reference '19', no alternates).

  The variant has a three-base reference ('GTC'), an empty alternate list,
  two filters, one call with the default phaseset, and one call whose `GQ`
  info value is None.

  Args:
    file_name: Forwarded to `_get_hashing_function` to obtain the function
      that produces sample ids.
    use_1_based_coordinate: Added to the base start position 122, so a true
      value yields a start of 123.
    use_hashing: Forwarded to `_get_hashing_function`.
    move_hom_ref_calls: When true, `hom_ref_calls` is an empty list; otherwise
      it is None. Both calls are always appended to `calls`.

  Returns:
    The constructed `vcfio.Variant`.
  """
  to_sample_id = _get_hashing_function(file_name, use_hashing)

  hom_ref_calls = None
  if move_hom_ref_calls:
    hom_ref_calls = []

  sample_variant = vcfio.Variant(
      reference_name='19',
      start=122 + use_1_based_coordinate,
      end=125,
      reference_bases='GTC',
      alternate_bases=[],
      names=['rs1234'],
      quality=40,
      filters=['q10', 's50'],
      hom_ref_calls=hom_ref_calls,
      info={'NS': 2})

  phased_call = vcfio.VariantCall(
      sample_id=to_sample_id('Sample1'),
      name='Sample1',
      genotype=[-1, 0],
      phaseset=vcfio.DEFAULT_PHASESET_VALUE,
      info={'GQ': 48})
  # This call has no phaseset argument and a missing (None) GQ value.
  missing_gq_call = vcfio.VariantCall(
      sample_id=to_sample_id('Sample2'),
      name='Sample2',
      genotype=[0, -1],
      info={'GQ': None})

  for call in (phased_call, missing_gq_call):
    sample_variant.calls.append(call)
  return sample_variant
 def _get_sample_variant(self):
     """Returns a fixed sample variant with two alternates and two calls.

     The variant sits on reference '19' at [11, 12) with reference base 'C'
     and alternates 'A' and 'TT'. Its info fields are wrapped in
     `vcfio.VariantInfo`; the second constructor argument ('1' / 'A') is
     presumably the field's declared count -- confirm against `vcfio`.
     """
     info_fields = {
         'A1': vcfio.VariantInfo('some data', '1'),
         'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
     }
     first_call = vcfio.VariantCall(
         name='Sample1',
         genotype=[0, 1],
         info={'GQ': 20, 'HQ': [10, 20]})
     second_call = vcfio.VariantCall(
         name='Sample2',
         genotype=[1, 0],
         info={'GQ': 10, 'FLAG1': True})
     return vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=info_fields,
         calls=[first_call, second_call])
    def test_get_merge_keys(self):
        """Checks the merge key as variant fields are populated step by step.

        The expected key is five colon-separated parts: reference name,
        start, end, hash of the reference bases, and hash of the
        comma-joined alternate bases. Unset (falsy) fields contribute an
        empty string before hashing/formatting.
        """
        merge_strategy = move_to_calls_strategy.MoveToCallsStrategy(
            None, None, None)
        record = vcfio.Variant()

        def build_key(reference_name, start, end, reference_bases,
                      alternate_bases):
            # Hash the base fields with the strategy's own hash function.
            reference_hash = merge_strategy._get_hash(reference_bases or '')
            alternates_hash = merge_strategy._get_hash(
                ','.join(alternate_bases or []))
            key_parts = (reference_name or '', str(start or ''),
                         str(end or ''), reference_hash, alternates_hash)
            return '%s:%s:%s:%s:%s' % key_parts

        def check_first_key(*expected_fields):
            # Compare against the first key the strategy yields for `record`.
            self.assertEqual(build_key(*expected_fields),
                             next(merge_strategy.get_merge_keys(record)))

        # Freshly constructed variant: nothing set.
        check_first_key(None, None, None, None, None)

        record.reference_name = '19'
        check_first_key(19, None, None, None, None)

        record.start = 123
        record.end = 125
        record.reference_bases = 'AT'
        check_first_key(19, 123, 125, 'AT', None)

        record.alternate_bases = ['A', 'C']
        check_first_key(19, 123, 125, 'AT', ['A', 'C'])
示例#30
0
    def test_schema_conflict_in_format_field_number(self):
        """Checks row output when call info values conflict with the schema.

        Two calls supply list/scalar values for 'FB', 'FI' and 'FSR'. With
        `allow_incompatible_records=True` the row generator is expected to
        emit 'FB' as a bool, 'FI' as a single value (None for the empty
        list), and 'FSR' as a one-element list. The schema those fields are
        converted against belongs to `self._row_generator` and is not
        visible here.
        """
        sample1_call = vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'FB': [1, 2], 'FI': [1, 2], 'FSR': 'str'})
        # Second call: empty values for every field and no phaseset.
        sample2_call = vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'FB': [], 'FI': [], 'FSR': ''})
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='CT',
            alternate_bases=[],
            filters=[],
            calls=[sample1_call, sample2_call])
        proc_variant = _get_processed_variant(variant)

        expected_sample1 = {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'FB': True,
            'FI': 1,
            'FSR': ['str'],
        }
        expected_sample2 = {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'FB': False,
            'FI': None,
            'FSR': [''],
        }
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.CALLS: [expected_sample1, expected_sample2],
        }

        actual_rows = list(
            self._row_generator.get_rows(proc_variant,
                                         allow_incompatible_records=True))
        self.assertEqual([expected_row], actual_rows)