Пример #1
0
    def test_should_merge_genotype_call_object_in_sample_data(self):
        """Merging a '0/1' call into another '0/1' call must yield '1/1'."""
        sample, field = 'sample_name', 'GT'

        target = SampleData([field], [sample])
        target.add_sample_data(sample, field, GenotypeCall('0/1'))

        other = SampleData([field], [sample])
        other.add_sample_data(sample, field, GenotypeCall('0/1'))

        incoming_calls = other.genotypes()
        target.merge_genotype_calls(incoming_calls)

        merged_call = target.get_field(sample, field)
        self.assertEqual(merged_call, GenotypeCall("1/1"))
Пример #2
0
    def test_should_fail_if_sample_data_objects_have_different_sample(self):
        """Merging genotype calls keyed by another sample name must raise."""
        left = SampleData(['GT'], ['sample_name_1'])
        left.add_sample_data('sample_name_1', 'GT', GenotypeCall('0/0'))

        right = SampleData(['GT'], ['sample_name_2'])
        right.add_sample_data('sample_name_2', 'GT', GenotypeCall('0/0'))

        # Resolve the callable and its argument up front so that only the
        # merge call itself runs inside the guarded region.
        merge = left.merge_genotype_calls
        foreign_calls = right.genotypes()
        with self.assertRaises(Exception):
            merge(foreign_calls)
Пример #3
0
    def test_eq(self):
        """Check that Record equality depends on every constructor argument.

        A reference record is compared with an identically built record
        (must be equal) and then with records that each differ from the
        reference in exactly one constructor argument (must be unequal).
        """

        def make_record(chrom="1", ids=(), qual=0.0, filters=(), info=None,
                        samples=(), from_multi_alt=False):
            # Build a fresh Record; every default reproduces the reference.
            return Record(
                None,
                Variant(chrom, 20, "A", "G"),
                set(ids),
                qual,
                set(filters),
                InfoData(None, {} if info is None else info),
                SampleData([], list(samples)),
                from_multi_alt,
            )

        reference = make_record()

        # `==` is used explicitly (rather than assertEqual/assertNotEqual)
        # so that the comparison under test is always the equality operator.
        self.assertTrue(reference == make_record())

        self.assertFalse(reference == make_record(chrom="2"))

        # Identifiers and filters are passed as one-element collections:
        # set("rs0") would produce {'r', 's', '0'} instead of {"rs0"}.
        self.assertFalse(reference == make_record(ids=["rs0"]))

        self.assertFalse(reference == make_record(qual=5.0))

        self.assertFalse(reference == make_record(filters=["CV"]))

        self.assertFalse(reference == make_record(info={'AF': []}))

        self.assertFalse(reference == make_record(samples=['NA12787']))

        self.assertFalse(reference == make_record(from_multi_alt=True))
Пример #4
0
 def test_sample_data_copes_with_mixed_missing_values_in_PL(self):
     """Setting likelihoods with '.' and None entries stores None for both.

     The numeric inputs -0.1, -0.2, -0.3 are expected back from the 'PL'
     field as 1.0, 2.0, 3.0.
     """
     name = 'sample_name'
     data = SampleData(['PL'], [name])

     likelihoods = [-0.1, '.', -0.2, None, -0.3]
     data.set_genotype_likelihoods(name, likelihoods)

     stored = data.get_field(name, 'PL')
     self.assertEqual(stored, [1.0, None, 2.0, None, 3.0])
Пример #5
0
    def test_read_sample_data(self):
        """Populate one sample from the example schema and read it back.

        Values are checked through has_sample, genotypes, get_field and the
        per-sample object returned by get_genotype_data.
        """
        schema = self.__get_example_schema("vcf_example.vcf")
        format_keys = [name for name, _ in schema.iter_sample_data()]

        sample = "sample1"
        sample_data = SampleData(format_keys, [sample])

        field_values = (
            ("GT", GenotypeCall("1|0")),
            ("PL", [3000, 0, 3000]),
            ("GQ", [1000]),
            ("PQ", [2000]),
            ("PS", [60000]),
            ("AD", [140, 110]),
            ("DP", [250]),
            ("VAF", [0.4]),
        )
        for key, value in field_values:
            sample_data.add_sample_data(sample, key, value)

        # Access through the SampleData container itself.
        self.assertTrue(sample_data.has_sample(sample))
        self.assertEqual(sample_data.genotypes(),
                         {sample: GenotypeCall("1|0")})
        self.assertEqual(sample_data.get_field(sample, 'GT'),
                         GenotypeCall("1|0"))
        self.assertEqual(sample_data.get_field(sample, 'PL'),
                         [3000, 0, 3000])

        # Access through the per-sample genotype data object.
        per_sample = sample_data.get_genotype_data(sample)
        self.assertEqual(per_sample.genotype(), GenotypeCall("1|0"))
        self.assertEqual(per_sample['GT'], GenotypeCall("1|0"))
        self.assertEqual(per_sample['PL'], [3000, 0, 3000])
Пример #6
0
 def test_sample_data_copes_with_mixed_missing_values_in_PL(self):
     """Reading likelihoods from a 'PL' field with '.' and None entries.

     The raw 'PL' values -0.1, -0.2, -0.3 are expected back from
     get_genotype_likelihoods as 0.01, 0.02, 0.03; '.' and None both
     come back as None.
     """
     name = 'sample_name'
     data = SampleData(['PL'], [name])

     raw_values = [-0.1, '.', -0.2, None, -0.3]
     data.add_sample_data(name, 'PL', raw_values)

     likelihoods = data.get_genotype_likelihoods(name)
     self.assertEqual(likelihoods, [0.01, None, 0.02, None, 0.03])
Пример #7
0
 def test_should_allow_multiple_samples_for_add_sample_data(self):
     """Values added for two samples under one key are stored per sample."""
     key = 'genotype_key1'
     first, second = 'sample_name1', 'sample_name2'
     sample_data = SampleData([key], [first, second])

     sample_data.add_sample_data(first, key, [1])
     sample_data.add_sample_data(second, key, [3, 4])

     first_value = sample_data.get_field(first, key)
     self.assertEqual(first_value, [1])
     second_value = sample_data.get_field(second, key)
     self.assertEqual(second_value, [3, 4])
Пример #8
0
 def test_should_raise_when_adding_wrong_genotype_data(self):
     """Adding a plain list under 'GT' must raise weCallException."""
     sample_data = SampleData(['GT'], ['sample_name'])
     expected_message = "Genotype field must be a GenotypeCall."

     with self.assertRaisesRegex(weCallException, expected_message):
         sample_data.add_sample_data('sample_name', 'GT', [1])
Пример #9
0
 def test_should_raise_when_adding_sample_data_to_missing_sample(self):
     """Adding data for an unknown sample name must raise weCallException."""
     sample_data = SampleData(['key'], ['sample_name'])
     expected_message = (
         "Missing sample name missing_sample_name supplied when adding sample data."
     )

     with self.assertRaisesRegex(weCallException, expected_message):
         sample_data.add_sample_data('missing_sample_name', 'key', [1])
Пример #10
0
 def test_default_values_are_assigned_when_sample_data_is_constructed(self):
     """A new SampleData holds './.' for GT and [] for every other key."""
     samples = ('sample_name1', 'sample_name2')
     sample_data = SampleData(['GT', 'key1', 'key2'], list(samples))

     # Genotype defaults first, for each sample in turn.
     for sample in samples:
         self.assertEqual(sample_data.get_field(sample, 'GT'),
                          GenotypeCall("./."))

     # Then the remaining keys, key by key, for each sample.
     for key in ('key1', 'key2'):
         for sample in samples:
             self.assertEqual(sample_data.get_field(sample, key), [])
Пример #11
0
    def test_should_return_default_diploid_genotype(self):
        """A new SampleData reports './.' for 'GT' and [] for 'GL'."""
        sample = "NA12878"
        sample_data = SampleData(['GT', 'GL'], [sample])

        # Sanity check: two separately constructed calls compare equal.
        self.assertEqual(GenotypeCall("./."), GenotypeCall("./."))

        # Defaults as seen through the SampleData container.
        self.assertTrue(sample_data.has_sample(sample))
        self.assertEqual(sample_data.genotypes(),
                         {sample: GenotypeCall("./.")})
        self.assertEqual(sample_data.get_field(sample, 'GT'),
                         GenotypeCall("./."))
        self.assertEqual(sample_data.get_field(sample, 'GL'), [])

        # Defaults as seen through the per-sample genotype data object.
        per_sample = sample_data.get_genotype_data(sample)
        self.assertEqual(per_sample.genotype(), GenotypeCall("./."))
        self.assertEqual(per_sample['GT'], GenotypeCall("./."))
        self.assertEqual(per_sample['GL'], [])
Пример #12
0
    def generate_record_from_variant(self, variant, **kwargs):
        """Build a Record for ``variant`` from default annotations.

        The defaults are: empty ``variant_id`` and ``filters`` sets, a
        ``quality`` of None, an InfoData built from ``self.schema`` and an
        empty dict, a SampleData built from the schema's sample-data keys
        and samples, and ``from_multi_alt`` set to False. Any keyword
        argument replaces the default of the same name (or is passed through
        to Record as an extra keyword).
        """
        schema = self.schema
        annotations = dict(
            variant_id=set(),
            quality=None,
            filters=set(),
            info=InfoData(schema, {}),
            sample_info=SampleData(
                [key for key, _ in schema.iter_sample_data()],
                schema.samples,
            ),
            from_multi_alt=False,
        )
        # Caller-supplied values take precedence over the defaults.
        annotations.update(kwargs)

        return Record(schema=schema, variant=variant, **annotations)
Пример #13
0
    def test_should_write_missing_values_in_sample_data(self):
        """A sample left unpopulated must be written as './.:.:.'.

        The first record of the example VCF gets a three-sample SampleData
        in which only 'sample1' and 'sample2' are populated; the row written
        for it is compared with the expected text.
        """
        vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")
        with VCFReaderContextManager(vcf_path) as vcf_handler:
            first_record = next(vcf_handler.read_records())

        sample_data = SampleData(['GT', 'PL', 'GQ'],
                                 ['sample1', 'sample2', 'sample3'])

        # 'sample3' is deliberately absent so it keeps its default values.
        populated = (
            ("sample1", GenotypeCall("1|0"), [3000, 0, 3000], [1000]),
            ("sample2", GenotypeCall("1|1"), [2000, 0, 1000], [3]),
        )
        for sample, call, pl_values, gq_values in populated:
            sample_data.add_sample_data(sample, "GT", call)
            sample_data.add_sample_data(sample, "PL", pl_values)
            sample_data.add_sample_data(sample, "GQ", gq_values)

        first_record.sample_info = sample_data

        print(sample_data.to_vcf_columns())
        vcf_string = vcf_row_from_record(first_record)
        expected_vcf_string = "20	10	.	CT	C	3000	PASS	PP=3000;DP=250;DPR=140;DPF=110;VC=100;VCR=49;VCF=51;ABPV=0.2;SBPV=0.3;MQ=70.0;BR=31.0;QD=None	GT:PL:GQ	1|0:3000,0,3000:1000	1|1:2000,0,1000:3	./.:.:."  # noqa
        self.assertEqual(expected_vcf_string, vcf_string)
Пример #14
0
 def test_should_add_sample_data(self):
     """A value added for a sample and key is returned by get_field."""
     sample, key = 'sample_name', 'genotype_key1'
     sample_data = SampleData([key], [sample])

     sample_data.add_sample_data(sample, key, [1])

     stored = sample_data.get_field(sample, key)
     self.assertEqual(stored, [1])
Пример #15
0
 def test_has_genotype_keys_should_support_multiple_keys(self):
     """Each key given to the constructor is reported by has_genotype_key."""
     keys = ('genotype_key1', 'genotype_key2')
     sample_data = SampleData(list(keys), ['sample_name'])

     for key in keys:
         self.assertTrue(sample_data.has_genotype_key(key))
Пример #16
0
 def test_has_genotype_key_should_report_expected_value(self):
     """has_genotype_key is true for a known key and false for an unknown one."""
     known_key = 'genotype_key'
     sample_data = SampleData([known_key], ['sample_name'])

     known = sample_data.has_genotype_key(known_key)
     self.assertTrue(known)
     unknown = sample_data.has_genotype_key('missing_genotype_key')
     self.assertFalse(unknown)
Пример #17
0
 def test_has_sample_reports_expected_value(self):
     """has_sample is true for a known sample and false for an unknown one."""
     known_sample = 'sample_name'
     sample_data = SampleData(['key1'], [known_sample])

     known = sample_data.has_sample(known_sample)
     self.assertTrue(known)
     unknown = sample_data.has_sample('missing_sample_name')
     self.assertFalse(unknown)
Пример #18
0
 def setUp(self):
     """Create the shared two-sample fixture.

     'sample_name1' gets [1, 2] under 'key'; 'sample_name2' gets a '0/1'
     genotype call under 'GT'. Everything else keeps its default.
     """
     keys = ['GT', 'key']
     samples = ['sample_name1', 'sample_name2']
     self.sample_data = fixture = SampleData(keys, samples)

     fixture.add_sample_data("sample_name1", "key", [1, 2])
     fixture.add_sample_data("sample_name2", "GT", GenotypeCall("0/1"))
Пример #19
0
 def test_gets_value_for_GQ_key(self):
     """get_genotype_quality returns the value stored under 'GQ'."""
     name = 'sample_name'
     data = SampleData(['GQ'], [name])

     data.add_sample_data(name, 'GQ', [2.3])

     quality = data.get_genotype_quality(name)
     self.assertEqual(quality, [2.3])
Пример #20
0
 def test_default_field_value_is_assigned_when_sample_data_is_constructed(self):
     """A non-genotype key holds [] right after construction."""
     sample, key = 'sample_name', 'key1'
     data = SampleData([key], [sample])

     default_value = data.get_field(sample, key)
     self.assertEqual(default_value, [])
Пример #21
0
 def test_gets_exact_values_if_key_is_GL(self):
     """Likelihoods stored under 'GL' are returned unchanged."""
     name = 'sample_name'
     data = SampleData(['GL'], [name])

     data.add_sample_data(name, 'GL', [-0.1, -0.2, -0.3])

     likelihoods = data.get_genotype_likelihoods(name)
     self.assertEqual(likelihoods, [-0.1, -0.2, -0.3])
Пример #22
0
 def test_gets_exact_values_if_key_is_NR(self):
     """get_read_depth returns the value stored under 'NR'."""
     name = 'sample_name'
     data = SampleData(['NR'], [name])

     data.add_sample_data(name, 'NR', [100])

     depth = data.get_read_depth(name)
     self.assertEqual(depth, [100])
Пример #23
0
 def test_gets_dot_if_key_is_GL(self):
     """A '.' stored under 'GL' is returned as '.' by the likelihood getter."""
     name = 'sample_name'
     data = SampleData(['GL'], [name])

     data.add_sample_data(name, 'GL', '.')

     likelihoods = data.get_genotype_likelihoods(name)
     self.assertEqual(likelihoods, '.')
Пример #24
0
 def test_gets_dot_if_key_is_PL(self):
     """Setting likelihoods to '.' leaves '.' in the 'PL' field."""
     name = 'sample_name'
     data = SampleData(['PL'], [name])

     data.set_genotype_likelihoods(name, '.')

     stored = data.get_field(name, 'PL')
     self.assertEqual(stored, '.')
Пример #25
0
 def test_gets_exact_values_if_key_is_PL(self):
     """Setting likelihoods -0.1, -0.2, -0.3 stores 1, 2, 3 under 'PL'."""
     name = 'sample_name'
     data = SampleData(['PL'], [name])

     data.set_genotype_likelihoods(name, [-0.1, -0.2, -0.3])

     stored = data.get_field(name, 'PL')
     self.assertEqual(stored, [1, 2, 3])
Пример #26
0
def _split_genotype_per_alt(gt, n_alts):
    """Return one GenotypeCall per alt, derived from the combined call ``gt``.

    For the alt at position ``index`` every allele index of ``gt`` is
    rewritten: ``None`` becomes '.', ``0`` stays '0', ``index + 1`` becomes
    '1', and any other allele index falls back to '0'.
    """
    delimiter = gt.deliminator()
    calls = []
    for index in range(n_alts):
        # Note: default value should be '.', but downstream tools aren't
        # good enough to use it
        allele_codes = {None: '.', 0: '0', 1 + index: '1'}
        calls.append(GenotypeCall(delimiter.join(
            allele_codes.get(gt_index, '0') for gt_index in gt)))
    return calls


def generate_records(schema, cols):
    """Yield one Record per alt allele listed in a row of VCF columns.

    Args:
        schema: object providing ``samples``, ``get_info_data(key)`` and
            ``get_sample_data(key)``; it is also passed on to the info,
            sample and Record objects that are created.
        cols: sequence of column strings, indexed by the module's ``*_COL``
            constants. The FORMAT column may be absent, in which case every
            yielded Record gets ``None`` as its sample data.

    Yields:
        A Record for each comma-separated entry of the ALT column, carrying
        the info and sample data split out for that alt. The final Record
        argument is True when the row listed more than one alt.

    Raises:
        Exception: an error raised while splitting a sample field is
            re-raised as the same exception type with a message naming the
            field key and the sample; the original error is chained as the
            cause.
    """
    alts = cols[ALT_COL].split(',')
    n_alts = len(alts)
    # The position column is decremented by one
    # (presumably 1-based VCF -> 0-based internal coordinates; confirm).
    variants = [
        Variant(cols[CHROM_COL], int(cols[POS_COL]) - 1, cols[REF_COL], alt)
        for alt in alts
    ]

    if n_alts == 1:
        # deferred parsing is simple with a single alt
        info_data_list = [
            DeferredInfoData(
                schema,
                lambda: defer_parse_info_field(schema, cols[INFO_COL]),
            )
        ]
    else:
        # extract and split info data into lists of length n_alts
        split_info_data = OrderedDict()
        for key, value in parse_info_field(cols[INFO_COL]):
            try:
                info_metadata = schema.get_info_data(key)
            except KeyError:
                # Key unknown to the schema: give every alt a deferred value.
                split_info_data[key] = [
                    DeferredInfoValue(schema, key, value) for _ in alts
                ]
            else:
                raw_values = (
                    value if isinstance(value, list) else value.split(',')
                )
                split_info_data[key] = info_metadata.split_alts(
                    raw_values, n_alts=n_alts)

        # construct InfoData objects from prepared info data
        info_data_list = [
            InfoData(schema, OrderedDict(
                (key, values[index])
                for key, values in split_info_data.items()))
            for index in range(n_alts)
        ]

    try:
        sample_format = cols[FORMAT_COL].split(':')
    except IndexError:
        # No FORMAT column: zip() below pairs every variant with None.
        sample_data_list = repeat(None)
    else:
        split_sample_data = {
            sample_name: sample_field.split(':')
            for sample_name, sample_field
            in zip(schema.samples, cols[SAMPLE_COL:])
        }

        # Each container receives its own copy of the format key list.
        sample_data_list = [
            SampleData(list(sample_format), schema.samples) for _ in alts
        ]
        for sample_name, sample_items in split_sample_data.items():
            split_sample_items = {}

            # extract data from sample fields
            gt = None
            for key, item in zip(sample_format, sample_items):
                try:
                    if key == GENOTYPE_KEY:
                        gt = GenotypeCall(item)
                        values = _split_genotype_per_alt(gt, n_alts)
                    elif key in (GENOTYPE_LIKELIHOODS_KEY,
                                 GENOTYPE_PHRED_LIKELIHOODS_KEY):
                        # Likelihood fields are split with the genotype seen
                        # so far for this sample (None if not yet parsed).
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, gt)
                    else:
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, None)
                    split_sample_items[key] = values
                except Exception as e:
                    # Same exception type, with field/sample context added.
                    raise type(e)(
                        "Error parsing field {} for sample {}: {}".format(
                            key, sample_name, e)) from e

            # distribute data to each split sample meta-data container
            for index, alt_sample_data in enumerate(sample_data_list):
                for key, value in split_sample_items.items():
                    alt_sample_data.add_sample_data(
                        sample_name, key, value[index])

    # generate & return record objects
    from_multi_alt = n_alts > 1
    for variant, info_data, sample_data in zip(
            variants, info_data_list, sample_data_list):
        # Parsed afresh for every record so no objects are shared between
        # the yielded records.
        qual = variant_quality_from_vcf(cols[QUALITY_COL])
        ids = variant_ids_from_vcf(cols[ID_COL])
        filts = filters_from_vcf(cols[FILTER_COL])
        yield Record(schema, variant, ids, qual, filts, info_data,
                     sample_data, from_multi_alt)
Пример #27
0
 def test_gets_exact_values_if_key_is_NV(self):
     """get_variant_support returns the value stored under 'NV'."""
     name = 'sample_name'
     data = SampleData(['NV'], [name])

     data.add_sample_data(name, 'NV', [100])

     support = data.get_variant_support(name)
     self.assertEqual(support, [100])
Пример #28
0
 def test_genotype_field_default_value_is_assigned_when_sample_data_is_constructed(self):
     """The 'GT' field holds a './.' call right after construction."""
     sample = 'sample_name'
     data = SampleData(['GT'], [sample])

     default_call = data.get_field(sample, 'GT')
     self.assertEqual(default_call, GenotypeCall("./."))
Пример #29
0
 def test_gets_list_of_none_if_key_is_GL(self):
     """A list of None values stored under 'GL' is returned unchanged."""
     name = 'sample_name'
     data = SampleData(['GL'], [name])

     data.add_sample_data(name, 'GL', [None, None, None])

     likelihoods = data.get_genotype_likelihoods(name)
     self.assertEqual(likelihoods, [None, None, None])