Exemplo n.º 1
0
    def test_merge_variations(self):
        """Merge two variation files in both orders and check every field.

        The merged result is loaded into a ``VariationsArrays`` and each of
        its fields is compared with the matching field of a stored expected
        file.  Fields whose dtype name contains ``float`` are compared after
        being passed through ``remove_nans``; every other field is compared
        element-wise.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.all(remove_nans(expected) ==
                                 remove_nans(result))
            else:
                try:
                    assert numpy.all(expected == result)
                except AssertionError:
                    # Dump the offending field to help debugging, then
                    # re-raise: swallowing the error here would let the test
                    # pass even though the merged data is wrong.
                    print(field)
                    print(expected)
                    print(result)
                    raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.all(remove_nans(expected) ==
                                 remove_nans(result))
            else:
                assert numpy.all(expected == result)
Exemplo n.º 2
0
    def test_merge_with_depth(self):
        """Check that the ``dp`` / ``DP`` values survive a merge.

        Part one merges two mock single-variation inputs through
        ``VarMerger._merge_vars`` and compares the result with a hand-written
        dict.  Part two merges ``format_def.h5`` with itself and checks the
        ``DP`` values of the first variation, both in the merger output and
        after loading it into a ``VariationsArrays``.
        """

        def snv_with_depth(depths):
            # Same variation every time; only the 'dp' values differ.
            return {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                    'gts': numpy.array([[0, 0], [1, 1]]),
                    'dp': numpy.array(depths)}

        low_depth_vars = MockList([snv_with_depth([1, 1])])
        high_depth_vars = MockList([snv_with_depth([20, 20])])
        low_depth_vars.samples = ['a', 'b']
        high_depth_vars.samples = ['c', 'd']
        mock_merger = MockMerger(gt_shape=(4, 2))

        merged_var = VarMerger._merge_vars(mock_merger, low_depth_vars[0],
                                           high_depth_vars[0])
        expected_var = {'gts': [[0, 0], [1, 1], [0, 0], [1, 1]], 'pos': 1,
                        'ref': b'A', 'chrom': '1', 'alt': [b'T'],
                        'dp': [1, 1, 20, 20]}
        self.var_is_equal(expected_var, merged_var)

        # merge the same var with depth
        h5_path = join(TEST_DATA_DIR, 'format_def.h5')
        first_h5 = VariationsH5(h5_path, "r")
        second_h5 = VariationsH5(h5_path, "r")
        merger = VarMerger(first_h5, second_h5, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False, ignore_non_matching=True)
        merged_arrays = VariationsArrays(ignore_overflows=True,
                                         ignore_undefined_fields=True)

        expected_depth = numpy.array([1, 8, 5, 1, 8, 5], dtype=numpy.int16)
        first_variation = list(merger.variations)[0]
        depth_entry = first_variation[8][1]
        assert depth_entry[0] == b'DP'
        assert numpy.all(depth_entry[1] == expected_depth)

        merged_arrays.put_vars(merger)
        assert '/calls/DP' in merged_arrays.keys()
        assert numpy.all(merged_arrays['/calls/DP'][0] == expected_depth)
Exemplo n.º 3
0
 def test_delete_item_from_variationArray(self):
     """Load a VCF into a ``VariationsArrays`` and delete one of its fields.

     After ``del snps['/calls/GT']`` the key must no longer be reported by
     ``snps.keys()``.
     """
     # The context manager closes the file even when parsing or the
     # assertion fails, which the manual close() did not guarantee.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
Exemplo n.º 4
0
 def test_delete_item_from_variationArray(self):
     """Load a VCF into a ``VariationsArrays`` and delete one of its fields.

     The parser is created with ``pre_read_max_size=1000``.  After
     ``del snps['/calls/GT']`` the key must no longer be reported by
     ``snps.keys()``.
     """
     # The context manager closes the file even when parsing or the
     # assertion fails, which the manual close() did not guarantee.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
Exemplo n.º 5
0
    def test_field_filter(self):
        """Run a ``FieldFilter`` through a pipeline and directly.

        Both routes must yield matching ``/calls/GT`` data and leave
        ``GT_FIELD`` as the only remaining key.
        """
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        keep_gt = FieldFilter(kept_fields=[GT_FIELD])

        pipeline = Pipeline()
        pipeline.append(keep_gt)
        piped_vars = VariationsArrays()
        pipeline.run(source, piped_vars)

        # check same result with no pipeline
        direct_vars = keep_gt(source)[FLT_VARS]
        assert numpy.allclose(piped_vars['/calls/GT'],
                              direct_vars['/calls/GT'])
        for filtered in (piped_vars, direct_vars):
            assert list(filtered.keys()) == [GT_FIELD]
Exemplo n.º 6
0
    def test_field_filter(self):
        """Check ``FieldFilter`` inside a ``Pipeline`` and called on its own.

        The pipeline output and the direct call must hold matching
        ``/calls/GT`` values, and each must expose only ``GT_FIELD``.
        """
        variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        field_filter = FieldFilter(kept_fields=[GT_FIELD])

        pipe = Pipeline()
        pipe.append(field_filter)
        from_pipeline = VariationsArrays()
        pipe.run(variations, from_pipeline)

        # check same result with no pipeline
        from_direct_call = field_filter(variations)[FLT_VARS]
        piped_gts = from_pipeline['/calls/GT']
        direct_gts = from_direct_call['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)

        only_gt = [GT_FIELD]
        assert list(from_pipeline.keys()) == only_gt
        assert list(from_direct_call.keys()) == only_gt
Exemplo n.º 7
0
    def test_min_mac(self):
        """Run a ``MacFilter`` through a pipeline and directly.

        The histogram ``counts`` and ``edges`` reported by both routes must
        match.  The test also expects nothing to be kept: the pipeline output
        has no keys and the directly filtered ``/calls/GT`` has zero rows.
        """
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mac_filter = MacFilter(min_mac=10, max_mac=30, do_histogram=True)

        pipeline = Pipeline()
        pipeline.append(mac_filter, id_='filter1')
        piped_vars = VariationsArrays()
        piped_result = pipeline.run(source, piped_vars)

        # check same result with no pipeline
        direct_result = mac_filter(source)
        piped_stats = piped_result['filter1']
        assert numpy.allclose(piped_stats['counts'], direct_result['counts'])
        assert numpy.allclose(piped_stats['edges'], direct_result['edges'])
        assert not piped_vars.keys()

        direct_gts = direct_result[FLT_VARS]['/calls/GT']
        assert direct_gts.shape[0] == 0
Exemplo n.º 8
0
    def test_merge_variations(self):
        """Merge two variation files in both orders and check every field.

        The merged result is loaded into a ``VariationsArrays`` and each of
        its fields is compared with the matching field of a stored expected
        file.  Fields whose dtype name contains ``float`` are compared after
        being passed through ``remove_nans``; every other field is compared
        element-wise.  In the first pass a mismatching non-float field is
        printed before the error is re-raised.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.allclose(remove_nans(expected),
                                      remove_nans(result))
                continue

            try:
                # Check shapes first so that a size mismatch is reported
                # with the field name.
                if expected.shape != result.shape:
                    raise AssertionError('comparison failed for field: ' +
                                         field)
                assert numpy.all(expected == result)
            except (AssertionError, ValueError, TypeError):
                # Show the data that failed to match, then let the test fail.
                print(field)
                print(expected)
                print(result)
                raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.all(remove_nans(expected) ==
                                 remove_nans(result))
            else:
                assert numpy.all(expected == result)