예제 #1
0
    def test_csv_parser(self):
        """Check CSVParser on standard, IUPAC-coded and pandas-style files.

        For each file the parsed variation tuples, the sample names, the
        ``max_field_lens['alt']`` value and the ploidy are compared with
        hard-coded expectations.  Input files are opened with ``with`` so
        they are closed even when an assertion fails.
        """
        # Expected variations, identical for the standard and pandas files
        solcap_expected = [
            (b'chrom1', 345, b'solcap_snp_sl_15058', b'A', [b'C', b'G'],
             None, None, None,
             [(b'GT', [(0, 0), (0, 0), (0, 0), (0, 1), (2, 2)])]),
            (b'chrom1', 346, b'solcap_snp_sl_60635', b'G', None,
             None, None, None,
             [(b'GT', [(0, 0), (0, 0), (0, 0), (0, 0), (-1, -1)])]),
            (b'chrom1', 347, b'solcap_snp_sl_60604', b'C', [b'T'],
             None, None, None,
             [(b'GT', [(0, 0), (-1, -1), (1, 0), (0, 0), (0, 0)])]),
        ]

        # standard file
        var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom1', 'pos': 345},
                    b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                    b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 347}}
        path = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=1,
                               sep=b'\t')
            assert list(parser.variations) == solcap_expected

        assert parser.samples == [b'SR-9', b'SR-12', b'SR-13', b'SR-15',
                                  b'SR-18']
        assert parser.max_field_lens['alt'] == 2
        assert parser.ploidy == 2

        # IUPAC file
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
        expected = [
            (b'SL2.40ch02', 331954, b'1', b'T', [b'G'], None, None, None,
             [(b'GT', [(1, 1), (0, 0), (-1, -1)])]),
            (b'SL2.40ch02', 681961, b'2', b'C', None, None, None, None,
             [(b'GT', [(0, 0), (0, 0), (-1, -1)])]),
            (b'SL2.40ch02', 1511764, b'3', b'A', [b'T'], None, None, None,
             [(b'GT', [(1, 1), (1, 1), (0, 1)])]),
            (b'SL2.40ch02', 331954, b'1', b'T', [b'G'], None, None, None,
             [(b'GT', [(1, 1), (0, 0), (-1, -1)])]),
        ]
        path = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter())
            # NOTE(review): zip stops at the shorter input, so the number of
            # parsed variations is not checked here and the fourth expected
            # entry is only compared if the parser yields a fourth variation.
            for var, expect in zip(parser.variations, expected):
                assert var == expect

        assert parser.samples == [b'TS-1', b'TS-11', b'TS-21']
        assert parser.max_field_lens['alt'] == 1
        assert parser.ploidy == 2

        # pandas csv
        var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom1', 'pos': 345},
                    b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                    b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 347}}
        path = join(TEST_DATA_DIR, 'csv', 'pandas.csv')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=0,
                               sep=b'\t')
            # NOTE(review): as above, zip does not verify the variation count.
            for var, expect in zip(parser.variations, solcap_expected):
                assert var == expect

        assert parser.samples == [b'SR-9', b'SR-12', b'SR-13', b'SR-15',
                                  b'SR-18']
        assert parser.max_field_lens['alt'] == 2
        assert parser.ploidy == 2
예제 #2
0
 def test_create_iupac_splitter(self):
     """Check the IUPAC allele splitter on b'A', b'-' and empty input."""
     spliter = create_iupac_allele_splitter()
     # A single base comes back as a pair holding its byte code twice
     assert spliter(b'A') == (ord('A'), ord('A'))
     # None is a singleton: test identity rather than equality
     assert spliter(b'-') is None
     assert spliter(b'') is None
예제 #3
0
    def test_put_vars_from_csv(self):
        """Store CSV genotypes with ``put_vars`` and check what was stored.

        The IUPAC-coded file goes into a ``VariationsH5`` file and the
        two-letter-coded file into a ``VariationsArrays`` object.  Both
        stores are checked against the same expected chromosomes,
        positions, alleles and genotype calls.  Input files are opened
        with ``with`` so they are closed even when an assertion fails.
        """
        def check_stored(variations):
            # Assertions shared by the HDF5 and the in-memory store.
            exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
            assert list(variations['/variations/chrom'][:]) == exp
            first_alts = [alts[0] for alts in variations['/variations/alt']]
            alleles = list(zip(variations['/variations/ref'], first_alts))
            exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
            # Compared as sets, so ref and first alt may come out swapped
            for als, aexp in zip(alleles, exp):
                assert set(als) == set(aexp)
            assert list(variations['/variations/pos'][:]) == [331954, 681961,
                                                              1511764]
            # Either genotype coding is accepted -- presumably one per
            # possible ref/alt assignment (TODO confirm)
            exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [1, 0]]])
            exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[1, 1], [1, 1], [0, 1]]])
            stored_gts = variations['/calls/GT'][:]
            for gts, exp_gts1, exp_gts2 in zip(stored_gts, exp1, exp2):
                for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                    assert set(gt) == set(ex1) or set(gt) == set(ex2)

        # IUPAC-coded file into an HDF5 store
        path = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
        with open(path, 'rb') as fhand_ex:
            var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                        b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                        b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter(),
                               max_field_lens={'alt': 1},
                               max_field_str_lens={'alt': 1, 'chrom': 20,
                                                   'ref': 1})

            with NamedTemporaryFile(suffix='.h5') as fhand:
                # Only the temporary name is reused: the empty file is
                # removed before VariationsH5 opens that path in 'w' mode
                os.remove(fhand.name)
                h5 = VariationsH5(fhand.name, mode='w', ignore_overflows=True,
                                  ignore_undefined_fields=True)
                h5.put_vars(parser)
                check_stored(h5)

            if os.path.exists(fhand.name):
                os.remove(fhand.name)

        # Two-letter-coded file into an in-memory store
        path = join(TEST_DATA_DIR, 'csv', 'two_letter_coding_ex3.txt')
        with open(path, 'rb') as fhand_ex:
            var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                        b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                        b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               max_field_lens={'alt': 1},
                               max_field_str_lens={'alt': 1, 'chrom': 20,
                                                   'ref': 1})

            arrays = VariationsArrays(ignore_overflows=True,
                                      ignore_undefined_fields=True)
            arrays.put_vars(parser)
            check_stored(arrays)
예제 #4
0
    def test_csv_parser(self):
        """Check CSVParser on standard, IUPAC-coded and pandas-style files.

        For each file the parsed variation tuples, the sample names and the
        ploidy are compared with hard-coded expectations.  Input files are
        opened with ``with`` so they are closed even when an assertion
        fails.
        """
        # Expected variations, identical for the standard and pandas files
        solcap_expected = [
            (b'chrom1', 345, b'solcap_snp_sl_15058', b'A', [b'C', b'G'],
             None, None, None,
             [(b'GT', [(0, 0), (0, 0), (0, 0), (0, 1), (2, 2)])]),
            (b'chrom1', 346, b'solcap_snp_sl_60635', b'G', None,
             None, None, None,
             [(b'GT', [(0, 0), (0, 0), (0, 0), (0, 0), (-1, -1)])]),
            (b'chrom1', 347, b'solcap_snp_sl_60604', b'C', [b'T'],
             None, None, None,
             [(b'GT', [(0, 0), (-1, -1), (1, 0), (0, 0), (0, 0)])]),
        ]

        # standard file
        var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom1', 'pos': 345},
                    b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                    b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 347}}
        path = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=1,
                               sep=b'\t')
            assert list(parser.variations) == solcap_expected

        assert parser.samples == [b'SR-9', b'SR-12', b'SR-13', b'SR-15',
                                  b'SR-18']
        assert parser.ploidy == 2

        # IUPAC file
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
        expected = [
            (b'SL2.40ch02', 331954, b'1', b'T', [b'G'], None, None, None,
             [(b'GT', [(1, 1), (0, 0), (-1, -1)])]),
            (b'SL2.40ch02', 681961, b'2', b'C', None, None, None, None,
             [(b'GT', [(0, 0), (0, 0), (-1, -1)])]),
            (b'SL2.40ch02', 1511764, b'3', b'A', [b'T'], None, None, None,
             [(b'GT', [(1, 1), (1, 1), (0, 1)])]),
            (b'SL2.40ch02', 331954, b'1', b'T', [b'G'], None, None, None,
             [(b'GT', [(1, 1), (0, 0), (-1, -1)])]),
        ]
        path = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter())
            # NOTE(review): zip stops at the shorter input, so the number of
            # parsed variations is not checked here and the fourth expected
            # entry is only compared if the parser yields a fourth variation.
            for var, expect in zip(parser.variations, expected):
                assert var == expect

        assert parser.samples == [b'TS-1', b'TS-11', b'TS-21']
        assert parser.ploidy == 2

        # pandas csv
        var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom1', 'pos': 345},
                    b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                    b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 347}}
        path = join(TEST_DATA_DIR, 'csv', 'pandas.csv')
        with open(path, 'rb') as fhand:
            parser = CSVParser(fhand, var_info, first_sample_column=0,
                               sep=b'\t')
            # NOTE(review): as above, zip does not verify the variation count.
            for var, expect in zip(parser.variations, solcap_expected):
                assert var == expect

        assert parser.samples == [b'SR-9', b'SR-12', b'SR-13', b'SR-15',
                                  b'SR-18']
        assert parser.ploidy == 2
예제 #5
0
 def test_create_iupac_splitter(self):
     """Verify the IUPAC splitter output for b'A', b'-' and b''."""
     split_alleles = create_iupac_allele_splitter()
     code_a = ord('A')
     # A single base is expected back as its byte code, twice
     assert split_alleles(b'A') == (code_a, code_a)
     # b'-' and the empty value are both expected to give None
     for blank in (b'-', b''):
         assert split_alleles(blank) is None
예제 #6
0
    def test_put_vars_from_csv(self):
        """Store CSV genotypes with ``put_vars`` and check what was stored.

        The IUPAC-coded file goes into a ``VariationsH5`` file and the
        two-letter-coded file into a ``VariationsArrays`` object.  Both
        stores are checked against the same expected chromosomes,
        positions, alleles and genotype calls.  Input files are opened
        with ``with`` so they are closed even when an assertion fails.
        """
        def check_stored(variations):
            # Assertions shared by the HDF5 and the in-memory store.
            exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
            assert list(variations['/variations/chrom'][:]) == exp
            first_alts = [alts[0] for alts in variations['/variations/alt']]
            alleles = list(zip(variations['/variations/ref'], first_alts))
            exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
            # Compared as sets, so ref and first alt may come out swapped
            for als, aexp in zip(alleles, exp):
                assert set(als) == set(aexp)
            assert list(variations['/variations/pos'][:]) == [331954, 681961,
                                                              1511764]
            # Either genotype coding is accepted -- presumably one per
            # possible ref/alt assignment (TODO confirm)
            exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [1, 0]]])
            exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[1, 1], [1, 1], [0, 1]]])
            stored_gts = variations['/calls/GT'][:]
            for gts, exp_gts1, exp_gts2 in zip(stored_gts, exp1, exp2):
                for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                    assert set(gt) == set(ex1) or set(gt) == set(ex2)

        # IUPAC-coded file into an HDF5 store
        path = join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt')
        with open(path, 'rb') as fhand_ex:
            var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                        b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                        b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter())

            with NamedTemporaryFile(suffix='.h5') as fhand:
                # Only the temporary name is reused: the empty file is
                # removed before VariationsH5 opens that path in 'w' mode
                os.remove(fhand.name)
                h5 = VariationsH5(fhand.name, mode='w',
                                  ignore_undefined_fields=True)
                h5.put_vars(parser)
                check_stored(h5)

            if os.path.exists(fhand.name):
                os.remove(fhand.name)

        # Two-letter-coded file into an in-memory store
        path = join(TEST_DATA_DIR, 'csv', 'two_letter_coding_ex3.txt')
        with open(path, 'rb') as fhand_ex:
            var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                        b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                        b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t')

            arrays = VariationsArrays(ignore_undefined_fields=True)
            arrays.put_vars(parser)
            check_stored(arrays)