示例#1
0
def main():
    """Merge two HDF5 variation files into a new HDF5 file.

    The command-line arguments are obtained through ``_setup_argparse`` and
    ``_parse_args``.  Every ``fields_func`` entry has the form
    ``field=function``, where ``function`` must be ``min``, ``max`` or
    ``mean``.  The log text returned by ``merge_variations`` is written at
    INFO level to ``<out_fpath>.log``.

    Raises:
        ValueError: if an entry asks for a function that is not supported.
        FileExistsError: if the output file already exists.
    """
    description = 'Merge HDF5 files into a new HDF5 file'
    parser = _setup_argparse(description=description)
    args = _parse_args(parser)

    allowed_functions = {'min': min, 'max': max, 'mean': mean}
    fields_function = {}
    for field_f in args['fields_func']:
        field, function = field_f.split('=')
        if function not in allowed_functions:
            # A bare string cannot be raised in Python 3 (it would surface
            # as an unrelated TypeError), so raise a real exception.
            raise ValueError('Function not supported')
        fields_function[field] = allowed_functions[function]

    merged_fpath = args['out_fpath']
    h5_1 = VariationsH5(args['in_fpaths'][0], 'r')
    h5_2 = VariationsH5(args['in_fpaths'][1], 'r')
    logging.basicConfig(filename=merged_fpath + '.log',
                        filemode='w',
                        level=logging.INFO)
    try:
        _, log = merge_variations(
            h5_1,
            h5_2,
            merged_fpath,
            ignore_overlaps=args['ignore_overlaps'],
            ignore_2_or_more_overlaps=args['ignore_more_overlaps'],
            fields_funct=fields_function,
            ignore_fields=args['ignore_fields'])
        logging.info(log)
    except FileExistsError as error:
        # Re-raise with a user-oriented message, keeping the original cause.
        msg = 'The output file already exists. Remove it to create a new one'
        raise FileExistsError(msg) from error
示例#2
0
def _merge_h5(h5_chroms_fpaths, out_h5_fpath):
    """Concatenate several HDF5 variation files into a single output file.

    Each input file is opened read-only, its chunks are appended to the
    output in the order the paths are given, and the input is closed again.

    Args:
        h5_chroms_fpaths: iterable with the paths of the input HDF5 files.
        out_h5_fpath: path of the HDF5 file to write (opened with mode 'w').
    """
    outh5 = VariationsH5(out_h5_fpath, 'w')
    try:
        for h5_chrom_fpath in h5_chroms_fpaths:
            inh5 = VariationsH5(h5_chrom_fpath, 'r')
            try:
                outh5.put_chunks(inh5.iterate_chunks())
            finally:
                # Close the input even when copying its chunks fails.
                inh5.close()
    finally:
        # Never leave the output file open, whatever happened above.
        outh5.close()
示例#3
0
    def test_count_alleles(self):
        """Check ``counts_by_row`` on file-backed and hand-made genotypes."""
        # First chunk of the small fixture against the known allele counts.
        snps = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        first_chunk = first(snps.iterate_chunks())
        expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
        allele_counts = counts_by_row(first_chunk['/calls/GT'],
                                      missing_value=-1)
        assert numpy.all(expected == allele_counts)

        # Genotype matrix grown from the chunks of the larger fixture; the
        # count is only run here, its result is not compared with anything.
        snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        gt_chunks = (
            chunk['/calls/GT']
            for chunk in snps.iterate_chunks(kept_fields=['/calls/GT'])
        )
        gt_matrix = first(gt_chunks)
        for _ in range(20):
            extend_matrix(gt_matrix, gt_chunks)
        counts_by_row(gt_matrix, missing_value=-1)

        # One variation in which half of the genotypes are missing.
        half_missing = numpy.array(
            [[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]])
        allele_counts = counts_by_row(half_missing, missing_value=-1)
        assert numpy.all(allele_counts == [[6]])

        # Two variations in which every call is the 0 allele.
        all_ref = numpy.array(
            [[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]]])
        allele_counts = counts_by_row(all_ref, missing_value=-1)
        assert numpy.all(allele_counts == [[6, 6]])
示例#4
0
    def test_calculate_hwe(self):
        """Exercise ``calc_hwe_chi2_test`` on empty, literal and file data."""
        # Empty input: the result must have no rows.
        empty = numpy.array([])
        vars_ = VariationsArrays()
        vars_['/calls/GT'] = empty
        vars_['/variations/alt'] = empty
        hwe = calc_hwe_chi2_test(vars_, min_num_genotypes=0, chunk_size=None)
        assert hwe.shape[0] == 0

        # Two variations with ten genotypes each and known test values.
        vars_ = VariationsArrays()
        vars_['/calls/GT'] = numpy.array(
            [[[0, 0], [0, 1], [0, 1], [0, 0], [0, 1], [0, 0],
              [0, 0], [0, 1], [1, 1], [0, 0]],
             [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1], [0, 0],
              [0, 0], [1, 0], [1, 1], [0, 0]]])
        vars_._create_matrix('/variations/alt', shape=(1, 1),
                             dtype=numpy.int16, fillvalue=0)
        hwe = calc_hwe_chi2_test(vars_, min_num_genotypes=0, chunk_size=None)
        known_values = numpy.array([[1.25825397e+01, 1.85240619e-03],
                                    [1.25825397e+01, 1.85240619e-03]])
        assert numpy.allclose(hwe, known_values)

        # The result on a real file must not depend on the chunking.
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hwe_unchunked = calc_hwe_chi2_test(ril, chunk_size=None)
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hwe_default = calc_hwe_chi2_test(ril)
        assert numpy.allclose(hwe_unchunked, hwe_default, equal_nan=True)
示例#5
0
    def test_calc_obs_het_sample(self):
        """Check ``calc_obs_het_by_sample`` on file, array and dict inputs."""
        # HDF5-backed and in-memory inputs must give the same result.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het_by_sample(hdf5)
        het_array = calc_obs_het_by_sample(snps)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1]]])

        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        # numpy.nan instead of the numpy.NaN alias, which NumPy 2.0 removed.
        assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

        # Empty genotypes give an empty result.
        gts = numpy.array([])
        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        assert het.shape[0] == 0

        # Depth limits are only exercised; the default-chunked and the
        # unchunked results must agree.
        snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        calc_obs_het_by_sample(snps, min_call_dp=3)
        calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
        het_0 = calc_obs_het_by_sample(snps)
        het = calc_obs_het_by_sample(snps, chunk_size=None)
        assert numpy.allclose(het_0, het)
示例#6
0
    def test_ignore_non_matching(self):
        """Merging with ``ignore_non_matching=True`` keeps one variation."""
        csv_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        def_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger_kwargs = {
            'max_field_lens': {'alt': 3},
            'ignore_complex_overlaps': True,
            'check_ref_matches': False,
            'ignore_non_matching': True,
        }
        merged = VariationsArrays(ignore_undefined_fields=True)
        merged.put_vars(VarMerger(csv_h5, def_h5, **merger_kwargs))
        assert merged.num_variations == 1
示例#7
0
def filter_missing_rates_from_hdf5():
    """Run the missing-rate filter over a large HDF5 file.

    The ``/calls/GT`` chunks of the performance fixture are passed through
    a filter built with ``missing_rate_filter_fact(min_=0.8)`` and stored
    in a new HDF5 file created at a temporary path.
    """
    src_fpath = join(TEST_DATA_DIR, 'performance', 'inca_torvum_all_snps.h5')
    src_h5 = VariationsH5(src_fpath, mode='r')

    rate_filter = missing_rate_filter_fact(min_=0.8)
    gt_chunks = src_h5.iterate_chunks(kept_fields=['/calls/GT'])
    kept_chunks = map(rate_filter, gt_chunks)

    # Only the unique name is wanted: the placeholder file is removed so
    # that VariationsH5 can create its own file at that path.
    tmp_fhand = NamedTemporaryFile(suffix='.h5')
    os.remove(tmp_fhand.name)
    dst_h5 = VariationsH5(tmp_fhand.name, mode='w')
    dst_h5.put_chunks(kept_chunks)
    dst_h5.close()
示例#8
0
    def test_calc_missing_gt_rates(self):
        """Called and missing genotype counts and rates must be coherent."""
        # Empty genotypes: an empty result both as counts and as rates.
        empty_vars = {'/calls/GT': numpy.array([])}
        for as_rates in (False, True):
            assert calc_called_gt(empty_vars, rates=as_rates).shape[0] == 0

        # In-memory and HDF5-backed inputs give the same missing rates.
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(ril.iterate_chunks(kept_fields=['/calls/GT']))
        rates_memory = calc_missing_gt(in_memory)
        rates_h5 = calc_missing_gt(ril)
        assert rates_memory.shape == (943,)
        assert numpy.allclose(rates_memory, rates_h5)
        assert numpy.min(rates_memory) == 0
        assert numpy.all(rates_memory <= 1)

        # Four variations, two samples each, with 0, 1, 1 and 2 missing.
        toy_vars = {
            '/calls/GT': numpy.array([[[0, 0], [0, 0]], [[0, 0], [-1, -1]],
                                      [[0, 0], [-1, -1]],
                                      [[-1, -1], [-1, -1]]]),
        }
        called_counts = numpy.array([2, 1, 1, 0])
        n_called = calc_called_gt(toy_vars, rates=False)
        assert numpy.all(n_called == called_counts)

        n_missing = calc_missing_gt(toy_vars, rates=False)
        assert numpy.all(n_missing == 2 - called_counts)

        missing_rates = numpy.array([0, 0.5, 0.5, 1])
        assert numpy.allclose(calc_called_gt(toy_vars), 1 - missing_rates)
        assert numpy.allclose(calc_missing_gt(toy_vars), missing_rates)
示例#9
0
 def test_count_alleles_by_freq(self):
     """First SNP of the first chunk has the expected allele frequencies."""
     limon = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
     # A SampleFilter(['V51']) pre-filtering step is left disabled here.
     first_chunk = first(limon.iterate_chunks())
     freqs = calc_allele_freq_by_depth(first_chunk)
     expected_first_snp = [0, 1, 0, 0]
     assert numpy.all(freqs[0] == expected_first_snp)
示例#10
0
    def test_calc_dp_means(self):
        """Per-sample depth means do not depend on the chunking."""
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        default_chunked = calc_depth_mean_by_sample(ril)
        unchunked = calc_depth_mean_by_sample(ril, chunk_size=None)

        # One mean per sample in the fixture.
        assert default_chunked.shape[0] == 153
        assert numpy.allclose(default_chunked, unchunked)
示例#11
0
    def test_calc_obs_het(self):
        """Check ``calc_obs_het`` with empty, file-backed and literal data."""
        # Empty genotypes and depths give an empty result.
        gts = numpy.array([])
        dps = numpy.array([])
        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert het.shape[0] == 0

        # HDF5-backed and in-memory inputs must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
        het_array = calc_obs_het(snps, min_num_genotypes=0)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = numpy.array([[5, 12, 10, 10],
                           [10, 10, 10, 10]])

        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert numpy.allclose(het, [0.5, 0])

        # With too few genotypes every variation yields NaN.  numpy.nan is
        # used because the numpy.NaN alias was removed in NumPy 2.0.
        het = calc_obs_het(varis, min_num_genotypes=10)
        assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

        # Depth thresholds change which calls are taken into account.
        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
        assert numpy.allclose(het, [1, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
        assert numpy.allclose(het, [0, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
        assert numpy.allclose(het, [0.5, 0])
示例#12
0
    def test_append_matrix(self):
        """``append_matrix`` grows both HDF5 datasets and numpy arrays."""
        src_fpath = join(TEST_DATA_DIR, '1000snps.hdf5')
        new_rows = [[1, 1, 1], [2, 2, 2]]
        stored_rows = [[1, 8, 5], [3, 5, 3], [6, 0, 4], [7, 4, 2], [4, 2, 3]]
        # Dataset content after the new rows have been appended once.
        appended_once = stored_rows + new_rows
        # Fresh two-row array followed by the dataset appended to itself.
        array_plus_doubled = new_rows + appended_once + appended_once

        extra = numpy.array(new_rows)
        with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
            # Work on a copy so that the fixture is never modified.
            shutil.copy(src_fpath, tmp_fhand.name)
            snps = VariationsH5(tmp_fhand.name, mode='r+')
            depths = snps['/calls/DP']
            depths_in_memory = depths[()]
            append_matrix(depths, extra)
            assert numpy.all(depths[()] == appended_once)

            append_matrix(depths, depths)

            target = numpy.array([[1, 1, 1], [2, 2, 2]])
            append_matrix(target, depths[()])
            assert numpy.all(array_plus_doubled == target)

        # The in-memory copy taken before any append behaves the same way.
        append_matrix(depths_in_memory, extra)
        assert numpy.all(depths_in_memory == appended_once)
示例#13
0
    def test_calc_allele_obs_distrib_2D(self):
        """2D allele-observation histogram, with and without a het mask."""
        alt_obs = numpy.array([[[0, 0], [5, 0], [-1, -1],
                                [0, -1], [0, 0], [0, 10],
                                [20, 0], [25, 0], [20, 20],
                                [0, 0]]])
        ref_obs = numpy.array([[0, 5, 15, 7, 10, 0, 0, 25, 20, 10]])
        qualities = numpy.array([[40, 30, 35, 30, 0, 40, 30, 35, 30, 0]])
        genotypes = numpy.array([[[0, 0], [1, 0], [-1, -1],
                                  [0, -1], [0, 0], [0, 10],
                                  [1, 0], [0, 0], [0, 0],
                                  [1, 0]]])
        toy_vars = {'/calls/AO': alt_obs,
                    '/calls/RO': ref_obs,
                    '/calls/GQ': qualities,
                    '/calls/GT': genotypes}

        # Without a mask both corner bins hold one observation.
        counts, _, y_edges = hist2d_allele_observations(toy_vars,
                                                        chunk_size=None)
        assert counts[0, 0] == 1
        assert counts[-1, -1] == 1
        assert y_edges[0] == 0

        # Restricting to heterozygous calls empties both corner bins.
        counts, _, _ = hist2d_allele_observations(toy_vars,
                                                  mask_func=call_is_het,
                                                  chunk_size=None)
        assert counts[0, 0] == 0
        assert counts[-1, -1] == 0

        # On a real file the histogram must not depend on the chunk size.
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        whole = hist2d_allele_observations(ril, mask_func=call_is_het,
                                           chunk_size=None)
        by_tens = hist2d_allele_observations(ril, mask_func=call_is_het,
                                             chunk_size=10)
        counts_whole, x_whole, y_whole = whole
        counts_tens, x_tens, y_tens = by_tens
        assert numpy.allclose(x_whole, x_tens)
        assert numpy.allclose(y_whole, y_tens)
        assert numpy.all(counts_whole == counts_tens)
示例#14
0
 def test_ld_along_genome(self):
     """The first item yielded by ``calc_ld_along_genome`` is as expected."""
     snps = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
     ld_kwargs = {'max_dist': 100000000,
                  'chunk_size': 3,
                  'min_num_gts': 1,
                  'max_maf': 1.1}
     ld_items = list(calc_ld_along_genome(snps, **ld_kwargs))
     expected_first = (0.0, 2960.0, (b'20', 14370, b'20', 17330))
     assert ld_items[0:1] == [expected_first]
示例#15
0
    def test_gst(self):
        """Gst of the first locus between the two populations is zero."""
        limon = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
        # A SampleFilter(['V51']) pre-filtering step is left disabled here.
        first_chunk = first(limon.iterate_chunks())

        pops = [['V51'], ['F49']]
        gst_per_locus = calc_gst_per_loci(first_chunk, populations=pops)
        assert gst_per_locus[0] == 0
示例#16
0
 def test_copy(self):
     """``copy`` with ``kept_fields`` transfers only the requested field."""
     source = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
     for klass in VAR_MAT_CLASSES:
         target = _init_var_mat(klass)
         source.copy(target, kept_fields=['/calls/GT'])
         # A field that was not requested must be absent from the copy.
         assert '/calls/GQ' not in target.keys()
         copied_gts = target['/calls/GT']
         assert copied_gts.shape == (5, 3, 2)
         assert numpy.all(copied_gts[:] == source['/calls/GT'])
示例#17
0
    def test_iterate_wins(self):
        """Windows put back together reproduce the original positions."""
        fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hd5 = VariationsH5(fpath, mode='r')
        wins = hd5.iterate_wins(win_size=1000000)

        hd5_2 = VariationsArrays()
        hd5_2.put_chunks(wins)
        # The comparison result used to be discarded, so the test could
        # never fail.  array_equal also returns False (instead of relying
        # on broadcasting) when the two position arrays differ in length.
        assert numpy.array_equal(hd5['/variations/pos'][:],
                                 hd5_2['/variations/pos'][:])
示例#18
0
    def test_iterate_chroms(self):
        """Per-chromosome chunks put back together keep the positions."""
        fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hd5 = VariationsH5(fpath, mode='r')
        wins = hd5.iterate_chroms()

        hd5_2 = VariationsArrays()
        # iterate_chroms yields pairs; only the second item (the chunk)
        # is stored.
        hd5_2.put_chunks([win for _, win in wins])
        # The comparison result used to be discarded, so the test could
        # never fail.  array_equal also returns False (instead of relying
        # on broadcasting) when the two position arrays differ in length.
        assert numpy.array_equal(hd5['/variations/pos'][:],
                                 hd5_2['/variations/pos'][:])
示例#19
0
    def test_add_depth(self):
        """``add_mock_depth`` fills a depth field of the right shape."""
        merged = VariationsH5(join(TEST_DATA_DIR, 'expected_merged3.h5'), 'r')
        # A slice without bounds takes every variation of the file.
        whole = merged.get_chunk(slice(None, None))
        add_mock_depth(whole, 30)

        expected_shape = (whole.num_variations, len(whole.samples))
        assert whole[DP_FIELD].shape == expected_shape
        assert whole[DP_FIELD][0, 0] == 30
示例#20
0
    def test_create_hdf5_with_chunks(self):
        """Fill HDF5 files and in-memory arrays chunk by chunk.

        Covers a full copy, a copy restricted to ``/calls/GT``, random
        sub-sampling of the chunks and feeding empty chunk collections.
        """
        # Full copy into a new HDF5 file.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        # Only the unique name is wanted: closing the temporary file removes
        # it, and VariationsH5 then creates its own file at that path.
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks())
            assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Copy restricted to the genotype field.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
            assert list(hdf5_2['calls'].keys()) == ['GT']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Random sub-sampling at a 20% rate.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
        _, prob = ttest_ind(hdf5['/variations/pos'][:],
                            hdf5_2['/variations/pos'][:])
        assert prob > 0.05
        # abs() makes the tolerance two-sided; without it any ratio below
        # 0.3, even an empty sample, satisfied the check.
        sampled_ratio = hdf5_2.num_variations / hdf5.num_variations
        assert abs(sampled_ratio - 0.2) < 0.1
        # The first sampled variation must match the original it came from.
        chrom = hdf5_2['/variations/chrom'][0]
        pos = hdf5_2['/variations/pos'][0]
        index = PosIndex(hdf5)
        idx = index.index_pos(chrom, pos)
        old_snp = hdf5['/calls/GT'][idx]
        new_snp = hdf5_2['/calls/GT'][0]
        assert numpy.all(old_snp == new_snp)

        # putting empty chunks
        hdf5_2.put_chunks(None)
        hdf5_2.put_chunks([])
        chunk = hdf5.get_chunk(slice(1000, None))
        hdf5_2.put_chunks([chunk])

        old_snp = hdf5['/calls/DP'][idx]
        new_snp = hdf5_2['/calls/DP'][0]
        assert numpy.all(old_snp == new_snp)

        # A sampling rate of zero keeps nothing.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
        assert hdf5_2.num_variations == 0

        # A very low rate only has to run without raising.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_3 = VariationsArrays()
        hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
示例#21
0
 def test_ld_random_pairs_from_different_chroms(self):
     """Asking for 100 random LD pairs yields exactly 100 results."""
     fpath = join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.h5')
     tomato = VariationsH5(fpath, mode='r')
     region = tomato.get_chunk(slice(5000, 15000))

     # Variations without a computable MAF get 1 and are dropped below.
     maf_values = calc_maf(region, min_num_genotypes=10, chunk_size=None)
     maf_values[numpy.isnan(maf_values)] = 1
     kept = region.get_chunk(maf_values < 0.95)

     ld_pairs = list(calc_ld_random_pairs_from_different_chroms(kept, 100))
     assert len(ld_pairs) == 100
示例#22
0
 def xtest_real_file(self):
     """Write a machine-local HDF5 file out as VCF.

     NOTE(review): presumably disabled through the ``x`` prefix because the
     paths below only exist on the original author's machine - confirm.
     """
     fpath = '/home/peio/work_in/test_variation5/write_vcf/original.h5'
     vcf_fpath = '/home/peio/work_in/test_variation5/write_vcf/traditom_tier1.vcf'
     #         kept_fields = ['/variations/chrom', '/variations/pos', '/variations/ref',
     #                        '/variations/alt', '/variations/qual', '/calls/GT',
     #                        '/calls/GQ', '/calls/DP', '/calls/AO', '/calls/RO']
     #         vcfparser = VCFParser(open(vcf_fpath, 'rb'), pre_read_max_size=10000)
     # The context manager closes (and flushes) the VCF even when writing
     # fails; the handle used to be left open.
     with open(vcf_fpath, 'w') as out_fhand:
         h5 = VariationsH5(fpath=fpath, mode='r')
         # h5.put_vars(vcfparser)
         h5.write_vcf(out_fhand)
示例#23
0
    def test_calc_distrib_for_sample(self):
        """Depth histograms for one sample and per sample, with GT masks."""

        def one_hot_row(*hit_bins):
            # 16-bin histogram row with a single count in each given bin.
            return [1 if idx in hit_bins else 0 for idx in range(16)]

        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(ril.iterate_chunks())

        # One sample: same histogram from the file and from memory,
        # whatever the chunk size.
        sample_kwargs = {'field': '/calls/DP',
                         'sample': '1_17_1_gbs',
                         'n_bins': 15}
        from_h5, _ = calc_field_distrib_for_a_sample(ril, **sample_kwargs)
        assert from_h5.shape == (15,)

        unchunked, _ = calc_field_distrib_for_a_sample(in_memory,
                                                       chunk_size=None,
                                                       **sample_kwargs)
        assert numpy.all(from_h5 == unchunked)

        chunked, _ = calc_field_distrib_for_a_sample(in_memory,
                                                     chunk_size=50,
                                                     **sample_kwargs)
        assert numpy.all(chunked == unchunked)

        # Three samples, two variations, with hand-checked histograms.
        toy = VariationsArrays()
        toy['/calls/DP'] = numpy.array([[10, 5, 15],
                                        [0, 15, 10]])
        toy['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                        [[0, 0], [0, 1], [1, 1]]])
        toy.samples = list(range(3))
        all_depths = [10, 5, 15, 0, 15, 10]

        hists, _ = calc_field_distribs_per_sample(toy, field='/calls/DP',
                                                  n_bins=16)
        wanted = [one_hot_row(0, 10), one_hot_row(5, 15), one_hot_row(10, 15)]
        assert numpy.all(wanted == hists)
        assert numpy.all(calc_depth(toy) == all_depths)

        # Only the second sample has heterozygous calls.
        hists, _ = calc_field_distribs_per_sample(toy, field='/calls/DP',
                                                  n_bins=16,
                                                  mask_field='/calls/GT',
                                                  mask_func=call_is_het)
        wanted = [one_hot_row(), one_hot_row(5, 15), one_hot_row()]
        assert numpy.all(wanted == hists)
        assert numpy.all(calc_depth(toy) == all_depths)

        # The first and third samples have the homozygous calls.
        hists, _ = calc_field_distribs_per_sample(toy, field='/calls/DP',
                                                  n_bins=16,
                                                  mask_field='/calls/GT',
                                                  mask_func=call_is_hom)
        wanted = [one_hot_row(0, 10), one_hot_row(), one_hot_row(10, 15)]
        assert numpy.all(wanted == hists)
        assert numpy.all(calc_depth(toy) == all_depths)
示例#24
0
    def test_calc_maf_distrib_by_chunk(self):
        """Histogram of per-chunk MAF values matches the known counts."""
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        # Function applied by histogram_for_chunks to get the MAF values.
        maf_of_chunk = partial(calc_maf, min_num_genotypes=1,
                               chunk_size=None)

        counts, edges = histogram_for_chunks(ril, maf_of_chunk, n_bins=10)

        expected_edges = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                          0.95, 1.]
        expected_counts = [53, 72, 77, 66, 73, 129, 74, 73, 49, 277]
        assert numpy.allclose(edges, expected_edges)
        assert numpy.allclose(counts, expected_counts)
示例#25
0
    def test_count_compatible_snsp_in_strands(self):
        """``count_compatible_snps`` returns the expected totals."""
        iupac_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'iupac_ex.h5'), "r")

        # The same pair of custom alleles for each of the three SNPs.
        alleles = numpy.array([[b'G', b'T']] * 3)
        spec_matrix = numpy.array([[True, False, True],
                                   [True, True, False],
                                   [True, True, False]])
        n_checked, compatible = count_compatible_snps(iupac_h5, spec_matrix,
                                                      alleles)
        assert compatible == [1, 0, 2]
        assert n_checked == 3
示例#26
0
    def test_create_arrays_with_chunks(self):
        """Every variation-matrix class can be filled chunk by chunk."""
        for klass in VAR_MAT_CLASSES:
            in_snps = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'),
                                   mode='r')
            try:
                var_mat = _init_var_mat(klass)
                var_mat.put_chunks(in_snps.iterate_chunks())
                result = var_mat['/calls/GT'][:]
                assert numpy.all(in_snps['/calls/GT'][:] == result)
            finally:
                # Close the input even when an assertion fails; the old
                # ``finally: pass`` did nothing and skipped the close.
                in_snps.close()
示例#27
0
    def test_annotator_h5(self):
        """``IsVariableAnnotator`` adds an integer info field to the data."""
        annot_id = 'test'
        info_field = '/variations/info/{}'.format(annot_id)
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        annotate = IsVariableAnnotator(annot_id=annot_id,
                                       samples=['1_14_1_gbs', '1_17_1_gbs'])

        annotated = annotate(ril)[ANNOTATED_VARS]

        # Metadata describing the new field.
        field_meta = annotated.metadata[info_field]
        assert field_meta['Type'] == 'Integer'
        assert field_meta['Number'] == 1

        # The field exists and holds the expected value for one variation.
        assert info_field in annotated.keys()
        assert annotated[info_field][3] == FALSE_INT
示例#28
0
    def test_pipeline(self):
        """A one-filter pipeline gives the same result as the bare filter."""

        def check_same_output(piped, direct, kept_vars):
            # Histogram and kept genotypes must match the direct filter call.
            assert numpy.allclose(piped['filter1']['counts'],
                                  direct['counts'])
            assert numpy.allclose(piped['filter1']['edges'], direct['edges'])
            assert numpy.allclose(kept_vars['/calls/GT'],
                                  direct[FLT_VARS]['/calls/GT'])

        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        # Rate filter with an explicit histogram range.
        called_flt = MinCalledGTsFilter(min_called=0.1, range_=(0, 1))
        pipeline = Pipeline()
        pipeline.append(called_flt, id_='filter1')
        kept = VariationsArrays()
        piped = pipeline.run(ril, kept)

        # check same result with no pipeline
        direct = called_flt(ril)
        check_same_output(piped, direct, kept)
        piped_stats = piped['filter1'][FLT_STATS]
        direct_stats = direct[FLT_STATS]
        assert piped_stats[N_KEPT] == direct_stats[N_KEPT]
        assert piped_stats[TOT] == direct_stats[TOT]
        assert piped_stats[N_FILTERED_OUT] == direct_stats[N_FILTERED_OUT]

        # check with no range set
        pipeline = Pipeline()
        called_flt = MinCalledGTsFilter(min_called=0.1, do_histogram=True)
        pipeline.append(called_flt, id_='filter1')
        kept = VariationsArrays()
        piped = pipeline.run(ril, kept)

        direct = called_flt(ril)
        check_same_output(piped, direct, kept)

        # With rates False
        pipeline = Pipeline()
        called_flt = MinCalledGTsFilter(min_called=20, rates=False,
                                        do_histogram=True)
        pipeline.append(called_flt, id_='filter1')
        kept = VariationsArrays()
        piped = pipeline.run(ril, kept)

        direct = called_flt(ril)
        assert piped['filter1']['order'] == 0
        check_same_output(piped, direct, kept)
示例#29
0
    def test_calc_gt_type_stats(self):
        """Genotype-type counts per sample for file and literal inputs."""
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        file_stats = calc_gt_type_stats(ril)
        assert file_stats.shape == (4, 153)
        # Every column must add up to the number of variations in the file.
        column_totals = numpy.sum(file_stats, axis=0)
        assert numpy.all(column_totals == 943)

        toy_vars = {
            '/calls/GT': numpy.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                                      [[0, -1], [0, 0], [0, -1], [-1, -1]],
                                      [[0, 1], [0, 0], [0, 0], [-1, -1]]]),
        }
        toy_stats = calc_gt_type_stats(toy_vars)
        wanted = [[1, 2, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 2, 3]]
        assert numpy.all(toy_stats == wanted)
示例#30
0
    def test_fieldpath(self):
        """Annotate, then filter on the annotated field, in one pipeline."""
        annot_id = 'test'
        info_field = '/variations/info/{}'.format(annot_id)
        pipeline = Pipeline()
        ril = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        # Step 1 writes the annotation; step 2 filters on its value.
        is_variable = IsVariableAnnotator(annot_id=annot_id,
                                          samples=['1_14_1_gbs', '1_17_1_gbs'])
        pipeline.append(is_variable)
        value_flt = FieldValueFilter(field_path=info_field, value=0)
        pipeline.append(value_flt)

        kept = VariationsArrays()
        pipeline.run(ril, kept)
        assert kept.num_variations == 484