Пример #1
0
    def test_set_gt_to_missing_by_dp2(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        dps = numpy.array([[10, 20, 5, 20, 25], [10, 2, 5, 15, 5]])
        variations[GT_FIELD] = gts
        variations[DP_FIELD] = dps

        set_low_dp_gts_to_missing = LowDPGTsToMissingSetter(min_dp=10)
        res = set_low_dp_gts_to_missing(variations)[FLT_VARS]

        assert numpy.all(res[DP_FIELD] == numpy.array([[10, 20, -1, 20, 25],
                                                       [10, -1, -1, 15, -1]]))
        assert numpy.all(res[GT_FIELD] == numpy.array([[[0, 0], [1, 1], [
            -1, -1
        ], [1, 1], [0, 0]], [[0, 0], [-1, -1], [-1, -1], [0, 0], [-1, -1]]]))

        variations = VariationsArrays()
        variations[GT_FIELD] = gts
        variations[DP_FIELD] = dps

        set_low_dp_gts_to_missing = LowDPGTsToMissingSetter(
            min_dp=10, query_field_to_missing=False)
        res = set_low_dp_gts_to_missing(variations)[FLT_VARS]
        assert numpy.all(res[DP_FIELD] == variations[DP_FIELD])
Пример #2
0
    def test_filter_chi2_gt_sample_sets(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 1], [1, 1], [1, 1], [1, 1]],
                           [[0, 0], [0, 0], [0, 1], [1, 1], [1, 1], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5, 6]
        samples1 = [1, 2, 3]
        samples2 = [4, 5, 6]
        flt = Chi2GtFreqs2SampleSetsFilter(samples1,
                                           samples2,
                                           min_pval=0.05,
                                           n_bins=2,
                                           report_selection=True)
        res = flt(variations)
        assert list(res[COUNTS]) == [2, 2]
        assert numpy.all(res[FLT_VARS][GT_FIELD] == gts[:2, ...])
        assert res[FLT_STATS][N_KEPT] == 2
        assert res[FLT_STATS][TOT] == 4
        assert res[FLT_STATS][N_FILTERED_OUT] == 2
        assert res[SELECTED_VARS].shape

        flt = Chi2GtFreqs2SampleSetsFilter(samples1,
                                           samples2,
                                           min_pval=0.05,
                                           n_bins=2,
                                           return_discarded=True)
        res = flt(variations)
        assert res[DISCARDED_VARS].num_variations
Пример #3
0
    def test_filter_macs(self):
        # with some missing values
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [-1, -1], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        filtered = MacFilter(min_num_genotypes=5,
                             report_selection=True)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0
        assert filtered[SELECTED_VARS].shape

        filtered = MacFilter(min_mac=0, min_num_genotypes=5)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 1, 2]])
        assert filtered[FLT_STATS][N_KEPT] == 3
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 1

        # without missing values
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts

        expected = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                                [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                                [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        filtered = MacFilter(max_mac=4, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                                [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        filtered = MacFilter(min_mac=3.5, max_mac=4,
                             min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        expected = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]]])
        filtered = MacFilter(max_mac=3, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        filtered = MacFilter(min_mac=2, max_mac=5,
                             min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == variations[GT_FIELD])

        # With hdf5 files
        variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        filtered = MacFilter(min_mac=130, max_mac=150)(variations)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {-1: 64530, 0: 36977, 1: 18716, 2: 35}
Пример #4
0
    def test_filter_mafs_2(self):
        # with some missing values
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [-1, -1], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        filtered = MafFilter(min_num_genotypes=5)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0

        filtered = MafFilter(min_num_genotypes=5, min_maf=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 1, 2]])
        assert filtered[FLT_STATS][N_KEPT] == 3
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 1

        # without missing values
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts

        expected = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                                [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                                [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        filtered = MafFilter(min_num_genotypes=0, max_maf=0.8)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                                [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        filtered = MafFilter(min_num_genotypes=0, min_maf=0.6,
                             max_maf=0.8)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        expected = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]]])
        filtered = MafFilter(min_num_genotypes=0, max_maf=0.5)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        filtered = MafFilter(min_num_genotypes=0, min_maf=0.5,
                             max_maf=1)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == variations[GT_FIELD])

        # With hdf5 files
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        filtered = MafFilter(min_maf=0.6, max_maf=0.9)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 57805, -1: 55792, 1: 32504, 2: 162, 3: 5}
Пример #5
0
    def test_filter_high_density(self):
        in_vars = VariationsArrays()
        out_vars = VariationsArrays()
        res = filter_variation_density(in_vars, out_vars=out_vars,
                                       max_density=1, window=1)
        assert not out_vars.keys()
        assert not res

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis, out_vars=out_vars,
                                          max_density=2, window=3, n_bins=2)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis, out_vars=out_vars,
                                          max_density=2, window=3, n_bins=2,
                                          chunk_size=1)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis, out_vars=out_vars,
                                          max_density=2, window=3, n_bins=2,
                                          chunk_size=None)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2
Пример #6
0
    def test_filter_by_pos(self):
        variations = VariationsArrays()
        pos = numpy.array([5, 10, 15, 20, 25])
        chrom = numpy.array(['1'] * len(pos))
        variations[POS_FIELD] = pos
        variations[CHROM_FIELD] = chrom

        regions = [('1', 5, 10)]

        filtered = SNPPositionFilter(regions)(variations)
        assert filtered[FLT_VARS][POS_FIELD] == [5]

        regions = [('2', 5, 10)]
        filtered = SNPPositionFilter(regions)(variations)
        assert not filtered[FLT_VARS][POS_FIELD].size

        regions = [('1', 23, 30), ('2', 5, 10)]
        filtered = SNPPositionFilter(regions)(variations)
        assert filtered[FLT_VARS][POS_FIELD] == [25]

        regions = [('1', 23, 30), ('2', 5, 10)]
        filtered = SNPPositionFilter(regions, reverse=True)(variations)
        assert numpy.all(filtered[FLT_VARS][POS_FIELD] == [5, 10, 15, 20])

        regions = [('2', )]
        filtered = SNPPositionFilter(regions)(variations)
        assert not filtered[FLT_VARS][POS_FIELD].size
Пример #7
0
    def test_filter_biallelic(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = [GT_FIELD]
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)

        flt_chunk = NonBiallelicFilter(report_selection=True)(chunk)
        kept = flt_chunk[FLT_VARS][GT_FIELD].shape[0]
        assert flt_chunk[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert flt_chunk[FLT_STATS][N_KEPT] == kept
        assert flt_chunk[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert flt_chunk[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - kept
        assert flt_chunk[SELECTED_VARS].shape

        flt_chunk = NonBiallelicFilter(report_selection=True,
                                       reverse=True)(chunk)
        kept = flt_chunk[FLT_VARS][GT_FIELD].shape[0]
        assert flt_chunk[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert flt_chunk[FLT_STATS][N_KEPT] == kept
        assert flt_chunk[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert flt_chunk[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - kept
        assert flt_chunk[SELECTED_VARS].shape

        variations = VariationsArrays()
        gts = numpy.array([
            [[0, 0], [1, 1], [0, 1]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 0], [1, 1], [2, 2]],
        ])
        variations[GT_FIELD] = gts
        flt_vars = NonBiallelicFilter()(variations)[FLT_VARS]
        expected = numpy.array([[[0, 0], [1, 1], [0, 1]]])
        assert numpy.all(flt_vars[GT_FIELD] == expected)
Пример #8
0
    def test_filter_allele_depth_based_maf(self):
        allele_depths_snp1 = [
            [10, 0, 1],  # Allele Obervation in sample1
            [4, 6, 1]
        ]  # Allele Obervation in sample2
        allele_depths_snp2 = [
            [10, 0, 0],  # Allele Obervation in sample1
            [0, 5, 7]
        ]  # Allele Obervation in sample2
        allele_depths_snp3 = [
            [-1, -1, -1],  # Allele Obervation in sample1
            [-1, -1, -1]
        ]  # Allele Obervation in sample2

        allele_depths = numpy.array(
            [allele_depths_snp1, allele_depths_snp2, allele_depths_snp3])
        varis = VariationsArrays()
        varis[AD_FIELD] = allele_depths
        filtered_vars = AlleleObservationBasedMafFilter(
            max_maf=0.8)(varis)[FLT_VARS]
        assert filtered_vars.num_variations == 2

        filtered_vars = AlleleObservationBasedMafFilter(
            max_maf=0.5)(varis)[FLT_VARS]
        expected = numpy.array([allele_depths_snp2])
        assert numpy.all(filtered_vars[AD_FIELD] == expected)

        filtered_vars = AlleleObservationBasedMafFilter(
            min_maf=0.8)(varis)[FLT_VARS]
        assert not filtered_vars.num_variations
        expected = numpy.array([allele_depths_snp2])
        assert numpy.all(filtered_vars[AD_FIELD] == expected)
Пример #9
0
    def test_annotation_filter(self):
        annot_id = 'test'
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        variations = VariationsArrays()
        variations.put_chunks(hdf5.iterate_chunks())

        annotator = IsVariableAnnotator(annot_id=annot_id)
        result = annotator(variations)
        annotated_variations = result[ANNOTATED_VARS]
        field = '/variations/info/{}'.format(annot_id)

        annotator = FieldValueFilter(field_path=field, value=0)
        filtered = annotator(annotated_variations)
        assert filtered[FLT_STATS][N_KEPT] == 133
        assert filtered[FLT_STATS][TOT] == 943
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 810
Пример #10
0
    def test_find(self):

        snps = VariationsArrays()
        chroms = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
        pos = [1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 4, 6]
        snps[CHROM_FIELD] = numpy.array(chroms)
        snps[POS_FIELD] = numpy.array(pos)
Пример #11
0
 def test_filter_by_pos(self):
     variations = VariationsArrays()
     pos = numpy.array([5, 10, 15, 20, 25])
     chrom = numpy.array(['1'] * len(pos))
     variations[POS_FIELD] = pos
     variations[CHROM_FIELD] = chrom
     filtered = VarsSamplingFilter(sample_rate=0.5)(variations)[FLT_VARS]
     assert filtered.num_variations == 2
Пример #12
0
    def test_filter_indels(self):
        variations = VariationsArrays()
        alt = [['A', 'T', 'CG'], ['A', '', ''], ['C', '', ''], ['G', '', '']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[1:])

        variations = VariationsArrays()
        alt = [['A', 'TT', 'T'], ['A', '', ''], ['C', '', ''], ['G', '', '']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[1:])

        variations = VariationsArrays()
        alt = [[b'A', b'TT'], [b'A', b''], [b'C', b''], [b'G', b'']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[1:])

        variations = VariationsArrays()
        alt = [[b'A', b''], [b'A', b''], [b'C', b''], [b'GT', b'T']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[:-1])

        variations = VariationsArrays()
        alt = [['A', 'T'], ['A', ''], ['C', ''], ['G', '']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt)

        variations = VariationsArrays()
        alt = [[b'A'], [b'A'], [b'C'], [b'G']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt)

        variations = VariationsArrays()
        alt = [[b'A'], [b'A'], [b'C'], [b'GT']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[:-1])

        variations = VariationsArrays()
        alt = [['A'], ['A'], ['C'], ['GT']]
        variations[ALT_FIELD] = numpy.array(alt)
        filter_gts = IndelFilter()
        res = filter_gts(variations)
        assert numpy.all(res[FLT_VARS][ALT_FIELD] == alt[:-1])
Пример #13
0
    def test_filter_or(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5]

        filter1 = ObsHetFilter(min_num_genotypes=0)
        filter2 = ObsHetFilter(min_het=0.2, min_num_genotypes=0)

        filtered = OrFilter([filter1, filter2])(variations)

        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0
Пример #14
0
    def test_filter_or(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5]

        filter1 = ObsHetFilter(min_num_genotypes=0)
        filter2 = ObsHetFilter(min_het=0.2, min_num_genotypes=0)

        filtered = OrFilter([filter1, filter2])(variations)

        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0
Пример #15
0
 def test_filter_by_pos2(self):
     variations = VariationsArrays()
     pos = numpy.array([5, 10, 15, 20, 25])
     chrom = numpy.array(['1'] * len(pos))
     variations[POS_FIELD] = pos
     variations[CHROM_FIELD] = chrom
     filtered = VarsSamplingFilter2(num_vars=2)(variations)[FLT_VARS]
     assert filtered.num_variations == 2
     filtered = VarsSamplingFilter2(num_vars=5)(variations)[FLT_VARS]
     assert numpy.all(filtered[POS_FIELD] == pos)
Пример #16
0
    def test_fix_duplicated_alleles(self):
        gt = numpy.array([[[0, 0], [0, 4], [-1, -1], [0, 1], [0, 0], [0, 0],
                           [0, 0], [0, 1], [0, 0], [0, 0], [2, 2], [3, 3]],
                          [[0, 0], [0, 0], [0, 2], [0, 1], [0, 0], [0, 1],
                           [0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0]],
                          [[0, 0], [0, 0], [0, 0], [0, 1], [0, 0], [1, 2],
                           [0, 2], [0, 1], [1, 1], [0, 0], [0, 0], [0, 3]],
                          [[2, 2], [0, 0], [0, 1], [-1, -1], [0, 0], [0, 1],
                           [0, 0], [-1, -1], [0, 0], [0, 0], [0, 0], [0, 0]],
                          [[-1, -1], [-1, -1], [0, 1], [1, 1], [-1, -1],
                           [-1, -1], [0, 1], [1, 1], [0, 2], [-1, -1],
                           [-1, -1], [-1, -1]]])

        alt = numpy.array([[b'T', b'C', b'A', b'C', b''],
                           [b'GTC', b'ATC', b'', b'', b''],
                           [b'C', b'C', b'T', b'C', b''],
                           [b'T', b'A', b'', b'', b''],
                           [b'A', b'A', b'T', b'', b'']])

        ref = numpy.array([b'G', b'G', b'C', b'T', b'C'])

        variations = VariationsArrays()
        variations[GT_FIELD] = gt
        variations[ALT_FIELD] = alt
        variations[REF_FIELD] = ref

        gt_expected = numpy.array([[[0, 0], [0, 2], [-1, -1], [0, 1], [0, 0],
                                    [0, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                                    [2, 2], [3, 3]],
                                   [[0, 0], [0, 0], [0, 2], [0, 1], [0, 0],
                                    [0, 1], [0, 0], [0, 1], [0, 0], [0, 0],
                                    [0, 0], [0, 0]],
                                   [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0],
                                    [0, 0], [0, 0], [0, 0], [0, 0], [0, 0],
                                    [0, 0], [0, 1]],
                                   [[1, 1], [0, 0], [0, 0], [-1, -1], [0, 0],
                                    [0, 0], [0, 0], [-1, -1], [0, 0], [0, 0],
                                    [0, 0], [0, 0]],
                                   [[-1, -1], [-1, -1], [0, 1], [1, 1],
                                    [-1, -1], [-1, -1], [0, 1], [1, 1], [0, 1],
                                    [-1, -1], [-1, -1], [-1, -1]]])

        alt_expected = numpy.array([[b'T', b'C', b'A', b'', b''],
                                    [b'GTC', b'ATC', b'', b'', b''],
                                    [b'T', b'', b'', b'', b''],
                                    [b'A', b'', b'', b'', b''],
                                    [b'A', b'T', b'', b'', b'']])

        fix_duplicated_alleles = DuplicatedAlleleFixer()
        fix_duplicated_alleles(variations)

        assert numpy.all(variations[GT_FIELD] == gt_expected)
        assert numpy.all(variations[ALT_FIELD] == alt_expected)
Пример #17
0
    def test_filter_chi2_gt_sample_sets(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 1], [1, 1], [1, 1], [1, 1]],
                           [[0, 0], [0, 0], [0, 1], [1, 1], [1, 1], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5, 6]
        samples1 = [1, 2, 3]
        samples2 = [4, 5, 6]
        flt = Chi2GtFreqs2SampleSetsFilter(samples1, samples2, min_pval=0.05,
                                           n_bins=2, report_selection=True)
        res = flt(variations)
        assert list(res[COUNTS]) == [2, 2]
        assert numpy.all(res[FLT_VARS][GT_FIELD] == gts[:2, ...])
        assert res[FLT_STATS][N_KEPT] == 2
        assert res[FLT_STATS][TOT] == 4
        assert res[FLT_STATS][N_FILTERED_OUT] == 2
        assert res[SELECTED_VARS].shape

        flt = Chi2GtFreqs2SampleSetsFilter(samples1, samples2, min_pval=0.05,
                                           n_bins=2, return_discarded=True)
        res = flt(variations)
        assert res[DISCARDED_VARS].num_variations
Пример #18
0
    def test_filter_non_variable_and_all_missing(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                           [[0, 1], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts

        expected = numpy.array([[[0, 1], [0, 0], [0, 0], [-1, -1], [-1, -1]]])
        filter_gts = VariableAndNotAllMissing()
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)
        assert filtered[FLT_STATS][N_KEPT] == 1
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3
Пример #19
0
    def test_filter_snps_missing_gts_or_het(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 0]],
                           [[0, 0], [0, 0], [1, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]]])
        filter_gts = NoMissingGTsOrHetFilter()
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)
        assert filtered[FLT_STATS][N_KEPT] == 1
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3
Пример #20
0
    def test_set_gt_to_missing_by_qual(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        gqs = numpy.array([[10, 20, 5, 20, 25], [10, 2, 5, 15, 5]])
        variations[GT_FIELD] = gts
        variations[GQ_FIELD] = gqs
        set_low_qual_gts_to_missing = LowQualGTsToMissingSetter(min_qual=0)
        set_low_qual_gts_to_missing(variations)
        filtered = variations[GT_FIELD]
        assert numpy.all(filtered == gts)

        expected = numpy.array([[[0, 0], [1, 1], [-1, -1], [1, 1], [0, 0]],
                                [[0, 0], [-1, -1], [-1, -1], [0, 0], [-1,
                                                                      -1]]])
        set_low_qual_gts_to_missing = LowQualGTsToMissingSetter(min_qual=10)
        set_low_qual_gts_to_missing(variations)
        assert numpy.all(variations[GT_FIELD] == expected)

        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        gqs = numpy.array([[10, 20, 5, 20, 25], [10, 2, 5, 15, 5]])
        variations[GT_FIELD] = gts
        variations[GQ_FIELD] = gqs
        set_low_qual_gts_to_missing(variations)
        assert numpy.all(variations[GT_FIELD] == expected)

        set_low_qual_gts_to_missing = LowQualGTsToMissingSetter(min_qual=100)
        set_low_qual_gts_to_missing(variations)
        assert numpy.all(variations[GT_FIELD] == -1)

        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        set_low_qual_gts_to_missing = LowQualGTsToMissingSetter(min_qual=0)
        h5_2 = set_low_qual_gts_to_missing(h5_1)
        assert numpy.all(h5_1[GT_FIELD][:] == h5_2[FLT_VARS][GT_FIELD])
Пример #21
0
    def test_filter_quality_snps(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1]], [[0, 1], [1, 1]], [[0, 0], [0,
                                                                         0]],
                           [[0, 0], [0, 0]], [[0, 1], [0, 0]]])
        snp_quals = numpy.array([5, 10, 15, 5, 20])
        variations[GT_FIELD] = gts
        variations['/variations/qual'] = snp_quals

        filtered = SNPQualFilter(report_selection=True)(variations)
        filtered_qual = filtered[FLT_VARS]['/variations/qual']
        filtered_gts = filtered[FLT_VARS][GT_FIELD]
        assert numpy.all(variations['/variations/qual'] == filtered_qual)
        assert numpy.all(variations[GT_FIELD] == filtered_gts)
        assert filtered[SELECTED_VARS].shape

        expected_gts = numpy.array([[[0, 0], [0, 0]], [[0, 1], [0, 0]]])
        exp_snp_quals = numpy.array([15, 20])
        filtered = SNPQualFilter(min_qual=15)(variations)
        assert numpy.all(
            filtered[FLT_VARS]['/variations/qual'] == exp_snp_quals)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected_gts)
        assert filtered[FLT_STATS][N_KEPT] == 2
        assert filtered[FLT_STATS][TOT] == 5
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = ['/variations/qual']
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)
        flt_chunk = SNPQualFilter(min_qual=530)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 126

        flt_chunk = SNPQualFilter()(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == SNPS_PER_CHUNK

        flt_chunk = SNPQualFilter(max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 92

        flt_chunk = SNPQualFilter(min_qual=530, max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 18

        flt_chunk = SNPQualFilter(min_qual=math.inf)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 20

        flt_chunk = SNPQualFilter(max_qual=-1)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 0
Пример #22
0
    def test_index(self):
        snps = VariationsArrays()
        chroms = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
        pos = [1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 4, 6]
        snps[CHROM_FIELD] = numpy.array(chroms)
        snps[POS_FIELD] = numpy.array(pos)

        index = PosIndex(snps)
        assert index.index_pos(1, 1) == 0
        assert index.index_pos(2, 1) == 3
        assert index.index_pos(3, 1) == 6
        assert index.index_pos(4, 1) == 9
        assert index.index_pos(4, 2) == 9
        assert index.index_pos(4, 3) == 10
        assert index.index_pos(4, 4) == 10
        assert index.get_chrom_range_index(1) == (0, 2)
        assert index.get_chrom_range_pos(1) == (1, 3)
        assert index.covered_length == 10
Пример #23
0
    def test_filter_called_gt(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]])
        filter_gts = MinCalledGTsFilter(min_called=5, rates=False)
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)
        assert filtered[FLT_STATS][N_KEPT] == 1
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                                [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]]])
        filter_gts = MinCalledGTsFilter(min_called=2, rates=False)
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        expected = numpy.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                                [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]]])
        filter_gts = MinCalledGTsFilter(min_called=0.4, rates=True)
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected)

        filter_gts = MinCalledGTsFilter(min_called=0, rates=True)
        filtered = filter_gts(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == variations[GT_FIELD])

        # With hdf5 file
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        filter_gts = MinCalledGTsFilter(min_called=0.4,
                                        rates=True,
                                        report_selection=True)
        filtered = filter_gts(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 89936, 1: 50473, -1: 40972, 2: 378, 3: 5}
        assert filtered[SELECTED_VARS].shape
Пример #24
0
    def test_filter_samples_by_missing(self):
        variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunk = first(variations.iterate_chunks())

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.9,
                                       out_vars=new_var)
        assert len(new_var.samples) == 0

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.1,
                                       out_vars=new_var)
        assert len(new_var.samples) == len(chunk.samples)

        # for some samples
        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.1,
                                       out_vars=new_var,
                                       samples=['1_18_4_gbs', '1_19_4_gbs'])
        assert new_var.samples == ['1_18_4_gbs', '1_19_4_gbs']

        # for het
        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk, max_het=0.001, out_vars=new_var)
        assert new_var.samples == [
            '1_35_2_gbs', '4_136B_2_gbs', '4_5_5_gbs', '5_66B_3_gbs'
        ]

        # check that it works by chunk
        new_var = VariationsArrays()
        res = filter_samples_by_missing_rate(variations,
                                             min_called_rate=0.2,
                                             out_vars=new_var,
                                             do_histogram=True)
        new_var2 = VariationsArrays()
        res2 = filter_samples_by_missing_rate(variations,
                                              min_called_rate=0.2,
                                              out_vars=new_var2,
                                              chunk_size=None,
                                              do_histogram=True)

        assert res2['missing_rates'].shape[0] == len(variations.samples)
        assert res2['selected_samples'].shape[0] == len(variations.samples)
        assert new_var.samples == new_var2.samples
        assert numpy.all(new_var[GT_FIELD] == new_var2[GT_FIELD])
        assert numpy.allclose(res[EDGES], res2[EDGES])
        assert numpy.all(res[COUNTS][:] == res2[COUNTS][:])
Пример #25
0
    def test_bisect(self):

        snps = VariationsArrays()
        chroms = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
        pos = [1, 3, 5, 1, 2, 3, 1, 2, 3, 2, 4, 6]
        snps[CHROM_FIELD] = numpy.array(chroms)
        snps[POS_FIELD] = numpy.array(pos)

        # bisect right
        assert var_bisect_right(snps, 1, 0) == 0
        assert var_bisect_right(snps, 1, 1) == 1
        assert var_bisect_right(snps, 1, 3) == 2
        assert var_bisect_right(snps, 1, 5) == 3
        assert var_bisect_right(snps, 2, 1) == 4
        assert var_bisect_right(snps, 4, 7) == 12

        assert var_bisect_left(snps, 1, 0) == 0
        assert var_bisect_left(snps, 1, 1) == 0
        assert var_bisect_left(snps, 1, 3) == 1
        assert var_bisect_left(snps, 1, 5) == 2
        assert var_bisect_left(snps, 2, 1) == 3
        assert var_bisect_left(snps, 4, 7) == 12

        # find_re
        try:
            find_le(snps, 1, 0)
            self.fail('Value Error expected')
        except ValueError:
            pass

        assert find_le(snps, 1, 1) == 0
        assert find_le(snps, 1, 2) == 0
        assert find_le(snps, 1, 3) == 1
        assert find_le(snps, 1, 7) == 2
        assert find_le(snps, 2, 1) == 3
        assert find_le(snps, 2, 2) == 4
        assert find_le(snps, 4, 7) == 11

        assert find_ge(snps, 1, 0) == 0
        assert find_ge(snps, 1, 1) == 0
        assert find_ge(snps, 1, 2) == 1
        assert find_ge(snps, 1, 7) == 3
        assert find_ge(snps, 2, 1) == 3
        assert find_ge(snps, 2, 2) == 4
        try:
            find_ge(snps, 4, 7)
            self.fail('Value Error expected')
        except ValueError:
            pass

        assert index(snps, 1, 1) == 0
        try:
            index(snps, 0, 7)
            self.fail('Index Error expected')
        except IndexError:
            pass
        try:
            index(snps, 4, 3)
            self.fail('Index Error expected')
        except IndexError:
            pass
        assert index(snps, 4, 2) == 9
Пример #26
0
    def test_filter_obs_het(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5]

        filtered = ObsHetFilter(min_num_genotypes=0,
                                report_selection=True)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0
        assert filtered[SELECTED_VARS].shape

        filtered = ObsHetFilter(min_het=0.2, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
        assert filtered[FLT_STATS][N_KEPT] == 3
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 1

        filtered = ObsHetFilter(min_het=0.2, min_num_genotypes=10)(variations)
        assert filtered[FLT_STATS][N_KEPT] == 0
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 4

        filtered = ObsHetFilter(min_het=0.2, min_num_genotypes=10,
                                keep_missing=True)(variations)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0

        return
        filtered = ObsHetFilter(max_het=0.1, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[1]])

        filtered = ObsHetFilter(min_het=0.2, max_het=0.3,
                                min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        filtered = ObsHetFilter(min_het=0.6, max_het=0.9)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {}

        filtered = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 978, -1: 910, 1: 774, 2: 92}

        filtered = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5,
                                n_bins=3, range_=(0, 1))(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 978, -1: 910, 1: 774, 2: 92}
        assert numpy.all(filtered[COUNTS] == [391, 14, 10])
        assert numpy.all(filtered[EDGES] == [0, 1 / 3, 2 / 3, 1])

        samples = hdf5.samples[:50]
        filtered = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5,
                                n_bins=3, range_=(0, 1), samples=samples)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert sum(filtered[COUNTS]) == sum([339, 14, 6])
Пример #27
0
    def test_filter_high_density(self):
        in_vars = VariationsArrays()
        out_vars = VariationsArrays()
        res = filter_variation_density(in_vars,
                                       out_vars=out_vars,
                                       max_density=1,
                                       window=1)
        assert not out_vars.keys()
        assert not res

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis,
                                          out_vars=out_vars,
                                          max_density=2,
                                          window=3,
                                          n_bins=2)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis,
                                          out_vars=out_vars,
                                          max_density=2,
                                          window=3,
                                          n_bins=2,
                                          chunk_size=1)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2

        varis = VariationsArrays()
        out_vars = VariationsArrays()
        varis[CHROM_FIELD] = numpy.array([b'chr1'] * 6)
        varis[POS_FIELD] = numpy.array([1, 2, 3, 4, 10, 11])
        result = filter_variation_density(varis,
                                          out_vars=out_vars,
                                          max_density=2,
                                          window=3,
                                          n_bins=2,
                                          chunk_size=None)
        assert list(out_vars[POS_FIELD]) == [1, 4, 10, 11]
        assert list(result[COUNTS]) == [4, 2]
        assert list(result[EDGES]) == [2., 2.5, 3.]
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][TOT] == 6
        assert result[FLT_STATS][N_FILTERED_OUT] == 2
Пример #28
0
    def test_filter_obs_het(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                           [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                           [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations[GT_FIELD] = gts
        variations.samples = [1, 2, 3, 4, 5]

        filtered = ObsHetFilter(min_num_genotypes=0,
                                report_selection=True)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0
        assert filtered[SELECTED_VARS].shape

        filtered = ObsHetFilter(min_het=0.2, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
        assert filtered[FLT_STATS][N_KEPT] == 3
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 1

        filtered = ObsHetFilter(min_het=0.2, min_num_genotypes=10)(variations)
        assert filtered[FLT_STATS][N_KEPT] == 0
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 4

        filtered = ObsHetFilter(min_het=0.2,
                                min_num_genotypes=10,
                                keep_missing=True)(variations)
        assert filtered[FLT_STATS][N_KEPT] == 4
        assert filtered[FLT_STATS][TOT] == 4
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 0

        return
        filtered = ObsHetFilter(max_het=0.1, min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[1]])

        filtered = ObsHetFilter(min_het=0.2, max_het=0.3,
                                min_num_genotypes=0)(variations)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        filtered = ObsHetFilter(min_het=0.6, max_het=0.9)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {}

        filtered = ObsHetFilter(min_het=0.6, max_het=0.9, min_call_dp=5)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 978, -1: 910, 1: 774, 2: 92}

        filtered = ObsHetFilter(min_het=0.6,
                                max_het=0.9,
                                min_call_dp=5,
                                n_bins=3,
                                range_=(0, 1))(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert counts == {0: 978, -1: 910, 1: 774, 2: 92}
        assert numpy.all(filtered[COUNTS] == [391, 14, 10])
        assert numpy.all(filtered[EDGES] == [0, 1 / 3, 2 / 3, 1])

        samples = hdf5.samples[:50]
        filtered = ObsHetFilter(min_het=0.6,
                                max_het=0.9,
                                min_call_dp=5,
                                n_bins=3,
                                range_=(0, 1),
                                samples=samples)(hdf5)
        counts = Counter(filtered[FLT_VARS][GT_FIELD].flat)
        assert sum(filtered[COUNTS]) == sum([339, 14, 6])