Exemplo n.º 1
0
    def test_count_value_per_row(self):
        """Check the per-row counters built by ``row_value_counter_fact``.

        The factory is exercised with a small in-memory matrix and with the
        first '/calls/GT' chunk of the '1000snps.hdf5' test file, both as
        absolute counts and, with ``ratio=True``, as ratios.
        """
        mat = numpy.array([[0, 0], [1, -1], [2, -1], [-1, -1]])
        missing_counter = row_value_counter_fact(value=-1)
        assert numpy.all(missing_counter(mat) == [0, 1, 1, 2])

        missing_ratio = row_value_counter_fact(value=-1, ratio=True)
        assert numpy.allclose(missing_ratio(mat), [0., 0.5, 0.5, 1.])

        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        # Release the file handle even when one of the assertions below
        # fails; otherwise the open file leaks into the following tests.
        try:
            chunks = list(hdf5.iterate_chunks())
            gt_chunk = first(select_dset_from_chunks(chunks, '/calls/GT'))

            homo_counter = row_value_counter_fact(value=2)
            assert numpy.all(homo_counter(gt_chunk) == [0, 0, 4, 0, 1])

            homo_ratio = row_value_counter_fact(value=2, ratio=True)
            expected = [0., 0, 0.66666, 0., 0.166666]
            assert numpy.allclose(homo_ratio(gt_chunk), expected)
        finally:
            hdf5.close()
Exemplo n.º 2
0
    def test_count_value_per_row(self):
        """Check the per-row counters built by ``row_value_counter_fact``.

        The factory is exercised with a small in-memory matrix and with the
        first '/calls/GT' chunk of the '1000snps.hdf5' test file, both as
        absolute counts and, with ``ratio=True``, as ratios.
        """
        mat = numpy.array([[0, 0], [1, -1], [2, -1], [-1, -1]])
        missing_counter = row_value_counter_fact(value=-1)
        assert numpy.all(missing_counter(mat) == [0, 1, 1, 2])

        missing_ratio = row_value_counter_fact(value=-1, ratio=True)
        assert numpy.allclose(missing_ratio(mat), [0., 0.5, 0.5, 1.])

        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        # Release the file handle even when one of the assertions below
        # fails; otherwise the open file leaks into the following tests.
        try:
            chunks = list(hdf5.iterate_chunks())
            gt_chunk = first(select_dset_from_chunks(chunks, '/calls/GT'))

            homo_counter = row_value_counter_fact(value=2)
            assert numpy.all(homo_counter(gt_chunk) == [0, 0, 4, 0, 1])

            homo_ratio = row_value_counter_fact(value=2, ratio=True)
            expected = [0., 0, 0.66666, 0., 0.166666]
            assert numpy.allclose(homo_ratio(gt_chunk), expected)
        finally:
            hdf5.close()
Exemplo n.º 3
0
def _count_gts(variations):
    """Count, for every row, how many times each genotype is found.

    The genotype matrix is read from ``variations[GT_FIELD]`` and is
    indexed with three axes. The alleles found along the third axis are
    packed into a single integer per call (two decimal digits per allele)
    so that whole genotypes can be compared and counted as scalars.

    Returns a dict whose keys are the tuples built by
    ``_packed_gt_to_tuple`` and whose values are the per-row counts given
    by ``row_value_counter_fact``. Genotypes with any allele equal to
    ``MISSING_INT`` are not counted.
    """
    if variations[GT_FIELD].shape[0] == 0:
        # NOTE(review): this path returns a tuple of three empty arrays
        # whereas the regular path returns a dict. It is kept untouched
        # because callers may rely on it -- confirm which one is intended.
        return numpy.array([]), numpy.array([]), numpy.array([])

    gts = variations[GT_FIELD][...]
    ploidy = gts.shape[2]

    # A genotype is considered missing as soon as one allele is missing.
    miss_gts = numpy.any(gts == MISSING_INT, axis=2)

    # Pack the alleles of each call into one integer using two decimal
    # digits per allele: (1, 1) -> 101, (1, 0) -> 1, (0, 1) -> 100.
    # int64 is used as accumulator because the weights grow as 100**idx
    # and would silently wrap around in int16 for ploidies above two.
    packed_gts = numpy.zeros(gts.shape[:2], dtype=numpy.int64)
    for idx in range(ploidy):
        packed_gts += gts[:, :, idx].astype(numpy.int64) * (100 ** idx)
    packed_gts[miss_gts] = MISSING_INT

    counts = {}
    for gt in numpy.unique(packed_gts):
        # Skip the missing genotypes before doing any counting work.
        if gt == MISSING_INT:
            continue
        gt_counts = row_value_counter_fact(gt)(packed_gts)
        unpacked_gt = _packed_gt_to_tuple(gt, ploidy)
        # Different packed values can end up under the same tuple key, in
        # which case their counts are added together.
        if unpacked_gt in counts:
            counts[unpacked_gt] += gt_counts
        else:
            counts[unpacked_gt] = gt_counts
    return counts
Exemplo n.º 4
0
def _count_gts(variations):
    """Count, for every row, how many times each genotype is found.

    The genotype matrix is read from ``variations[GT_FIELD]`` and is
    indexed with three axes. The alleles found along the third axis are
    packed into a single integer per call (two decimal digits per allele)
    so that whole genotypes can be compared and counted as scalars.

    Returns a dict whose keys are the tuples built by
    ``_packed_gt_to_tuple`` and whose values are the per-row counts given
    by ``row_value_counter_fact``. Genotypes with any allele equal to
    ``MISSING_INT`` are not counted.
    """
    if variations[GT_FIELD].shape[0] == 0:
        # NOTE(review): this path returns a tuple of three empty arrays
        # whereas the regular path returns a dict. It is kept untouched
        # because callers may rely on it -- confirm which one is intended.
        return numpy.array([]), numpy.array([]), numpy.array([])

    gts = variations[GT_FIELD][...]
    ploidy = gts.shape[2]

    # A genotype is considered missing as soon as one allele is missing.
    miss_gts = numpy.any(gts == MISSING_INT, axis=2)

    # Pack the alleles of each call into one integer using two decimal
    # digits per allele: (1, 1) -> 101, (1, 0) -> 1, (0, 1) -> 100.
    # int64 is used as accumulator because the weights grow as 100**idx
    # and would silently wrap around in int16 for ploidies above two.
    packed_gts = numpy.zeros(gts.shape[:2], dtype=numpy.int64)
    for idx in range(ploidy):
        packed_gts += gts[:, :, idx].astype(numpy.int64) * (100 ** idx)
    packed_gts[miss_gts] = MISSING_INT

    counts = {}
    for gt in numpy.unique(packed_gts):
        # Skip the missing genotypes before doing any counting work.
        if gt == MISSING_INT:
            continue
        gt_counts = row_value_counter_fact(gt)(packed_gts)
        unpacked_gt = _packed_gt_to_tuple(gt, ploidy)
        # Different packed values can end up under the same tuple key, in
        # which case their counts are added together.
        if unpacked_gt in counts:
            counts[unpacked_gt] += gt_counts
        else:
            counts[unpacked_gt] = gt_counts
    return counts