예제 #1
0
    def test_calc_missing_gt_rates(self):
        """Check called/missing genotype counts and rates.

        Covers an empty genotype array, the ``ril.hdf5`` test file (both
        HDF5-backed and loaded into memory) and a small hand-written array.
        """
        # An empty genotype array gives an empty result, as counts or rates.
        empty_vars = {'/calls/GT': numpy.array([])}
        for as_rates in (False, True):
            result = calc_called_gt(empty_vars, rates=as_rates)
            assert result.shape[0] == 0

        # The in-memory copy and the HDF5 file must give the same rates.
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks(kept_fields=['/calls/GT']))
        mem_rates = calc_missing_gt(mem_vars)
        h5_rates = calc_missing_gt(h5_vars)
        assert mem_rates.shape == (943,)
        assert numpy.allclose(mem_rates, h5_rates)
        assert numpy.min(mem_rates) == 0
        assert numpy.all(mem_rates <= 1)

        # Toy array of shape (4, 2, 2); the expected values below treat the
        # -1 entries as missing calls.
        toy_gts = numpy.array([
            [[0, 0], [0, 0]],
            [[0, 0], [-1, -1]],
            [[0, 0], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ])
        toy_vars = {'/calls/GT': toy_gts}

        # Absolute counts: called and missing must add up to 2 per row.
        called_counts = numpy.array([2, 1, 1, 0])
        observed_called = calc_called_gt(toy_vars, rates=False)
        assert numpy.all(observed_called == called_counts)
        observed_missing = calc_missing_gt(toy_vars, rates=False)
        assert numpy.all(observed_missing == 2 - called_counts)

        # Rates (the default): called and missing must add up to 1 per row.
        missing_rates = numpy.array([0, 0.5, 0.5, 1])
        observed_called_rates = calc_called_gt(toy_vars)
        assert numpy.allclose(observed_called_rates, 1 - missing_rates)
        observed_missing_rates = calc_missing_gt(toy_vars)
        assert numpy.allclose(observed_missing_rates, missing_rates)
    def test_calc_missing_gt_rates(self):
        """Exercise calc_called_gt and calc_missing_gt on three inputs.

        The inputs are an empty array, the ``ril.hdf5`` fixture and a tiny
        literal genotype array with known called/missing counts.
        """
        # Empty input: the result has no rows whether or not rates are asked.
        no_gts = {'/calls/GT': numpy.array([])}
        for want_rates in (False, True):
            empty_result = calc_called_gt(no_gts, rates=want_rates)
            assert empty_result.shape[0] == 0

        # Fixture file: compare the HDF5-backed object with an in-memory copy.
        on_disk = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(on_disk.iterate_chunks(kept_fields=['/calls/GT']))
        rates_in_memory = calc_missing_gt(in_memory)
        rates_on_disk = calc_missing_gt(on_disk)
        assert rates_in_memory.shape == (943,)
        assert numpy.allclose(rates_in_memory, rates_on_disk)
        assert numpy.min(rates_in_memory) == 0
        assert numpy.all(rates_in_memory <= 1)

        # Literal (4, 2, 2) array; rows hold 2, 1, 1 and 0 calls without -1.
        literal_gts = numpy.array([
            [[0, 0], [0, 0]],
            [[0, 0], [-1, -1]],
            [[0, 0], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ])
        literal_vars = {'/calls/GT': literal_gts}

        expected_called = numpy.array([2, 1, 1, 0])
        got_called = calc_called_gt(literal_vars, rates=False)
        assert numpy.all(got_called == expected_called)
        got_missing = calc_missing_gt(literal_vars, rates=False)
        assert numpy.all(got_missing == 2 - expected_called)

        expected_missing_rate = numpy.array([0, 0.5, 0.5, 1])
        got_called_rate = calc_called_gt(literal_vars)
        assert numpy.allclose(got_called_rate, 1 - expected_missing_rate)
        got_missing_rate = calc_missing_gt(literal_vars)
        assert numpy.allclose(got_missing_rate, expected_missing_rate)
예제 #3
0
def _calc_sample_missing_rates(variations, chunk_size, min_called_rate,
                               max_het):
    """Accumulate genotype statistics over the chunks of ``variations``.

    ``min_called_rate`` and ``max_het`` are only compared against ``None``
    here: they select which statistics get computed, their values are not
    used by this function.

    Args:
        variations: object exposing ``num_variations`` and, when
            ``chunk_size`` is not ``None``, ``iterate_chunks``.
        chunk_size: forwarded to ``variations.iterate_chunks``; ``None``
            means ``variations`` is processed as one single chunk.
        min_called_rate: when not ``None`` the result includes the
            ``'missing_rates'`` entry.
        max_het: when not ``None`` the result includes the ``'obs_hets'``
            entry.

    Returns:
        A dict with zero, one or two entries:

        * ``'missing_rates'``: the ``calc_called_gt(..., rates=False,
          axis=0)`` counts summed over the chunks and divided by
          ``variations.num_variations``.
          NOTE(review): the counts come from ``calc_called_gt``, so despite
          the key name these look like called rates -- confirm with the
          callers before renaming anything.
        * ``'obs_hets'``: the ``call_is_het`` hits summed along axis 0 over
          the chunks and divided by ``variations.num_variations``.
    """
    want_called = min_called_rate is not None
    want_hets = max_het is not None

    # Nothing was requested: do not walk (and possibly read) the chunks.
    if not (want_called or want_hets):
        return {}

    if chunk_size is None:
        chunks = [variations]
    else:
        chunks = variations.iterate_chunks(kept_fields=[GT_FIELD],
                                           chunk_size=chunk_size)

    called_counts = None
    het_counts = None
    for chunk in chunks:
        # Each statistic is only computed when it was asked for.
        if want_called:
            chunk_called = calc_called_gt(chunk, rates=False, axis=0)
            if called_counts is None:
                called_counts = chunk_called
            else:
                called_counts += chunk_called

        if want_hets:
            is_het = call_is_het(chunk[GT_FIELD])
            chunk_het_counts = numpy.sum(is_het, axis=0)
            if het_counts is None:
                het_counts = chunk_het_counts
            else:
                het_counts += chunk_het_counts

    res = {}
    if want_called:
        res['missing_rates'] = called_counts / variations.num_variations
    if want_hets:
        res['obs_hets'] = het_counts / variations.num_variations

    return res
예제 #4
0
def calc_number_of_alleles_and_number_called_gts(
        variations,
        populations=None,
        pop_sample_filters=None,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Compute, per population, the allele number and the called genotypes.

    The sample filters come from ``_prepare_pop_sample_filters``; each one
    is applied to ``variations`` and the statistics are computed on the
    filtered variations found under ``FLT_VARS``.

    Args:
        variations: variations object handed to every population filter.
        populations: forwarded to ``_prepare_pop_sample_filters``.
        pop_sample_filters: forwarded to ``_prepare_pop_sample_filters``.
        min_num_genotypes: forwarded to ``_mask_stats_with_few_samples``,
            which is called with ``masking_value=0``.

    Returns:
        A tuple of two dicts keyed by population id: the masked number of
        alleles with a non-zero count, and the result of
        ``calc_called_gt(..., rates=False)``.
    """
    filters_by_pop = _prepare_pop_sample_filters(populations,
                                                 pop_sample_filters)

    alleles_by_pop = {}
    called_by_pop = {}
    for pop_id, sample_filter in filters_by_pop.items():
        pop_vars = sample_filter(variations)[FLT_VARS]

        # An allele is counted when its count is different from zero.
        allele_counts = _count_alleles_per_allele_and_snp(pop_vars)
        allele_is_present = allele_counts != 0
        num_alleles = numpy.sum(allele_is_present, axis=1)

        alleles_by_pop[pop_id] = _mask_stats_with_few_samples(
            num_alleles, pop_vars, min_num_genotypes, masking_value=0)
        called_by_pop[pop_id] = calc_called_gt(pop_vars, rates=False)

    return alleles_by_pop, called_by_pop
예제 #5
0
def _calc_sample_missing_rates(variations, chunk_size):
    """Sum ``calc_called_gt`` counts over chunks and divide by the total.

    Args:
        variations: object exposing ``num_variations`` and, when
            ``chunk_size`` is not ``None``, ``iterate_chunks``.
        chunk_size: forwarded to ``variations.iterate_chunks``; ``None``
            means ``variations`` is processed as one single chunk.

    Returns:
        The accumulated ``calc_called_gt(..., rates=False, axis=0)`` counts
        divided by ``variations.num_variations``.
        NOTE(review): the counts come from ``calc_called_gt``, so despite
        the function name these look like called rates -- confirm with the
        callers.
    """
    if chunk_size is not None:
        chunks = variations.iterate_chunks(kept_fields=[GT_FIELD],
                                           chunk_size=chunk_size)
    else:
        chunks = [variations]

    accumulated = None
    for chunk in chunks:
        chunk_counts = calc_called_gt(chunk, rates=False, axis=0)
        if accumulated is not None:
            # In-place add: the first chunk's array is the accumulator.
            accumulated += chunk_counts
            continue
        accumulated = chunk_counts

    return accumulated / variations.num_variations
예제 #6
0
 def _calc_stat(self, variations):
     """Return calc_called_gt(variations), forwarding self.rates as rates."""
     as_rates = self.rates
     return calc_called_gt(variations, rates=as_rates)
예제 #7
0
 def _calc_stat(self, variations):
     """Delegate to calc_called_gt, passing this object's rates flag."""
     use_rates = self.rates
     return calc_called_gt(variations, rates=use_rates)
예제 #8
0
def calc_number_of_private_alleles(
        variations,
        populations=None,
        pop_sample_filters=None,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Count, per population, the alleles found in no other population.

    An allele is counted for a population when its count in that population
    is above zero and its summed count over all the other populations is
    zero. The counts come from ``_count_alleles_per_allele_and_snp`` and the
    per-population result is the sum of those hits along axis 1.

    Args:
        variations: variations object; its ``GT_FIELD`` entry provides the
            set of alleles and it is handed to every population filter.
        populations: forwarded to ``_prepare_pop_sample_filters``.
        pop_sample_filters: forwarded to ``_prepare_pop_sample_filters``.
        min_num_genotypes: when truthy, positions where either this
            population or the rest of the populations have fewer called
            genotypes than this value are set to 0 in the result.

    Returns:
        A dict mapping each population id to its array of private allele
        counts.

    Raises:
        ValueError: if fewer than two populations are available.
    """
    pop_sample_filters = _prepare_pop_sample_filters(populations,
                                                     pop_sample_filters)

    if len(pop_sample_filters) < 2:
        raise ValueError('At least two populations are required')

    # numpy.unique already returns its result sorted, so no extra sort is
    # needed before looking for the missing marker in the first position.
    # The size check keeps an empty genotype array from raising IndexError.
    different_alleles = numpy.unique(variations[GT_FIELD])
    if different_alleles.size and different_alleles[0] == MISSING_INT:
        different_alleles = different_alleles[1:]

    # The overall called count is only needed to build the masks.
    tot_num_called_gts = None
    if min_num_genotypes:
        tot_num_called_gts = calc_called_gt(variations, rates=False)

    allele_counts_per_pop_per_allele_and_snp = {}
    not_enough_genotypes_masks = {}
    for pop_id, pop_sample_filter in pop_sample_filters.items():
        vars_for_pop = pop_sample_filter(variations)[FLT_VARS]
        allele_counts = _count_alleles_per_allele_and_snp(
            vars_for_pop, different_alleles)
        if min_num_genotypes:
            num_called_gts_in_pop = calc_called_gt(vars_for_pop, rates=False)
            num_called_gts_in_other_pops = (tot_num_called_gts -
                                            num_called_gts_in_pop)
            mask = numpy.logical_or(
                num_called_gts_in_pop < min_num_genotypes,
                num_called_gts_in_other_pops < min_num_genotypes)
            not_enough_genotypes_masks[pop_id] = mask
        allele_counts_per_pop_per_allele_and_snp[pop_id] = allele_counts

    private_alleles = {}
    for pop_id in pop_sample_filters:
        # Add up the allele counts of every population but this one.
        other_pops_allele_counts = None
        for other_pop in pop_sample_filters:
            if other_pop == pop_id:
                continue
            pop_counts = allele_counts_per_pop_per_allele_and_snp[other_pop]
            if other_pops_allele_counts is None:
                other_pops_allele_counts = pop_counts
            else:
                # numpy.add builds a new array, the stored counts are kept.
                other_pops_allele_counts = numpy.add(other_pops_allele_counts,
                                                     pop_counts)
        this_pop_allele_counts = allele_counts_per_pop_per_allele_and_snp[
            pop_id]

        alleles_present_in_this_pop = this_pop_allele_counts > 0
        alleles_not_present_in_other_pops = other_pops_allele_counts == 0
        alleles_present_in_this_pop_not_in_others = numpy.logical_and(
            alleles_present_in_this_pop, alleles_not_present_in_other_pops)

        private_alleles_for_pop = numpy.sum(
            alleles_present_in_this_pop_not_in_others, axis=1)

        if min_num_genotypes:
            mask = not_enough_genotypes_masks[pop_id]
            private_alleles_for_pop[mask] = 0

        private_alleles[pop_id] = private_alleles_for_pop
    return private_alleles