def test_calc_missing_gt_rates(self):
    """Exercise calc_called_gt / calc_missing_gt on empty, file and toy data.

    Three scenarios are covered: an empty genotype array, the ``ril.hdf5``
    test file (in-memory copy versus the HDF5 object itself) and a small
    hand-written genotype array with known called/missing counts.
    """
    # An empty genotype array must yield an empty result for both the
    # count (rates=False) and the rate (rates=True) flavours.
    empty_varis = {'/calls/GT': numpy.array([])}
    for as_rates in (False, True):
        called = calc_called_gt(empty_varis, rates=as_rates)
        assert called.shape[0] == 0

    # The in-memory copy and the HDF5-backed object must agree.
    h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    in_mem_vars = VariationsArrays()
    in_mem_vars.put_chunks(h5_vars.iterate_chunks(kept_fields=['/calls/GT']))
    mem_rates = calc_missing_gt(in_mem_vars)
    h5_rates = calc_missing_gt(h5_vars)
    assert mem_rates.shape == (943,)
    assert numpy.allclose(mem_rates, h5_rates)
    assert numpy.min(mem_rates) == 0
    assert numpy.all(mem_rates <= 1)

    # Toy data: four rows of two genotypes each, -1 marking a missing call.
    toy_gts = numpy.array([[[0, 0], [0, 0]],
                           [[0, 0], [-1, -1]],
                           [[0, 0], [-1, -1]],
                           [[-1, -1], [-1, -1]]])
    toy_varis = {'/calls/GT': toy_gts}

    called_counts = numpy.array([2, 1, 1, 0])
    assert numpy.all(calc_called_gt(toy_varis, rates=False) == called_counts)
    assert numpy.all(
        calc_missing_gt(toy_varis, rates=False) == 2 - called_counts)

    missing_rates = numpy.array([0, 0.5, 0.5, 1])
    assert numpy.allclose(calc_called_gt(toy_varis), 1 - missing_rates)
    assert numpy.allclose(calc_missing_gt(toy_varis), missing_rates)
def _calc_sample_missing_rates(variations, chunk_size, min_called_rate,
                               max_het):
    """Accumulate per-sample genotype statistics over all the variations.

    The variations are processed whole (``chunk_size is None``) or chunk by
    chunk, and the per-chunk results are summed along axis 0.

    Args:
        variations: object exposing ``num_variations`` and, when chunked,
            ``iterate_chunks``; each chunk is indexable by ``GT_FIELD``.
        chunk_size: number of variations per chunk, or None to process
            ``variations`` in one go.
        min_called_rate: only tested against None here; when not None the
            ``'missing_rates'`` entry is computed.
        max_het: only tested against None here; when not None the
            ``'obs_hets'`` entry is computed.

    Returns:
        dict with up to two entries:
          * ``'missing_rates'``: summed ``calc_called_gt(..., rates=False)``
            counts divided by ``variations.num_variations``.
            NOTE(review): despite the key name the value is built from the
            *called* genotype counts, not the missing ones. The key is kept
            for backward compatibility; confirm how callers interpret it.
          * ``'obs_hets'``: summed heterozygous call counts divided by
            ``variations.num_variations``.
    """
    if chunk_size is None:
        chunks = [variations]
    else:
        chunks = variations.iterate_chunks(kept_fields=[GT_FIELD],
                                           chunk_size=chunk_size)

    need_called = min_called_rate is not None
    need_het = max_het is not None

    called_counts = None
    het_counts = None
    for chunk in chunks:
        # Only compute the called counts when they are going to be used;
        # previously they were calculated for every chunk and discarded
        # when min_called_rate was None.
        if need_called:
            chunk_called = calc_called_gt(chunk, rates=False, axis=0)
            if called_counts is None:
                called_counts = chunk_called
            else:
                called_counts += chunk_called

        if need_het:
            is_het = call_is_het(chunk[GT_FIELD])
            chunk_het_counts = numpy.sum(is_het, axis=0)
            if het_counts is None:
                het_counts = chunk_het_counts
            else:
                het_counts += chunk_het_counts

    res = {}
    if need_called:
        res['missing_rates'] = called_counts / variations.num_variations
    if need_het:
        res['obs_hets'] = het_counts / variations.num_variations
    return res
def calc_number_of_alleles_and_number_called_gts(
        variations, populations=None, pop_sample_filters=None,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Count, per population, the alleles seen and the called genotypes.

    Args:
        variations: the variations to analyse.
        populations: forwarded to ``_prepare_pop_sample_filters``.
        pop_sample_filters: forwarded to ``_prepare_pop_sample_filters``;
            the prepared mapping goes from population id to a callable
            whose result is indexed with ``FLT_VARS``.
        min_num_genotypes: threshold handed to
            ``_mask_stats_with_few_samples`` (with ``masking_value=0``).

    Returns:
        A ``(num_alleles_per_snp, num_called_gt)`` tuple of dicts, both
        keyed by population id.
    """
    pop_sample_filters = _prepare_pop_sample_filters(populations,
                                                     pop_sample_filters)

    alleles_by_pop = {}
    called_by_pop = {}
    for pop_id, sample_filter in pop_sample_filters.items():
        pop_vars = sample_filter(variations)[FLT_VARS]

        # An allele counts as present when its count is non-zero.
        allele_counts = _count_alleles_per_allele_and_snp(pop_vars)
        allele_is_present = allele_counts != 0
        n_alleles = numpy.sum(allele_is_present, axis=1)

        alleles_by_pop[pop_id] = _mask_stats_with_few_samples(
            n_alleles, pop_vars, min_num_genotypes, masking_value=0)
        called_by_pop[pop_id] = calc_called_gt(pop_vars, rates=False)

    return alleles_by_pop, called_by_pop
def _calc_sample_missing_rates(variations, chunk_size):
    """Return summed ``calc_called_gt`` counts over ``num_variations``.

    The counts are obtained with ``rates=False, axis=0`` for the whole
    ``variations`` object (``chunk_size is None``) or for each chunk given
    by ``iterate_chunks``, then added together and divided by
    ``variations.num_variations``.

    NOTE(review): the function name says "missing", but the value is built
    from the called genotype counts; confirm the intended meaning.
    """
    chunks = ([variations] if chunk_size is None else
              variations.iterate_chunks(kept_fields=[GT_FIELD],
                                        chunk_size=chunk_size))

    total = None
    for chunk in chunks:
        counts = calc_called_gt(chunk, rates=False, axis=0)
        if total is None:
            total = counts
            continue
        # In-place accumulation on the array created for the first chunk.
        total += counts

    return total / variations.num_variations
def _calc_stat(self, variations):
    """Return ``calc_called_gt`` for ``variations`` using ``self.rates``."""
    as_rates = self.rates
    return calc_called_gt(variations, rates=as_rates)
def calc_number_of_private_alleles(
        variations, populations=None, pop_sample_filters=None,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Count, per population and SNP, the alleles found only in that population.

    An allele is private to a population when its count there is greater
    than zero and the summed count in every other population is zero.

    Args:
        variations: the variations to analyse; indexable by ``GT_FIELD``.
        populations: forwarded to ``_prepare_pop_sample_filters``.
        pop_sample_filters: forwarded to ``_prepare_pop_sample_filters``;
            the prepared mapping goes from population id to a callable
            whose result is indexed with ``FLT_VARS``.
        min_num_genotypes: when truthy, SNPs with fewer called genotypes
            than this, either in the population or in the remaining
            samples, get a private allele count of 0.

    Returns:
        dict mapping each population id to an array with the number of
        private alleles per SNP.

    Raises:
        ValueError: if fewer than two populations are available.
    """
    pop_sample_filters = _prepare_pop_sample_filters(populations,
                                                     pop_sample_filters)
    if len(pop_sample_filters) < 2:
        raise ValueError('At least two populations are required')

    # numpy.unique already returns the unique values sorted, so the extra
    # numpy.sort that used to wrap this call was redundant.
    different_alleles = numpy.unique(variations[GT_FIELD])
    # The missing marker is dropped only when it is the first (smallest)
    # value of the sorted alleles.
    if different_alleles[0] == MISSING_INT:
        different_alleles = different_alleles[1:]

    # NOTE(review): this total is computed over every sample in
    # `variations`; if some samples belong to no population, they are
    # counted among the "other populations" below -- confirm this is
    # intended.
    tot_num_called_gts = calc_called_gt(variations, rates=False)

    allele_counts_per_pop = {}
    not_enough_genotypes_masks = {}
    for pop_id, pop_sample_filter in pop_sample_filters.items():
        vars_for_pop = pop_sample_filter(variations)[FLT_VARS]
        allele_counts = _count_alleles_per_allele_and_snp(vars_for_pop,
                                                          different_alleles)
        if min_num_genotypes:
            num_called_in_pop = calc_called_gt(vars_for_pop, rates=False)
            num_called_in_others = tot_num_called_gts - num_called_in_pop
            not_enough_genotypes_masks[pop_id] = numpy.logical_or(
                num_called_in_pop < min_num_genotypes,
                num_called_in_others < min_num_genotypes)
        allele_counts_per_pop[pop_id] = allele_counts

    private_alleles = {}
    for pop_id in pop_sample_filters:
        other_pops_allele_counts = None
        for other_pop in pop_sample_filters:
            if other_pop == pop_id:
                continue
            pop_counts = allele_counts_per_pop[other_pop]
            if other_pops_allele_counts is None:
                other_pops_allele_counts = pop_counts
            else:
                # Deliberately not in place: the accumulator may alias an
                # array stored in allele_counts_per_pop.
                other_pops_allele_counts = numpy.add(
                    other_pops_allele_counts, pop_counts)

        present_here = allele_counts_per_pop[pop_id] > 0
        absent_elsewhere = other_pops_allele_counts == 0
        private_alleles_for_pop = numpy.sum(
            numpy.logical_and(present_here, absent_elsewhere), axis=1)

        if min_num_genotypes:
            mask = not_enough_genotypes_masks[pop_id]
            private_alleles_for_pop[mask] = 0
        private_alleles[pop_id] = private_alleles_for_pop

    return private_alleles