def _calc_stat(self, variations):
    """Return the fraction of called genotypes that are het AND too deep.

    The variations are first narrowed with ``_filter_samples_for_stats``;
    the depth (``DP_FIELD``) and genotype (``GT_FIELD``) matrices of that
    subset are then reduced along axis 1, giving one value per first-axis
    entry (presumably one per SNP -- TODO confirm the matrix layout).

    A call counts as "not missing" when its depth is > 0 and as "too deep"
    when its depth exceeds ``self._too_high_dps``.

    Raises:
        ValueError: if the depth matrix has no rows.
    """
    subset = self._filter_samples_for_stats(variations)

    # One depth threshold/mean is expected per sample kept for the stat.
    assert len(subset.samples) == self.sample_dp_means.shape[0]

    depths = subset[DP_FIELD]
    if not depths.shape[0]:
        # No SNPs
        raise ValueError('No SNPs to filter')

    if is_dataset(depths):
        # Load the dataset contents into memory before doing the comparisons.
        depths = depths[:]

    called_counts = numpy.sum(depths > 0, axis=1)

    deep_mask = depths > self._too_high_dps
    het_mask = call_is_het(subset[GT_FIELD])
    deep_het_counts = numpy.sum(numpy.logical_and(deep_mask, het_mask),
                                axis=1)

    # Entries without any called genotype divide by zero; the floating
    # point warnings are silenced instead of being reported.
    with numpy.errstate(all='ignore'):
        # This is the stat
        deep_het_freq = deep_het_counts / called_counts

    return deep_het_freq
def _calc_sample_missing_rates(variations, chunk_size, min_called_rate,
                               max_het):
    """Accumulate per-sample genotype counts, optionally chunk by chunk.

    Args:
        variations: object to analyse; it is processed whole when
            ``chunk_size`` is None, otherwise through ``iterate_chunks``.
        chunk_size: chunk size handed to ``iterate_chunks`` or None.
        min_called_rate: when not None, the ``calc_called_gt`` counts are
            accumulated and reported under ``'missing_rates'``.
        max_het: when not None, the het call counts are accumulated and
            reported under ``'obs_hets'``.

    Returns:
        dict with the keys ``'missing_rates'`` and/or ``'obs_hets'``; both
        values are the accumulated counts divided by
        ``variations.num_variations``.

    NOTE(review): the ``'missing_rates'`` values are built from
    ``calc_called_gt`` output, so despite the key name they look like
    called rates -- confirm against the callers.
    """
    want_called = min_called_rate is not None
    want_het = max_het is not None

    if chunk_size is not None:
        chunks = variations.iterate_chunks(kept_fields=[GT_FIELD],
                                           chunk_size=chunk_size)
    else:
        chunks = [variations]

    called_total = None
    het_total = None
    for chunk in chunks:
        chunk_called = calc_called_gt(chunk, rates=False, axis=0)
        if want_called:
            if called_total is not None:
                called_total += chunk_called
            else:
                called_total = chunk_called

        if want_het:
            chunk_het = numpy.sum(call_is_het(chunk[GT_FIELD]), axis=0)
            if het_total is not None:
                het_total += chunk_het
            else:
                het_total = chunk_het

    res = {}
    if want_called:
        res['missing_rates'] = called_total / variations.num_variations
    if want_het:
        res['obs_hets'] = het_total / variations.num_variations
    return res
def __call__(self, variations):
    """Compute the high-depth call frequency and histogram/filter with it.

    The statistic is, along axis 1 of the depth matrix of the samples
    chosen by ``_filter_samples_for_stats``, the number of calls deeper
    than ``self._too_high_dps`` divided by the number of calls with
    depth > 0 (presumably one value per SNP -- TODO confirm the layout).

    Returns:
        dict that may hold, depending on the instance flags:
        ``COUNTS``/``EDGES`` (``do_histogram``), ``SELECTED_VARS``
        (``report_selection``) and ``FLT_VARS``/``FLT_STATS``
        (``do_filtering``).
    """
    stat_vars = self._filter_samples_for_stats(variations)

    assert len(stat_vars.samples) == self.sample_dp_means.shape[0]

    depths = stat_vars[DP_FIELD]
    if is_dataset(depths):
        # Load the dataset contents into memory before the comparisons.
        depths = depths[:]

    n_called = numpy.sum(depths > 0, axis=1)
    n_too_deep = numpy.sum(depths > self._too_high_dps, axis=1)

    # Entries without called genotypes divide by zero; warnings are muted.
    with numpy.errstate(all='ignore'):
        # This is the stat
        high_dp_freq = n_too_deep / n_called

    result = {}

    if self.do_histogram:
        counts, edges = histogram(high_dp_freq, n_bins=self.n_bins,
                                  range_=self.range)
        result[COUNTS] = counts
        result[EDGES] = edges

    if not (self.do_filtering or self.report_selection):
        # Nothing else was requested, so the selection mask is not needed.
        return result

    het_mask = call_is_het(stat_vars[GT_FIELD])
    with numpy.errstate(all='ignore'):
        het_rate = numpy.sum(het_mask, axis=1) / n_called
        het_excess = numpy.greater(het_rate, self.max_obs_het)
        depth_excess = numpy.greater(high_dp_freq, self.max_high_dp_freq)

    # An entry is dropped only when it exceeds BOTH thresholds.
    keep = numpy.logical_not(numpy.logical_and(het_excess, depth_excess))

    if self.report_selection:
        result[SELECTED_VARS] = keep

    if self.do_filtering:
        result[FLT_VARS] = variations.get_chunk(keep)
        n_kept = numpy.count_nonzero(keep)
        n_total = keep.shape[0]
        result[FLT_STATS] = {N_KEPT: n_kept,
                             N_FILTERED_OUT: n_total - n_kept,
                             TOT: n_total}

    return result
def __call__(self, variations):
    """Run the high-depth / heterozygosity filter on ``variations``.

    Depths and genotypes are read from the sample subset returned by
    ``_filter_samples_for_stats``. All reductions run along axis 1
    (presumably across samples, one value per SNP -- TODO confirm).

    Depending on ``do_histogram``, ``report_selection`` and
    ``do_filtering`` the returned dict carries the histogram
    (``COUNTS``, ``EDGES``), the boolean selection (``SELECTED_VARS``)
    and the filtered variations plus counters (``FLT_VARS``,
    ``FLT_STATS``).
    """
    sample_subset = self._filter_samples_for_stats(variations)

    assert len(sample_subset.samples) == self.sample_dp_means.shape[0]

    dp_matrix = sample_subset[DP_FIELD]
    if is_dataset(dp_matrix):
        # Load the dataset contents into memory.
        dp_matrix = dp_matrix[:]

    called_count = numpy.sum(dp_matrix > 0, axis=1)
    deep_count = numpy.sum(dp_matrix > self._too_high_dps, axis=1)

    # Division by zero (no called genotypes) is tolerated silently.
    with numpy.errstate(all='ignore'):
        # This is the stat
        deep_freq = deep_count / called_count

    result = {}

    if self.do_histogram:
        hist_counts, hist_edges = histogram(deep_freq, n_bins=self.n_bins,
                                            range_=self.range)
        result[COUNTS] = hist_counts
        result[EDGES] = hist_edges

    if self.do_filtering or self.report_selection:
        het_count = numpy.sum(call_is_het(sample_subset[GT_FIELD]), axis=1)
        with numpy.errstate(all='ignore'):
            het_freq = het_count / called_count
            het_too_high = numpy.greater(het_freq, self.max_obs_het)
            dp_too_high = numpy.greater(deep_freq, self.max_high_dp_freq)

        # Kept unless both the het rate and the high-depth rate are
        # above their limits (De Morgan form of not (a and b)).
        selection = numpy.logical_or(numpy.logical_not(het_too_high),
                                     numpy.logical_not(dp_too_high))

        if self.report_selection:
            result[SELECTED_VARS] = selection

        if self.do_filtering:
            filtered = variations.get_chunk(selection)
            kept = numpy.count_nonzero(selection)
            total = selection.shape[0]
            result[FLT_VARS] = filtered
            result[FLT_STATS] = {N_KEPT: kept,
                                 N_FILTERED_OUT: total - kept,
                                 TOT: total}

    return result
def _calc_stat(self, variations):
    """Return the frequency of calls that are both het and too deep.

    Works on the sample subset produced by ``_filter_samples_for_stats``.
    For every entry along the first axis of the depth matrix (presumably
    one per SNP -- TODO confirm) the number of calls that are het and have
    a depth above ``self._too_high_dps`` is divided by the number of calls
    with depth > 0.
    """
    chosen = self._filter_samples_for_stats(variations)

    assert len(chosen.samples) == self.sample_dp_means.shape[0]

    dp_values = chosen[DP_FIELD]
    if is_dataset(dp_values):
        # Load the dataset contents into memory.
        dp_values = dp_values[:]

    n_called = numpy.sum(dp_values > 0, axis=1)

    too_deep = dp_values > self._too_high_dps
    is_het = call_is_het(chosen[GT_FIELD])
    n_deep_het = numpy.sum(numpy.logical_and(too_deep, is_het), axis=1)

    # Entries with no called genotype divide by zero; warnings are muted.
    with numpy.errstate(all='ignore'):
        # This is the stat
        return n_deep_het / n_called