def remove_low_call_rate_samples(variations, min_call_rate, rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False, n_bins=DEF_NUM_BINS,
                                 limits=None):
    '''Remove the samples that have too few called genotypes.

    The amount of missing genotypes per sample is obtained from
    calc_missing_gt_per_sample and turned into a "called" figure that is
    compared against min_call_rate.

    Args:
        variations: variations object to filter.
        min_call_rate: minimum called value a sample needs to be kept
            (samples with called >= min_call_rate are kept).
        rates: if True the called value is 1 - missing rate, otherwise it
            is the size of the first axis of variations.gt minus the
            missing count.
        filter_id: identifier returned under FLT_ID.
        calc_histogram: if True a histogram of the called values is added
            to the returned stats.
        n_bins: number of bins for the histogram.
        limits: (min, max) range for the histogram. When None it defaults
            to (0, 1) for rates and to (0, total count) otherwise.

    Returns:
        A dict with the filtered variations (FLT_VARS), the filter id
        (FLT_ID) and the filter stats (FLT_STATS).
    '''
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
        default_limits = (0, 1)
    else:
        # The total is taken before filtering so that it can also be used
        # as the upper bound of the histogram.
        num_total = utils_array.get_shape_item(variations.gt, 0)
        num_called = num_total - num_missing_gts
        default_limits = (0, num_total)

    selected_samples = num_called >= min_call_rate
    variations = keep_samples_with_mask(variations, selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples
    }

    if calc_histogram:
        # Only fall back to the default range when the caller gave none.
        if limits is None:
            limits = default_limits
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
def keep_variable_variations(variations, max_alleles,
                             filter_id='variable_variations'):
    '''Keep the variations in which more than one allele is observed.

    A variation is kept when it has at least one non-missing genotype call
    and count_alleles reports more than one allele with a count above zero.

    Args:
        variations: variations object to filter.
        max_alleles: passed through to count_alleles.
        filter_id: identifier returned under FLT_ID.

    Returns:
        A dict with the filtered variations (FLT_VARS), the filter id
        (FLT_ID) and the kept / filtered-out counts (FLT_STATS).
    '''
    genotypes = variations[GT_FIELD]

    # Reduce axis 2 and then axis 1, leaving one flag per first-axis item.
    # NOTE(review): presumably gts is (variations, samples, ploidy) - confirm.
    has_called_allele = va.any(genotypes != MISSING_INT, axis=2)
    has_any_call = va.any(has_called_allele, axis=1)

    counts_per_allele = count_alleles(genotypes, max_alleles=max_alleles,
                                      count_missing=False)
    num_observed_alleles = va.sum(counts_per_allele > 0, axis=1)
    is_variable = num_observed_alleles > 1

    keep_mask = va.logical_and(has_any_call, is_variable)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
def remove_low_call_rate_vars(variations, min_call_rate, rates=True,
                              filter_id='call_rate', calc_histogram=False,
                              n_bins=DEF_NUM_BINS, limits=None):
    '''Remove the variations that have too few called genotypes.

    The amount of missing genotypes is obtained from calc_missing_gt and
    turned into a "called" figure that is compared against min_call_rate.

    Args:
        variations: variations object to filter.
        min_call_rate: minimum called value a variation needs to be kept
            (variations with called >= min_call_rate are kept).
        rates: if True the called value is 1 - missing rate, otherwise it
            is variations.gt.shape[1] minus the missing count.
        filter_id: identifier returned under FLT_ID.
        calc_histogram: if True a histogram of the called values is added
            to the returned stats.
        n_bins: number of bins for the histogram.
        limits: (min, max) range for the histogram. When None it defaults
            to (0, 1) for rates and to (0, variations.gt.shape[1])
            otherwise.

    Returns:
        A dict with the filtered variations (FLT_VARS), the filter id
        (FLT_ID) and the filter stats (FLT_STATS).
    '''
    num_missing_gts = calc_missing_gt(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
        default_limits = (0, 1)
    else:
        # The total is taken before filtering so that it can also be used
        # as the upper bound of the histogram.
        num_total = variations.gt.shape[1]
        num_called = num_total - num_missing_gts
        default_limits = (0, num_total)

    selected_vars = num_called >= min_call_rate
    variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        # Only fall back to the default range when the caller gave none.
        if limits is None:
            limits = default_limits
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats['limits'] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
def _filter_by_snp_position(variations, regions, filter_id, reverse=False):
    '''Keep the variations selected by _select_variations_in_region.

    Args:
        variations: variations object to filter.
        regions: regions passed to _select_variations_in_region.
        filter_id: identifier returned under FLT_ID.
        reverse: if True the selection is inverted, so the variations that
            were not selected are the ones kept.

    Returns:
        A dict with the filtered variations (FLT_VARS), the filter id
        (FLT_ID) and the kept / filtered-out counts (FLT_STATS).
    '''
    in_regions = _select_variations_in_region(variations, regions)
    keep_mask = va.logical_not(in_regions) if reverse else in_regions

    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
def _select_vars(variations, stats, min_allowable=None, max_allowable=None):
    '''Keep the variations whose stat lies within the allowed bounds.

    Args:
        variations: variations object to filter.
        stats: values compared against the bounds.
        min_allowable: inclusive lower bound, or None for no lower bound.
        max_allowable: inclusive upper bound, or None for no upper bound.

    Returns:
        A dict with the filtered variations (FLT_VARS) and the kept /
        filtered-out counts (FLT_STATS). When both bounds are None the
        mask comes from _filter_no_row.
    '''
    # Comparisons run with numpy "invalid" floating point warnings ignored.
    with np.errstate(invalid='ignore'):
        below_max = stats <= max_allowable if max_allowable is not None else None
        above_min = stats >= min_allowable if min_allowable is not None else None

    if below_max is None and above_min is None:
        keep_mask = _filter_no_row(variations)
    elif below_max is None:
        keep_mask = above_min
    elif above_min is None:
        keep_mask = below_max
    else:
        keep_mask = above_min & below_max

    variations = variations.get_vars(keep_mask)

    stats_by_key = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: variations, FLT_STATS: stats_by_key}
def calc_dset_pop_distance(variations, max_alleles, populations,
                           min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                           min_call_dp_for_het=0,
                           silence_runtime_warnings=False):
    '''This is an implementation of the formulas proposed in GenAlex

    A distance is calculated for every pair of populations from the
    corrected_hs and corrected_ht values returned by _calc_pairwise_dest,
    averaged over the variations in which corrected_hs is not NaN.

    Args:
        variations: variations object with the genotypes.
        max_alleles: passed through to _calc_pairwise_dest.
        populations: one collection of samples per population; each one
            is passed to keep_samples.
        min_num_genotypes: passed through to _calc_pairwise_dest.
        min_call_dp_for_het: passed through to _calc_pairwise_dest.
        silence_runtime_warnings: passed through to compute.

    Returns:
        A 1-D numpy array with one distance per population pair, in the
        order given by itertools.combinations over the population indexes.
    '''
    pop_ids = list(range(len(populations)))
    pop_pairs = list(combinations(pop_ids, 2))

    variations_per_pop = [
        keep_samples(variations, pop_samples)[FLT_VARS]
        for pop_samples in populations
    ]

    accumulated_hs = {}
    accumulated_ht = {}
    num_vars = {}
    for key in pop_pairs:
        pop_id1, pop_id2 = key
        res = _calc_pairwise_dest(variations_per_pop[pop_id1],
                                  variations_per_pop[pop_id2],
                                  max_alleles=max_alleles,
                                  min_call_dp_for_het=min_call_dp_for_het,
                                  min_num_genotypes=min_num_genotypes)
        pair_hs = res['corrected_hs']
        pair_ht = res['corrected_ht']

        # Every pair is visited exactly once, so the values are stored
        # directly; there is nothing to add them to.
        num_vars[key] = va.count_nonzero(~va.isnan(pair_hs))
        accumulated_hs[key] = va.nansum(pair_hs)
        accumulated_ht[key] = va.nansum(pair_ht)

    task = {
        'accumulated_hs': accumulated_hs,
        'accumulated_ht': accumulated_ht,
        'num_vars': num_vars
    }
    result = compute(task, silence_runtime_warnings=silence_runtime_warnings)
    computed_accumulated_hs = result['accumulated_hs']
    computed_accumulated_ht = result['accumulated_ht']
    computed_num_vars = result['num_vars']

    dists = np.full(len(pop_pairs), np.nan)

    # The comparisons are pairwise, so two populations enter the formula.
    num_pops = 2
    for idx, key in enumerate(pop_pairs):
        # A pair without valid variations gives 0 / 0; the resulting NaN
        # is kept as the distance without emitting a warning.
        with np.errstate(invalid='ignore'):
            corrected_hs = computed_accumulated_hs[key] / computed_num_vars[key]
            corrected_ht = computed_accumulated_ht[key] / computed_num_vars[key]
        dists[idx] = (num_pops / (num_pops - 1)) * (
            (corrected_ht - corrected_hs) / (1 - corrected_hs))
    return dists