def remove_low_call_rate_samples(variations, min_call_rate, rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False, n_bins=DEF_NUM_BINS,
                                 limits=None):
    """Drop the samples whose call rate is below ``min_call_rate``.

    Parameters
    ----------
    variations : the variations object to filter; must expose ``gt``.
    min_call_rate : samples are kept when their call value is ``>=`` this.
    rates : forwarded to ``calc_missing_gt_per_sample``. When true the call
        value is ``1 - missing``; otherwise it is item 0 of the shape of
        ``variations.gt`` minus the number of missing genotypes.
    filter_id : identifier returned under ``FLT_ID``.
    calc_histogram : when true, a histogram of the call values is added to
        the returned statistics.
    n_bins : number of bins handed to ``va.histogram``.
    limits : histogram limits. When ``None`` they default to ``(0, 1)`` for
        rates and to ``(0, len(variations.num_variations))`` otherwise.

    Returns
    -------
    dict with the filtered variations (``FLT_VARS``), the filter id
    (``FLT_ID``) and the filter statistics (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = (utils_array.get_shape_item(variations.gt, 0)
                      - num_missing_gts)

    selected_samples = num_called >= min_call_rate
    variations = keep_samples_with_mask(variations,
                                        selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples
    }

    if calc_histogram:
        # Honour limits supplied by the caller; previously they were
        # unconditionally overwritten here and therefore ignored.
        if limits is None:
            # NOTE(review): len() is applied to num_variations here, while
            # other code treats that attribute as a plain count -- confirm
            # that it supports len().
            limits = (0, 1) if rates else (0, len(variations.num_variations))
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
def keep_variable_variations(variations, max_alleles,
                             filter_id='variable_variations'):
    """Keep only the variations in which more than one allele is observed.

    A variation is kept when it has at least one non-missing genotype value
    and ``count_alleles`` reports a positive count for more than one allele.

    Returns a dict with the selected variations (``FLT_VARS``), the filter
    id (``FLT_ID``) and the kept / filtered-out counts (``FLT_STATS``).
    """
    genotypes = variations[GT_FIELD]

    # Reduce axis 2 and then axis 1 (presumably ploidy and samples -- TODO
    # confirm) to flag the variations that have any non-missing value.
    call_has_allele = va.any(genotypes != MISSING_INT, axis=2)
    has_some_call = va.any(call_has_allele, axis=1)

    counts_per_allele = count_alleles(genotypes, max_alleles=max_alleles,
                                      count_missing=False)
    num_observed_alleles = va.sum(counts_per_allele > 0, axis=1)
    is_variable = num_observed_alleles > 1

    keep_mask = va.logical_and(has_some_call, is_variable)
    kept_variations = variations.get_vars(keep_mask)

    num_kept = va.count_nonzero(keep_mask)
    num_removed = va.count_nonzero(va.logical_not(keep_mask))
    stats = {N_KEPT: num_kept, N_FILTERED_OUT: num_removed}

    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
def calc_diversities(variations, max_alleles, min_num_genotypes,
                     min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                     polymorphic_threshold=0.95):
    """Summarise several diversity statistics for ``variations``.

    The returned dict holds:

    - ``num_variable_vars``: variations whose non-NaN value returned by
      ``calc_maf_by_gt`` is below 0.9999999999.
    - ``num_polymorphic_vars``: variations whose non-NaN value is at most
      ``polymorphic_threshold``.
    - ``exp_het`` / ``obs_het``: NaN-ignoring means of the values returned
      by ``calc_expected_het`` and ``calc_obs_het``.
    - ``num_total_variations``: ``variations.num_variations``.
    """
    mafs = calc_maf_by_gt(variations, max_alleles,
                          min_num_genotypes=min_num_genotypes)
    # NaN entries are left out of both counts.
    valid_mafs = mafs[va.logical_not(va.isnan(mafs))]

    num_variable = va.sum(valid_mafs < 0.9999999999)
    num_polymorphic = va.sum(valid_mafs <= polymorphic_threshold)

    expected_het = calc_expected_het(variations, max_alleles=max_alleles,
                                     min_num_genotypes=min_num_genotypes)
    mean_expected_het = va.nanmean(expected_het)

    observed_het = calc_obs_het(
        variations,
        min_call_dp_for_het_call=min_call_dp_for_het_call,
        min_num_genotypes=min_num_genotypes)
    mean_observed_het = va.nanmean(observed_het)

    return {
        'num_variable_vars': num_variable,
        'num_polymorphic_vars': num_polymorphic,
        'exp_het': mean_expected_het,
        'obs_het': mean_observed_het,
        'num_total_variations': variations.num_variations,
    }
def remove_low_call_rate_vars(variations, min_call_rate, rates=True,
                              filter_id='call_rate', calc_histogram=False,
                              n_bins=DEF_NUM_BINS, limits=None):
    """Drop the variations whose call rate is below ``min_call_rate``.

    Parameters
    ----------
    variations : the variations object to filter; must expose ``gt`` and
        ``get_vars``.
    min_call_rate : variations are kept when their call value is ``>=``
        this.
    rates : forwarded to ``calc_missing_gt``. When true the call value is
        ``1 - missing``; otherwise it is ``variations.gt.shape[1]`` minus
        the number of missing genotypes.
    filter_id : identifier returned under ``FLT_ID``.
    calc_histogram : when true, a histogram of the call values is added to
        the returned statistics.
    n_bins : number of bins handed to ``va.histogram``.
    limits : histogram limits. When ``None`` they default to ``(0, 1)`` for
        rates and to ``(0, len(variations.num_samples))`` otherwise.

    Returns
    -------
    dict with the filtered variations (``FLT_VARS``), the filter id
    (``FLT_ID``) and the filter statistics (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = variations.gt.shape[1] - num_missing_gts

    selected_vars = num_called >= min_call_rate
    variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        # Honour limits supplied by the caller; previously they were
        # unconditionally overwritten here and therefore ignored.
        if limits is None:
            # NOTE(review): len() is applied to num_samples here -- confirm
            # that the attribute supports len() and is not a plain count.
            limits = (0, 1) if rates else (0, len(variations.num_samples))
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        # The threshold used by the filter is stored under the 'limits' key.
        flt_stats['limits'] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
def _filter_by_snp_position(variations, regions, filter_id, reverse=False):
    """Keep the variations selected by ``_select_variations_in_region``.

    With ``reverse`` set, the selection mask is negated before it is
    applied.

    Returns a dict with the selected variations (``FLT_VARS``), the filter
    id (``FLT_ID``) and the kept / filtered-out counts (``FLT_STATS``).
    """
    region_mask = _select_variations_in_region(variations, regions)
    keep_mask = va.logical_not(region_mask) if reverse else region_mask

    kept_variations = variations.get_vars(keep_mask)

    num_kept = va.count_nonzero(keep_mask)
    num_removed = va.count_nonzero(va.logical_not(keep_mask))
    stats = {N_KEPT: num_kept, N_FILTERED_OUT: num_removed}

    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
def _call_is_het(variations, is_missing=None):
    """Return a mask that is true where a call is not homozygous.

    The mask is the negation of ``_call_is_hom``. When ``is_missing`` is
    given, the positions it selects are forced to False in the result.
    """
    het_mask = va.logical_not(_call_is_hom(variations, is_missing=is_missing))
    if is_missing is None:
        return het_mask

    # A missing call is "not homozygous", but it must not count as het.
    het_mask[is_missing] = False
    return het_mask
def hmean(array, axis=0, dtype=None):
    """Return the harmonic mean of ``array`` along ``axis``.

    The result is the number of items along the axis divided by the sum of
    their inverses. With ``axis=None`` the array is flattened first.
    Positions where the sum of inverses is not finite are set to NaN.
    ``dtype`` is forwarded to the sum.
    """
    if axis is None:
        array = array.ravel()
    size_axis = 0 if axis is None else axis
    num_items = array.shape[size_axis]

    # Zeros in the input yield infinite inverses; they are turned into NaN
    # below, so the division warning is silenced.
    with np.errstate(divide='ignore'):
        sum_of_inverses = va.sum(1.0 / array, axis=axis, dtype=dtype)

    not_finite = va.logical_not(va.isfinite(sum_of_inverses))
    result = num_items / sum_of_inverses
    result[not_finite] = np.nan
    return result
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    """Count heterozygous calls and usable calls along ``axis``.

    A call is treated as missing when any of its genotype values equals
    ``MISSING_INT`` or, when the corresponding threshold is not None, when
    its depth is below ``min_call_dp_for_het_call`` or above
    ``max_call_dp_for_het_call``.

    Returns a tuple ``(number of het calls, number of non-missing calls)``,
    both summed along ``axis``.
    """
    is_missing = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    check_min = min_call_dp_for_het_call is not None
    check_max = max_call_dp_for_het_call is not None
    if check_min or check_max:
        # The depth field is only read when a threshold was requested.
        depths = variations[DP_FIELD]
        if check_min:
            too_shallow = depths < min_call_dp_for_het_call
            is_missing = va.logical_or(is_missing, too_shallow)
        if check_max:
            too_deep = depths > max_call_dp_for_het_call
            is_missing = va.logical_or(is_missing, too_deep)

    is_het = _call_is_het(variations, is_missing=is_missing)

    num_het = va.sum(is_het, axis=axis)
    num_usable = va.sum(va.logical_not(is_missing), axis=axis)
    return (num_het, num_usable)
def _get_gts_non_missing_in_both(vars1, vars2):
    """Return column-0 genotypes for rows with no missing data in either input.

    A row is kept only when ``calc_missing_gt`` returns a zero (falsy)
    missing rate for it in both ``vars1`` and ``vars2``. Column 0 of the
    genotype field is returned for each input (presumably the only sample
    of each -- TODO confirm).
    """
    missing_rate1 = calc_missing_gt(vars1, rates=True)
    missing_rate2 = calc_missing_gt(vars2, rates=True)

    # Any non-zero missing rate is truthy, so this flags rows with missing
    # data in at least one of the two inputs.
    missing_in_any = va.logical_or(missing_rate1, missing_rate2)
    called_in_both = va.logical_not(missing_in_any)

    called_gts1 = vars1[GT_FIELD][called_in_both]
    called_gts2 = vars2[GT_FIELD][called_in_both]

    return called_gts1[:, 0], called_gts2[:, 0]
def _select_vars(variations, stats, min_allowable=None, max_allowable=None):
    """Keep the variations whose ``stats`` value is within the given bounds.

    Each bound is inclusive and is only applied when it is not None. When
    neither bound is given, the mask comes from ``_filter_no_row``.

    Returns a dict with the selected variations (``FLT_VARS``) and the
    kept / filtered-out counts (``FLT_STATS``).
    """
    has_max = max_allowable is not None
    has_min = min_allowable is not None

    # Comparisons against NaN would raise an "invalid" warning; silence it.
    with np.errstate(invalid='ignore'):
        within_max = stats <= max_allowable if has_max else None
        within_min = stats >= min_allowable if has_min else None

    if has_min and has_max:
        keep_mask = within_min & within_max
    elif has_min:
        keep_mask = within_min
    elif has_max:
        keep_mask = within_max
    else:
        keep_mask = _filter_no_row(variations)

    variations = variations.get_vars(keep_mask)

    num_kept = va.count_nonzero(keep_mask)
    num_removed = va.count_nonzero(va.logical_not(keep_mask))
    stats_summary = {N_KEPT: num_kept, N_FILTERED_OUT: num_removed}

    return {FLT_VARS: variations, FLT_STATS: stats_summary}