def keep_variable_variations(variations, max_alleles,
                             filter_id='variable_variations'):
    """Keep the variations that show more than one allele.

    A variation is selected when both conditions hold:

    * some entry along axis 1 of the genotype matrix has at least one
      allele different from MISSING_INT, and
    * more than one allele has a count above zero according to
      count_alleles (called with count_missing=False).

    NOTE(review): the genotype matrix is presumably laid out as
    (variation, sample, allele) -- confirm against the callers.

    Returns a dict holding the selected variations under FLT_VARS, the
    given filter_id under FLT_ID and, under FLT_STATS, the number of kept
    (N_KEPT) and discarded (N_FILTERED_OUT) variations.
    """
    genotypes = variations[GT_FIELD]

    # Variations for which at least one call is not completely missing.
    call_has_allele = va.any(genotypes != MISSING_INT, axis=2)
    has_some_call = va.any(call_has_allele, axis=1)

    # Variations with more than one allele actually observed.
    counts = count_alleles(genotypes, max_alleles=max_alleles,
                           count_missing=False)
    observed_alleles = va.sum(counts > 0, axis=1)
    is_variable = observed_alleles > 1

    keep_mask = va.logical_and(has_some_call, is_variable)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations,
            FLT_ID: filter_id,
            FLT_STATS: stats}
def gts_as_mat012(gts):
    """Recode a genotype matrix as 0 / 1 / 2 (or MISSING_INT) per call.

    The alleles of every call are reduced along axis 2:

    * MISSING_INT: some allele of the call is MISSING_INT.
    * 0: the allele sum is below 1 (homozygous for allele 0).
    * 1: the allele sum is at least 1 and some allele is 0 (heterozygous
      carrying allele 0).
    * 2: the allele sum is at least 1 and no allele is 0.

    NOTE(review): the codes above presume MISSING_INT is lower than 1,
    otherwise missing calls would be recoded as 2 -- confirm.
    """
    has_missing_allele = va.any(gts == MISSING_INT, axis=2)
    has_allele_zero = va.any(gts == 0, axis=2)

    mat012 = va.sum(gts, axis=2)
    # The order of these assignments matters: each step relies on the
    # values written by the previous one.
    mat012[has_missing_allele] = MISSING_INT
    mat012[mat012 >= 1] = 2
    is_het = va.logical_and(mat012 == 2, has_allele_zero)
    mat012[is_het] = 1
    return mat012
def calc_ld_random_pairs_from_different_chroms(variations, num_pairs,
                                               max_maf=0.95, min_num_gts=10,
                                               silence_runtime_warnings=False):
    """Yield Rogers-Huff LD values for random pairs from different chroms.

    Variation indexes are drawn at random (with the module-level ``random``
    generator) until ``num_pairs`` pairs located in different chromosomes
    have produced a non-NaN value.

    This is a generator, so the validation errors listed below are raised
    when the first item is requested, not when the function is called.

    Parameters:
        variations: object indexable by CHROM_FIELD, ALT_FIELD and GT_FIELD.
        num_pairs: number of pairs to yield. Zero or a negative number
            yields nothing.
        max_maf: highest value allowed in the result of calc_maf_by_gt.
        min_num_gts: passed to calc_maf_by_gt and to the pair calculation.
        silence_runtime_warnings: passed to va.make_sure_array_is_in_memory.

    Yields:
        Tuples (chrom1, snp_idx1, chrom2, snp_idx2, ld_value).

    Raises:
        ValueError: fewer than two different chromosomes are found.
        RuntimeError: some MAF is NaN or higher than max_maf.

    NOTE(review): the loop has no iteration limit; if no pair from
    different chromosomes gives a non-NaN value it never finishes.
    """
    chroms = va.make_sure_array_is_in_memory(
        variations[CHROM_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)
    different_chroms = np.unique(chroms)
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')

    max_alleles = variations[ALT_FIELD].shape[1]
    mafs = calc_maf_by_gt(variations, max_alleles, min_num_gts)
    mafs = va.make_sure_array_is_in_memory(
        mafs, silence_runtime_warnings=silence_runtime_warnings)
    if va.any(va.isnan(mafs)) or va.any(mafs > max_maf):
        # The check fails for MAFs *above* max_maf, the message says so.
        msg = ('Not enough genotypes or MAF above allowed maximum, '
               'Rogers Huff calculations known to go wrong for very high maf')
        raise RuntimeError(msg)

    gts = va.make_sure_array_is_in_memory(
        variations[GT_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)
    num_variations = gts.shape[0]

    pairs_computed = 0
    # Testing the count before drawing guarantees that nothing is yielded
    # when num_pairs is zero or negative.
    while pairs_computed < num_pairs:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            continue

        r2_ld = _calc_rogers_huff_r_for_snp_pair(gts[snp_idx1],
                                                 gts[snp_idx2],
                                                 min_num_gts=min_num_gts)
        if math.isnan(r2_ld):
            # Pairs without a valid value do not count towards num_pairs.
            continue

        yield chrom1, snp_idx1, chrom2, snp_idx2, r2_ld
        pairs_computed += 1
def _kosman(vars1, vars2):
    """Return a per-row allele sharing distance between two individuals.

    The genotypes returned by _get_gts_non_missing_in_both are compared row
    by row. For every row the alleles of each individual that are also
    found in the other individual are counted, and the total is mapped to:

    * 1.0 when the total is 0 (no allele shared),
    * 0.0 when the total is 4 (every allele found in the other individual),
    * 0.5 otherwise.

    Raises ValueError when the first individual does not have exactly two
    alleles per row.

    NOTE(review): presumably each row is a variation and the helper drops
    the rows missing in either individual -- confirm.
    """
    indi1, indi2 = _get_gts_non_missing_in_both(vars1, vars2)
    if indi1.shape[1] != 2:
        raise ValueError('Only diploid are allowed')

    # Broadcasting compares each allele of one individual against both
    # alleles of the other one, for all the rows at once.
    indi2_alleles_vs_indi1 = indi1 == indi2.transpose()[:, :, None]
    indi1_alleles_vs_indi2 = indi2 == indi1.transpose()[:, :, None]

    found_from_indi1 = va.any(indi1_alleles_vs_indi2, axis=2).sum(axis=0)
    found_from_indi2 = va.any(indi2_alleles_vs_indi1, axis=2).sum(axis=0)
    distances = va.add(found_from_indi1, found_from_indi2, dtype=np.float64)

    # The counts are recoded in place; the order of these assignments
    # matters because 0 and 1 are both counts and final distance values.
    distances[distances == 0] = 1
    distances[distances == 4] = 0
    partially_shared = va.logical_and(distances != 1, distances != 0)
    distances[partially_shared] = 0.5
    return distances
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    """Count heterozygous calls and usable calls along the given axis.

    A call is treated as missing when any of its alleles is MISSING_INT.
    When a depth limit is given, calls whose DP_FIELD value is lower than
    min_call_dp_for_het_call or higher than max_call_dp_for_het_call are
    treated as missing too. DP_FIELD is only read if some limit is not
    None.

    Returns a tuple with the sum along ``axis`` of the calls flagged by
    _call_is_het and the sum along ``axis`` of the calls not treated as
    missing.
    """
    not_usable = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    apply_min_dp = min_call_dp_for_het_call is not None
    apply_max_dp = max_call_dp_for_het_call is not None
    if apply_min_dp or apply_max_dp:
        depths = variations[DP_FIELD]
        if apply_min_dp:
            too_shallow = depths < min_call_dp_for_het_call
            not_usable = va.logical_or(not_usable, too_shallow)
        if apply_max_dp:
            too_deep = depths > max_call_dp_for_het_call
            not_usable = va.logical_or(not_usable, too_deep)

    het_calls = _call_is_het(variations, is_missing=not_usable)
    num_hets = va.sum(het_calls, axis=axis)
    num_usable_calls = va.sum(va.logical_not(not_usable), axis=axis)
    return (num_hets, num_usable_calls)