def summarize_variations(in_zarr_path, out_dir_path, draw_missin_rate=True,
                         draw_mac=True, draw_maf=True, draw_obs_het=True,
                         min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                         min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                         num_bins=DEF_NUM_BINS, silence_runtime_warnings=True):
    """Write histogram plots and a short text summary for a zarr data set.

    The variations are loaded with ``load_zarr(in_zarr_path)``.  For every
    enabled ``draw_*`` flag one statistic is calculated, binned with
    ``va.histogram`` and, once everything has gone through ``compute``,
    plotted to ``<out_dir_path>/<kind>.png`` with a log scale.  A
    ``stats.txt`` file holding the number of variations and samples is
    written to the same directory.

    Args:
        in_zarr_path: path of the zarr store; its ``.name`` attribute is
            used in the text report, so a path-like object is expected.
        out_dir_path: output directory; it is combined with ``/`` and
            ``.open()`` is called on the result.
        draw_missin_rate: plot the result of ``calc_called_gt`` ('called').
        draw_mac: plot the result of ``calc_mac`` ('mac').
        draw_maf: plot the result of ``calc_maf_by_gt`` ('maf').
        draw_obs_het: plot the result of ``calc_obs_het``
            ('obs_heterocigosity').
        min_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        min_num_genotypes: forwarded to the mac, maf and obs. het.
            calculations.
        num_bins: number of bins of every histogram.
        silence_runtime_warnings: forwarded to ``compute``.
    """

    def _histogram_entry(values, limits):
        # Same layout for every statistic: the counts and the bin edges.
        counts, edges = va.histogram(values, n_bins=num_bins, limits=limits)
        return {'counts': counts, 'edges': edges}

    variations = load_zarr(in_zarr_path)
    max_alleles = variations[ALT_FIELD].shape[1]
    num_variations = variations.num_variations
    num_samples = variations.num_samples

    pending = {}

    if draw_missin_rate:
        called_rates = calc_called_gt(variations, rates=True)
        pending['called'] = _histogram_entry(called_rates, (0, 1))

    if draw_mac:
        macs = calc_mac(variations, max_alleles, min_num_genotypes)
        pending['mac'] = _histogram_entry(macs, (0, variations.num_samples))

    if draw_maf:
        mafs = calc_maf_by_gt(variations, max_alleles, min_num_genotypes)
        pending['maf'] = _histogram_entry(mafs, (0, 1))

    if draw_obs_het:
        obs_hets = calc_obs_het(
            variations,
            min_num_genotypes=min_num_genotypes,
            min_call_dp_for_het_call=min_call_dp_for_het_call)
        pending['obs_heterocigosity'] = _histogram_entry(obs_hets, (0, 1))

    # Everything requested is handed to compute() in a single call.
    computed = compute(pending,
                       silence_runtime_warnings=silence_runtime_warnings)

    for kind, histogram in computed.items():
        plot_path = out_dir_path / f'{kind}.png'
        with plot_path.open('wb') as out_fhand:
            plot_histogram(histogram['counts'], histogram['edges'],
                           out_fhand, log_scale=True)

    report_path = out_dir_path / 'stats.txt'
    with report_path.open('w') as fhand:
        report_lines = [
            f'STATS FOR: {in_zarr_path.name}\n',
            # Underline as long as the title line above.
            '-----------' + '-' * len(in_zarr_path.name) + '\n',
            f'Num. variations: {num_variations}\n',
            f'Num. samples: {num_samples}\n',
            '\n',
        ]
        fhand.writelines(report_lines)
def remove_low_call_rate_samples(variations, min_call_rate, rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False, n_bins=DEF_NUM_BINS,
                                 limits=None):
    """Keep only the samples with enough called genotypes.

    The missing genotypes per sample come from
    ``calc_missing_gt_per_sample``.  When ``rates`` is true the called value
    is ``1 - missing``; otherwise it is the size of the first dimension of
    ``variations.gt`` minus the missing count.  Samples whose called value
    is ``>= min_call_rate`` are kept.

    Args:
        variations: the variations to filter.
        min_call_rate: threshold compared against the called value (a rate
            when ``rates`` is true, a count otherwise).
        rates: work with rates instead of counts.
        filter_id: identifier stored in the result under ``FLT_ID``.
        calc_histogram: also add the histogram of the called values to the
            filter statistics.
        n_bins: number of histogram bins.
        limits: histogram range.  When None it defaults to ``(0, 1)`` for
            rates and to ``(0, number of variations)`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``): kept and filtered
        out sample counts plus, if requested, the histogram counts, bin
        edges and the threshold used.
    """
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = (utils_array.get_shape_item(variations.gt, 0)
                      - num_missing_gts)

    selected_samples = num_called >= min_call_rate
    filtered_variations = keep_samples_with_mask(variations,
                                                 selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples,
    }

    if calc_histogram:
        if limits is None:
            # num_variations is already a count, so it must not be wrapped
            # in len().  The histogram describes the input variations, so
            # the range is taken from them.
            limits = (0, 1) if rates else (0, variations.num_variations)
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]

    return {FLT_VARS: filtered_variations, FLT_ID: filter_id,
            FLT_STATS: flt_stats}
def filter_by_mac(variations, max_alleles, max_allowable_mac=None,
                  min_allowable_mac=None, filter_id='filter_by_mac',
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False, n_bins=DEF_NUM_BINS, limits=None):
    """Filter the variations using the value returned by ``calc_mac``.

    The selection itself is delegated to ``_select_vars``, which receives
    the minimum and maximum allowable values.

    Args:
        variations: the variations to filter.
        max_alleles: forwarded to ``calc_mac``.
        max_allowable_mac: upper threshold, None for no upper threshold.
        min_allowable_mac: lower threshold, None for no lower threshold.
        filter_id: identifier stored in the result under ``FLT_ID``.
        min_num_genotypes: forwarded to ``calc_mac``.
        calc_histogram: also add the histogram of the MACs to the filter
            statistics.
        n_bins: number of histogram bins.
        limits: histogram range; defaults to
            ``(0, variations.num_samples)``.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``).  When the histogram
        is requested the statistics also hold the counts, the bin edges and,
        under 'limits', the thresholds that were not None.
    """
    macs = calc_mac(variations, max_alleles=max_alleles,
                    min_num_genotypes=min_num_genotypes)
    selection = _select_vars(variations, macs, min_allowable_mac,
                             max_allowable_mac)
    flt_stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, variations.num_samples) if limits is None else limits
        counts, bin_edges = va.histogram(macs, n_bins=n_bins,
                                         limits=hist_range)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually given, lower one first.
        flt_stats['limits'] = [
            threshold
            for threshold in (min_allowable_mac, max_allowable_mac)
            if threshold is not None
        ]

    return {FLT_VARS: selection[FLT_VARS], FLT_ID: filter_id,
            FLT_STATS: flt_stats}
def remove_low_call_rate_vars(variations, min_call_rate, rates=True,
                              filter_id='call_rate', calc_histogram=False,
                              n_bins=DEF_NUM_BINS, limits=None):
    """Keep only the variations with enough called genotypes.

    The missing genotypes per variation come from ``calc_missing_gt``.  When
    ``rates`` is true the called value is ``1 - missing``; otherwise it is
    ``variations.gt.shape[1]`` minus the missing count.  Variations whose
    called value is ``>= min_call_rate`` are kept.

    Args:
        variations: the variations to filter.
        min_call_rate: threshold compared against the called value (a rate
            when ``rates`` is true, a count otherwise).
        rates: work with rates instead of counts.
        filter_id: identifier stored in the result under ``FLT_ID``.
        calc_histogram: also add the histogram of the called values to the
            filter statistics.
        n_bins: number of histogram bins.
        limits: histogram range.  When None it defaults to ``(0, 1)`` for
            rates and to ``(0, number of samples)`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``): kept and filtered
        out counts plus, if requested, the histogram counts, bin edges and
        the threshold used (under 'limits').
    """
    num_missing_gts = calc_missing_gt(variations, rates=rates)
    if rates:
        num_called = 1 - num_missing_gts
    else:
        num_called = variations.gt.shape[1] - num_missing_gts

    selected_vars = num_called >= min_call_rate
    filtered_variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        if limits is None:
            # num_samples is already a count, so it must not be wrapped in
            # len().  The histogram describes the input variations, so the
            # range is taken from them.
            limits = (0, 1) if rates else (0, variations.num_samples)
        counts, bin_edges = va.histogram(num_called, n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats['limits'] = [min_call_rate]

    return {FLT_VARS: filtered_variations, FLT_ID: filter_id,
            FLT_STATS: flt_stats}
def test_calc_maf_by_gt2(self):
    """Check the 5-bin histogram of the genotype based MAFs of test.zarr."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)

    counts, edges = va.histogram(mafs, n_bins=5, limits=(0, 1))
    to_compute = {'counts': counts, 'edges': edges}
    computed = compute(to_compute, silence_runtime_warnings=True)

    expected_counts = [0, 0, 4, 2, 0]
    counts_match = np.all(computed['counts'] == expected_counts)
    self.assertTrue(counts_match)

    # The edges are floats, so they are compared with a tolerance.
    expected_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
    edges_match = np.all(np.isclose(computed['edges'], expected_edges))
    self.assertTrue(edges_match)
def filter_by_obs_heterocigosis(
        variations, max_allowable_het=None, min_allowable_het=None,
        min_call_dp_for_het_call=None, max_call_dp_for_het_call=None,
        filter_id='obs_het',
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        calc_histogram=False, n_bins=DEF_NUM_BINS, limits=None):
    """Filter the variations using the value returned by ``calc_obs_het``.

    The selection itself is delegated to ``_select_vars``, which receives
    the minimum and maximum allowable values.

    Args:
        variations: the variations to filter.
        max_allowable_het: upper threshold, None for no upper threshold.
        min_allowable_het: lower threshold, None for no lower threshold.
        min_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        max_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        filter_id: identifier stored in the result under ``FLT_ID``.
        min_num_genotypes: forwarded to ``calc_obs_het``.
        calc_histogram: also add the histogram of the calculated values to
            the filter statistics.
        n_bins: number of histogram bins.
        limits: histogram range; defaults to ``(0, 1)``.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``).  When the histogram
        is requested the statistics also hold the counts, the bin edges and,
        under 'limits', the thresholds that were not None.
    """
    obs_het = calc_obs_het(
        variations,
        min_num_genotypes=min_num_genotypes,
        min_call_dp_for_het_call=min_call_dp_for_het_call,
        max_call_dp_for_het_call=max_call_dp_for_het_call)
    selection = _select_vars(variations, obs_het,
                             min_allowable=min_allowable_het,
                             max_allowable=max_allowable_het)
    flt_stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(obs_het, n_bins=n_bins,
                                         limits=hist_range)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually given, lower one first.
        flt_stats['limits'] = [
            threshold
            for threshold in (min_allowable_het, max_allowable_het)
            if threshold is not None
        ]

    return {FLT_VARS: selection[FLT_VARS], FLT_ID: filter_id,
            FLT_STATS: flt_stats}