def filter_variations(in_zarr_path, out_zarr_path, samples_to_keep=None,
                      samples_to_remove=None, regions_to_remove=None,
                      regions_to_keep=None, min_call_rate=None,
                      min_dp_setter=None, remove_non_variable_snvs=None,
                      max_allowable_mac=None, max_allowable_het=None,
                      min_call_dp_for_het_call=None, verbose=True,
                      out_fhand=sys.stdout, calc_histogram=False):
    '''Filter a zarr-backed variations store and write the result to a new zarr.

    Each enabled filter appends a task to the pipeline; everything is
    evaluated lazily in a single compute() call at the end.
    '''
    pipeline_tasks = {}
    variations = load_zarr(in_zarr_path)
    max_alleles = variations[ALT_FIELD].shape[1]

    # Seed the chain with the unfiltered variations so every optional filter
    # below can consume the previous task's FLT_VARS output.
    task = {FLT_VARS: variations}

    if samples_to_keep is not None:
        task = keep_samples(task[FLT_VARS], samples_to_keep)
        _add_task_to_pipeline(pipeline_tasks, task)
    if samples_to_remove is not None:
        task = remove_samples(task[FLT_VARS], samples_to_remove)
        _add_task_to_pipeline(pipeline_tasks, task)
    if regions_to_remove is not None:
        task = remove_variations_in_regions(task[FLT_VARS], regions_to_remove)
        _add_task_to_pipeline(pipeline_tasks, task)
    if regions_to_keep is not None:
        task = keep_variations_in_regions(task[FLT_VARS], regions_to_keep)
        _add_task_to_pipeline(pipeline_tasks, task)
    if min_dp_setter is not None:
        task = min_depth_gt_to_missing(task[FLT_VARS], min_depth=min_dp_setter)
        _add_task_to_pipeline(pipeline_tasks, task)
    if remove_non_variable_snvs:
        task = keep_variable_variations(task[FLT_VARS], max_alleles=max_alleles)
        _add_task_to_pipeline(pipeline_tasks, task)
    if max_allowable_mac is not None:
        # Re-express the threshold relative to the number of samples that
        # will remain after the sample filters above; filter_by_mac expects
        # the count in that form.
        if samples_to_keep:
            max_allowable_mac = len(samples_to_keep) - max_allowable_mac
        elif samples_to_remove:
            max_allowable_mac = (len(variations.samples)
                                 - len(samples_to_remove) - max_allowable_mac)
        else:
            max_allowable_mac = len(variations.samples) - max_allowable_mac
        task = filter_by_mac(task[FLT_VARS],
                             max_allowable_mac=max_allowable_mac,
                             max_alleles=max_alleles,
                             calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)
    if min_call_rate:
        task = remove_low_call_rate_vars(task[FLT_VARS],
                                         min_call_rate=min_call_rate,
                                         calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)
    if max_allowable_het is not None and min_call_dp_for_het_call is not None:
        task = filter_by_obs_heterocigosis(
            task[FLT_VARS], max_allowable_het=max_allowable_het,
            min_call_dp_for_het_call=min_call_dp_for_het_call,
            calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)

    # Store the filtered variations lazily and run the whole pipeline in
    # one compute() pass.
    delayed_store = prepare_zarr_storage(task[FLT_VARS], out_zarr_path)
    pipeline_tasks[FLT_VARS] = delayed_store
    result = compute(pipeline_tasks, store_variation_to_memory=False,
                     silence_runtime_warnings=True)

    if verbose:
        for filter_id, task_result in result[FLT_STATS].items():
            if N_KEPT in task_result:
                total = task_result[N_FILTERED_OUT] + task_result[N_KEPT]
                out_fhand.write(f"Filter: {filter_id}\n")
                out_fhand.write("-" * (8 + len(filter_id)) + '\n')
                out_fhand.write(f"Processed: {total}\n")
                out_fhand.write(f"Kept vars: {task_result[N_KEPT]}\n")
                out_fhand.write(
                    f"Filtered out: {task_result[N_FILTERED_OUT]}\n\n")
    return result
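
# A minimal usage sketch of filter_variations, not part of the library: the
# zarr paths and the threshold values below are hypothetical placeholders
# chosen only to show how the optional filters are switched on.
if __name__ == '__main__':
    result = filter_variations(
        'unfiltered.zarr', 'filtered.zarr',  # hypothetical paths
        min_dp_setter=5,                 # genotypes with DP < 5 become missing
        remove_non_variable_snvs=True,   # drop monomorphic variations
        min_call_rate=0.9,               # keep vars called in >= 90% of samples
        max_allowable_mac=1,             # hypothetical allele count limit
        verbose=True, out_fhand=sys.stdout)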
def test_iterate_chunk_pairs(self):
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr', num_vars_per_chunk=1)
    variations = remove_low_call_rate_vars(variations,
                                           min_call_rate=0)[FLT_VARS]
    for pair in iterate_chunk_pairs(variations, max_distance=100000):
        # The original assertTrue(len(p), 2) could never fail: the second
        # argument of assertTrue is only the failure message. The intended
        # check is that every yielded pair holds exactly two chunks.
        self.assertEqual(len(pair), 2)
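
# Usage sketch (path and max_distance copied from the test above): each
# yielded pair unpacks into two chunks of nearby variations.
variations = load_zarr(TEST_DATA_DIR / 'test.zarr', num_vars_per_chunk=1)
for chunk1, chunk2 in iterate_chunk_pairs(variations, max_distance=100000):
    print(chunk1, chunk2)  # stand-in for real per-pair processing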
def create_dask_variations(num_vars_per_chunk=DEFAULT_VARIATION_NUM_IN_CHUNK):
    return load_zarr(TEST_DATA_DIR / 'test.zarr',
                     num_vars_per_chunk=num_vars_per_chunk)
def _load_one_dask():
    return load_zarr(TEST_DATA_DIR / 'test.zarr')
def _create_empty_dask_variations():
    # A min_call_rate above 1.0 can never be met, so every variation is
    # filtered out, which yields an empty variations object for edge-case
    # tests.
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    return remove_low_call_rate_vars(variations, min_call_rate=1.1)[FLT_VARS]
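
# Hypothetical test sketch (the name and assertion are invented, not from the
# original suite) showing how the factory above is meant to be used: code
# paths that must survive empty inputs can be exercised cheaply.
def test_compute_handles_empty_variations(self):
    empty_variations = _create_empty_dask_variations()
    processed = compute({'vars': empty_variations},
                        store_variation_to_memory=True)
    self.assertIn('vars', processed)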
def test_compute_vars_to_memory(self):
    zarr_path = TEST_DATA_DIR / 'test.zarr'
    variations = load_zarr(zarr_path)
    da1 = da.from_array(np.array([1, 2, 3, 4, 5]))
    da2 = da.from_array(np.array([6, 7, 8, 9, 0]))
    da3 = da1 + da2
    initial = {'vars': variations,
               'data': {'da1': da1, 'da2': da2, 'da3': da3}}
    processed = compute(initial, store_variation_to_memory=True)
    variations2 = processed['vars']
    self.assertTrue(
        np.all(variations.samples.compute() == variations2.samples))
    self.assertTrue(
        np.all(variations[GT_FIELD].compute() == variations2[GT_FIELD]))
    self.assertTrue(np.all(processed['data']['da1'] == [1, 2, 3, 4, 5]))
    self.assertTrue(np.all(processed['data']['da3'] == [7, 9, 11, 13, 5]))

    # If we are not storing or computing to memory, the variations
    # should be removed from the compute result.
    zarr_path = TEST_DATA_DIR / 'test.zarr'
    variations = load_zarr(zarr_path)
    da1 = da.from_array(np.array([1, 2, 3, 4, 5]))
    da2 = da.from_array(np.array([6, 7, 8, 9, 0]))
    da3 = da1 + da2
    initial = {'vars': variations,
               'data': {'da1': da1, 'da2': da2, 'da3': da3}}
    processed = compute(initial, store_variation_to_memory=False)
    self.assertNotIn('vars', processed)

    # Nested dicts are computed recursively.
    initial = {'vars': variations,
               'data': {'da': {'da1': da1, 'da2': da2, 'da3': da3}}}
    processed = compute(initial, store_variation_to_memory=False)
    assert np.all(processed['data']['da']['da1'] == [1, 2, 3, 4, 5])
    assert np.all(processed['data']['da']['da2'] == [6, 7, 8, 9, 0])
    assert np.all(processed['data']['da']['da3'] == [7, 9, 11, 13, 5])
    self.assertNotIn('vars', processed)

    # ... including at deeper nesting levels.
    initial = {'vars': variations,
               'data': {'d': {'da': {'da1': da1, 'da2': da2, 'da3': da3}}}}
    processed = compute(initial, store_variation_to_memory=False)
    assert np.all(processed['data']['d']['da']['da1'] == [1, 2, 3, 4, 5])
    assert np.all(processed['data']['d']['da']['da2'] == [6, 7, 8, 9, 0])
    assert np.all(processed['data']['d']['da']['da3'] == [7, 9, 11, 13, 5])