示例#1
0
def filter_variations(in_zarr_path,
                      out_zarr_path,
                      samples_to_keep=None,
                      samples_to_remove=None,
                      regions_to_remove=None,
                      regions_to_keep=None,
                      min_call_rate=None,
                      min_dp_setter=None,
                      remove_non_variable_snvs=None,
                      max_allowable_mac=None,
                      max_allowable_het=None,
                      min_call_dp_for_het_call=None,
                      verbose=True,
                      out_fhand=sys.stdout,
                      calc_histogram=False):
    """Run a configurable filtering pipeline over a zarr variation store.

    The variations loaded from ``in_zarr_path`` are passed through every
    filter whose keyword argument is enabled (arguments left at their
    ``None``/``False`` defaults are skipped), and the surviving variations
    are written to ``out_zarr_path``.  The steps are chained lazily: each
    filter consumes the ``FLT_VARS`` output of the previous one, and the
    whole pipeline runs in a single ``compute()`` call at the end.

    When ``verbose`` is true, per-filter kept/filtered-out counts are
    written to ``out_fhand``.  Returns the dict produced by ``compute()``,
    which holds the per-filter statistics under ``FLT_STATS``.
    """

    pipeline_tasks = {}
    variations = load_zarr(in_zarr_path)
    # Number of ALT alleles encoded in the store; needed by the variability
    # and MAC filters below.
    max_alleles = variations[ALT_FIELD].shape[1]
    # Seed the chain: every step reads task[FLT_VARS] and rebinds task, so
    # the order of the if-blocks below is the order the filters are applied.
    task = {FLT_VARS: variations}

    if samples_to_keep is not None:
        task = keep_samples(task[FLT_VARS], samples_to_keep)
        _add_task_to_pipeline(pipeline_tasks, task)

    if samples_to_remove is not None:
        task = remove_samples(task[FLT_VARS], samples_to_remove)
        _add_task_to_pipeline(pipeline_tasks, task)

    if regions_to_remove is not None:
        task = remove_variations_in_regions(task[FLT_VARS], regions_to_remove)
        _add_task_to_pipeline(pipeline_tasks, task)

    if regions_to_keep is not None:
        task = keep_variations_in_regions(task[FLT_VARS], regions_to_keep)
        _add_task_to_pipeline(pipeline_tasks, task)

    # Genotype calls below this depth are set to missing before any
    # call-rate / MAC / het statistics are computed downstream.
    if min_dp_setter is not None:
        task = min_depth_gt_to_missing(task[FLT_VARS], min_depth=min_dp_setter)
        _add_task_to_pipeline(pipeline_tasks, task)

    if remove_non_variable_snvs:
        task = keep_variable_variations(task[FLT_VARS],
                                        max_alleles=max_alleles)
        _add_task_to_pipeline(pipeline_tasks, task)

    if max_allowable_mac is not None:
        # Rescale the MAC threshold relative to the number of samples that
        # remain after the sample filters above.  The resulting value is
        # (n_kept_samples - max_allowable_mac).
        # NOTE(review): presumably filter_by_mac expects the threshold
        # expressed in terms of the major allele count — confirm against
        # filter_by_mac.  Also note the truthiness tests here (empty lists
        # fall through to the `else` branch, unlike the `is not None`
        # checks used earlier).
        if samples_to_keep:
            max_allowable_mac = len(samples_to_keep) - max_allowable_mac
        elif samples_to_remove:
            max_allowable_mac = len(variations.samples) - \
                len(samples_to_remove) - max_allowable_mac
        else:
            max_allowable_mac = len(variations.samples) - max_allowable_mac
        task = filter_by_mac(task[FLT_VARS],
                             max_allowable_mac=max_allowable_mac,
                             max_alleles=max_alleles,
                             calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)

    # NOTE(review): truthiness test — a min_call_rate of 0 silently
    # disables this filter (may be intentional, since it would keep
    # everything anyway).
    if min_call_rate:
        task = remove_low_call_rate_vars(task[FLT_VARS],
                                         min_call_rate=min_call_rate,
                                         calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)

    # Both parameters are required for the heterozygosity filter; giving
    # only one of them silently skips it.
    if max_allowable_het is not None and min_call_dp_for_het_call is not None:
        task = filter_by_obs_heterocigosis(
            task[FLT_VARS],
            max_allowable_het=max_allowable_het,
            min_call_dp_for_het_call=min_call_dp_for_het_call,
            calc_histogram=calc_histogram)
        _add_task_to_pipeline(pipeline_tasks, task)

    # Replace the final FLT_VARS entry with a delayed store that writes the
    # filtered variations to out_zarr_path when the pipeline is computed.
    delayed_store = prepare_zarr_storage(task[FLT_VARS], out_zarr_path)
    pipeline_tasks[FLT_VARS] = delayed_store

    result = compute(pipeline_tasks,
                     store_variation_to_memory=False,
                     silence_runtime_warnings=True)

    if verbose:
        # Print a small per-filter report; only tasks that expose N_KEPT
        # (i.e. actual variation filters) are reported.
        for filter_id, task_result in result[FLT_STATS].items():
            if N_KEPT in task_result:
                total = task_result[N_FILTERED_OUT] + task_result[N_KEPT]
                out_fhand.write(f"Filter: {filter_id}\n")
                out_fhand.write("-" * (8 + len(filter_id)) + '\n')
                out_fhand.write(f"Processed: {total}\n")
                out_fhand.write(f"Kept vars: {task_result[N_KEPT]}\n")
                out_fhand.write(
                    f"Filtered out: {task_result[N_FILTERED_OUT]}\n\n")

    return result
示例#2
0
 def test_iterate_chunk_pairs(self):
     """Every pair yielded by iterate_chunk_pairs must contain exactly two chunks."""
     variations = load_zarr(TEST_DATA_DIR / 'test.zarr', num_vars_per_chunk=1)
     variations = remove_low_call_rate_vars(variations, min_call_rate=0)[FLT_VARS]
     for pair in iterate_chunk_pairs(variations, max_distance=100000):
         # Bug fix: the original `self.assertTrue(len(p), 2)` passed 2 as the
         # failure *message*, so it succeeded for any non-empty pair.  The
         # intent is to check the pair length.
         self.assertEqual(len(pair), 2)
示例#3
0
def create_dask_variations(num_vars_per_chunk=DEFAULT_VARIATION_NUM_IN_CHUNK):
    """Load the test zarr dataset as dask-backed variations, chunked as requested."""
    zarr_path = TEST_DATA_DIR / 'test.zarr'
    return load_zarr(zarr_path, num_vars_per_chunk=num_vars_per_chunk)
示例#4
0
def _load_one_dask():
    """Return the test zarr dataset loaded as dask-backed variations."""
    return load_zarr(TEST_DATA_DIR / 'test.zarr')
示例#5
0
def _create_empty_dask_variations():
    """Return an empty variations object.

    Achieved by demanding an impossible call rate (> 1.0), so every
    variation is filtered out.
    """
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    filtered = remove_low_call_rate_vars(variations, min_call_rate=1.1)
    return filtered[FLT_VARS]
示例#6
0
    def test_compute_vars_to_memory(self):
        """compute() materializes dask arrays (recursively) and keeps the
        variations entry only when store_variation_to_memory is True."""
        zarr_path = TEST_DATA_DIR / 'test.zarr'
        variations = load_zarr(zarr_path)
        left = da.from_array(np.array([1, 2, 3, 4, 5]))
        right = da.from_array(np.array([6, 7, 8, 9, 0]))
        total = left + right

        task = {'vars': variations,
                'data': {'da1': left, 'da2': right, 'da3': total}}
        processed = compute(task, store_variation_to_memory=True)
        in_memory_vars = processed['vars']
        self.assertTrue(
            np.all(variations.samples.compute() == in_memory_vars.samples))
        self.assertTrue(
            np.all(variations[GT_FIELD].compute() == in_memory_vars[GT_FIELD]))
        self.assertTrue(np.all(processed['data']['da1'] == [1, 2, 3, 4, 5]))
        self.assertTrue(np.all(processed['data']['da3'] == [7, 9, 11, 13, 5]))

        # if we are not storing or computing to memory, the variation
        # should be removed from the compute result
        zarr_path = TEST_DATA_DIR / 'test.zarr'
        variations = load_zarr(zarr_path)
        left = da.from_array(np.array([1, 2, 3, 4, 5]))
        right = da.from_array(np.array([6, 7, 8, 9, 0]))
        total = left + right

        task = {'vars': variations,
                'data': {'da1': left, 'da2': right, 'da3': total}}
        processed = compute(task, store_variation_to_memory=False)
        self.assertNotIn('vars', processed)

        # one level of nesting is resolved recursively
        task = {'vars': variations,
                'data': {'da': {'da1': left, 'da2': right, 'da3': total}}}
        processed = compute(task, store_variation_to_memory=False)
        self.assertTrue(
            np.all(processed['data']['da']['da1'] == [1, 2, 3, 4, 5]))
        self.assertTrue(
            np.all(processed['data']['da']['da2'] == [6, 7, 8, 9, 0]))
        self.assertTrue(
            np.all(processed['data']['da']['da3'] == [7, 9, 11, 13, 5]))
        self.assertNotIn('vars', processed)

        # and so is a deeper level of nesting
        task = {'vars': variations,
                'data': {'d': {'da': {'da1': left,
                                      'da2': right,
                                      'da3': total}}}}
        processed = compute(task, store_variation_to_memory=False)
        self.assertTrue(
            np.all(processed['data']['d']['da']['da1'] == [1, 2, 3, 4, 5]))
        self.assertTrue(
            np.all(processed['data']['d']['da']['da2'] == [6, 7, 8, 9, 0]))
        self.assertTrue(
            np.all(processed['data']['d']['da']['da3'] == [7, 9, 11, 13, 5]))