예제 #1
0
def remove_low_call_rate_samples(variations,
                                 min_call_rate,
                                 rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False,
                                 n_bins=DEF_NUM_BINS,
                                 limits=None):
    '''Keep the samples whose called-genotype value reaches min_call_rate.

    Args:
        variations: object handed to calc_missing_gt_per_sample and
            keep_samples_with_mask; its ``gt`` attribute is read when
            ``rates`` is False.
        min_call_rate: inclusive lower threshold for the per-sample called
            value.
        rates: forwarded to calc_missing_gt_per_sample. When True the called
            value is ``1 - missing``; otherwise it is the size of axis 0 of
            ``variations.gt`` minus the missing value.
        filter_id: value stored under FLT_ID in the result.
        calc_histogram: when True, a histogram of the called values is added
            to the filter stats.
        n_bins: number of bins passed to va.histogram.
        limits: histogram limits passed to va.histogram. When None they
            default to ``(0, 1)`` for rates, or ``(0, total)`` for counts,
            where total is the size of axis 0 of ``variations.gt``.

    Returns:
        dict with the filtered variations (FLT_VARS), the filter id (FLT_ID)
        and the filter statistics (FLT_STATS).
    '''
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)

    # Largest value num_called can take: 1 for rates, otherwise the total
    # that the missing counts are subtracted from.
    if rates:
        max_called = 1
    else:
        max_called = utils_array.get_shape_item(variations.gt, 0)
    num_called = max_called - num_missing_gts

    selected_samples = num_called >= min_call_rate
    variations = keep_samples_with_mask(variations, selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples
    }

    if calc_histogram:
        # Respect limits given by the caller; only fall back to the full
        # possible range when none were supplied.
        if limits is None:
            limits = (0, max_called)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]
    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
예제 #2
0
def keep_variable_variations(variations,
                             max_alleles,
                             filter_id='variable_variations'):
    '''Keep the variations that have called genotypes and more than one allele.

    Args:
        variations: object indexed with GT_FIELD to obtain the genotypes and
            filtered through its ``get_vars`` method.
        max_alleles: forwarded to count_alleles.
        filter_id: value stored under FLT_ID in the result.

    Returns:
        dict with the selected variations (FLT_VARS), the filter id (FLT_ID)
        and the kept / filtered-out counts (FLT_STATS).
    '''
    genotypes = variations[GT_FIELD]

    # First criterion: something other than MISSING_INT is present.
    # NOTE(review): presumably genotypes is (variation, sample, allele copy),
    # so this reduces axis 2 and then axis 1 -- confirm against the caller.
    called_somewhere = va.any(genotypes != MISSING_INT, axis=2)
    has_called_gt = va.any(called_somewhere, axis=1)

    # Second criterion: more than one allele has a non-zero count.
    counts_per_allele = count_alleles(genotypes,
                                      max_alleles=max_alleles,
                                      count_missing=False)
    observed_alleles = va.sum(counts_per_allele > 0, axis=1)
    is_variable = observed_alleles > 1

    keep_mask = va.logical_and(has_called_gt, is_variable)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask))
    }

    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
예제 #3
0
def remove_low_call_rate_vars(variations,
                              min_call_rate,
                              rates=True,
                              filter_id='call_rate',
                              calc_histogram=False,
                              n_bins=DEF_NUM_BINS,
                              limits=None):
    '''Keep the variations whose called-genotype value reaches min_call_rate.

    Args:
        variations: object handed to calc_missing_gt and filtered through its
            ``get_vars`` method; its ``gt`` attribute is read when ``rates``
            is False.
        min_call_rate: inclusive lower threshold for the per-variation called
            value.
        rates: forwarded to calc_missing_gt. When True the called value is
            ``1 - missing``; otherwise it is ``variations.gt.shape[1]`` minus
            the missing value.
        filter_id: value stored under FLT_ID in the result.
        calc_histogram: when True, a histogram of the called values is added
            to the filter stats.
        n_bins: number of bins passed to va.histogram.
        limits: histogram limits passed to va.histogram. When None they
            default to ``(0, 1)`` for rates, or
            ``(0, variations.gt.shape[1])`` for counts.

    Returns:
        dict with the filtered variations (FLT_VARS), the filter id (FLT_ID)
        and the filter statistics (FLT_STATS).
    '''
    num_missing_gts = calc_missing_gt(variations, rates=rates)

    # Largest value num_called can take: 1 for rates, otherwise the total
    # that the missing counts are subtracted from.
    if rates:
        max_called = 1
    else:
        max_called = variations.gt.shape[1]
    num_called = max_called - num_missing_gts

    selected_vars = num_called >= min_call_rate
    variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        # Respect limits given by the caller; only fall back to the full
        # possible range when none were supplied.
        if limits is None:
            limits = (0, max_called)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats['limits'] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
예제 #4
0
def _filter_by_snp_position(variations, regions, filter_id, reverse=False):
    '''Filter variations with the mask built by _select_variations_in_region.

    Args:
        variations: object passed to _select_variations_in_region and
            filtered through its ``get_vars`` method.
        regions: forwarded unchanged to _select_variations_in_region.
        filter_id: value stored under FLT_ID in the result.
        reverse: when True the mask is negated before filtering.

    Returns:
        dict with the selected variations (FLT_VARS), the filter id (FLT_ID)
        and the kept / filtered-out counts (FLT_STATS).
    '''
    region_mask = _select_variations_in_region(variations, regions)
    keep_mask = va.logical_not(region_mask) if reverse else region_mask

    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask))
    }

    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
예제 #5
0
def _select_vars(variations, stats, min_allowable=None, max_allowable=None):
    '''Keep the variations whose stat lies within the given inclusive bounds.

    Args:
        variations: object filtered through its ``get_vars`` method.
        stats: values compared against the bounds with ``>=`` and ``<=``.
        min_allowable: inclusive lower bound, or None for no lower bound.
        max_allowable: inclusive upper bound, or None for no upper bound.

    Returns:
        dict with the selected variations (FLT_VARS) and the kept /
        filtered-out counts (FLT_STATS). When both bounds are None the mask
        comes from _filter_no_row(variations).
    '''
    has_min = min_allowable is not None
    has_max = max_allowable is not None

    # Invalid-value warnings raised by the comparisons are silenced.
    with np.errstate(invalid='ignore'):
        below_max = stats <= max_allowable if has_max else None
        above_min = stats >= min_allowable if has_min else None

    if has_min and has_max:
        keep_mask = above_min & below_max
    elif has_min:
        keep_mask = above_min
    elif has_max:
        keep_mask = below_max
    else:
        keep_mask = _filter_no_row(variations)

    kept_variations = variations.get_vars(keep_mask)

    counts = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask))
    }

    return {FLT_VARS: kept_variations, FLT_STATS: counts}
예제 #6
0
def calc_dset_pop_distance(variations,
                           max_alleles,
                           populations,
                           min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                           min_call_dp_for_het=0,
                           silence_runtime_warnings=False):
    '''Compute a distance for every pair of populations.

    This is an implementation of the formulas proposed in GenAlex.

    Args:
        variations: object passed to keep_samples once per population.
        max_alleles: forwarded to _calc_pairwise_dest.
        populations: sequence whose items are each passed to keep_samples as
            the samples of one population.
        min_num_genotypes: forwarded to _calc_pairwise_dest.
        min_call_dp_for_het: forwarded to _calc_pairwise_dest.
        silence_runtime_warnings: forwarded to compute.

    Returns:
        1-D numpy array with one distance per population pair, in the order
        produced by ``combinations(range(len(populations)), 2)``.
    '''
    pop_ids = list(range(len(populations)))
    variations_per_pop = [
        keep_samples(variations, pop_samples)[FLT_VARS]
        for pop_samples in populations
    ]
    # Materialised once so both loops walk the pairs in the same order.
    pop_pairs = list(combinations(pop_ids, 2))

    accumulated_hs = {}
    accumulated_ht = {}
    num_vars = {}

    # Each pair appears exactly once, so the per-pair values are stored
    # directly; there is nothing to accumulate onto.
    for pop_id1, pop_id2 in pop_pairs:
        res = _calc_pairwise_dest(variations_per_pop[pop_id1],
                                  variations_per_pop[pop_id2],
                                  max_alleles=max_alleles,
                                  min_call_dp_for_het=min_call_dp_for_het,
                                  min_num_genotypes=min_num_genotypes)

        key = (pop_id1, pop_id2)
        # Entries whose corrected_hs is NaN are left out of the count, and
        # nansum leaves NaNs out of the totals.
        num_vars[key] = va.count_nonzero(~va.isnan(res['corrected_hs']))
        accumulated_hs[key] = va.nansum(res['corrected_hs'])
        accumulated_ht[key] = va.nansum(res['corrected_ht'])

    task = {
        'accumulated_hs': accumulated_hs,
        'accumulated_ht': accumulated_ht,
        'num_vars': num_vars
    }

    result = compute(task, silence_runtime_warnings=silence_runtime_warnings)
    computed_accumulated_hs = result['accumulated_hs']
    computed_accumulated_ht = result['accumulated_ht']
    computed_num_vars = result['num_vars']

    # The distance is always computed between two populations at a time.
    num_pops = 2
    dists = np.full(len(pop_pairs), np.nan)
    for idx, key in enumerate(pop_pairs):
        # A pair with zero counted variations gives 0 / 0; the resulting
        # invalid-value warning is silenced and the mean becomes NaN.
        with np.errstate(invalid='ignore'):
            corrected_hs = computed_accumulated_hs[key] / computed_num_vars[key]
            corrected_ht = computed_accumulated_ht[key] / computed_num_vars[key]
        dists[idx] = (num_pops /
                      (num_pops - 1)) * ((corrected_ht - corrected_hs) /
                                         (1 - corrected_hs))
    return dists