예제 #1
0
def remove_low_call_rate_samples(variations,
                                 min_call_rate,
                                 rates=True,
                                 filter_id='sample_call_rate',
                                 calc_histogram=False,
                                 n_bins=DEF_NUM_BINS,
                                 limits=None):
    """Remove the samples whose amount of called genotypes is too low.

    The missing genotypes per sample are obtained from
    ``calc_missing_gt_per_sample``. A sample is kept when its called value
    (``1 - missing`` when ``rates`` is true, ``number of rows of
    variations.gt - missing`` otherwise) is ``>= min_call_rate``.

    Args:
        variations: Object exposing a ``gt`` array; it is passed to
            ``calc_missing_gt_per_sample`` and ``keep_samples_with_mask``.
        min_call_rate: Threshold for the called value. It must be given in
            the same unit selected by ``rates``.
        rates: Forwarded to ``calc_missing_gt_per_sample``; selects whether
            the threshold is compared against ``1 - missing`` or against a
            count.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        calc_histogram: When true, a histogram of the called values of all
            the samples (kept and removed) is added to the statistics.
        n_bins: Number of histogram bins.
        limits: Optional ``(low, high)`` range for the histogram. When it
            is None the range is ``(0, 1)`` for rates and
            ``(0, number of rows of variations.gt)`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt_per_sample(variations, rates=rates)
    if rates:
        max_called = 1
        num_called = 1 - num_missing_gts
    else:
        # Read before ``variations`` is rebound to the filtered object so
        # the histogram upper limit matches the values in ``num_called``.
        max_called = utils_array.get_shape_item(variations.gt, 0)
        num_called = max_called - num_missing_gts

    selected_samples = num_called >= min_call_rate
    variations = keep_samples_with_mask(variations, selected_samples)[FLT_VARS]

    num_selected_samples = va.count_nonzero(selected_samples)
    num_filtered_samples = va.count_nonzero(va.logical_not(selected_samples))

    flt_stats = {
        N_SAMPLES_KEPT: num_selected_samples,
        N_SAMPLES_FILTERED_OUT: num_filtered_samples
    }

    if calc_histogram:
        # Honour the limits given by the caller; only fall back to the
        # default range when none were provided.
        if limits is None:
            limits = (0, max_called)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats[HIST_RANGE] = [min_call_rate]
    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
예제 #2
0
def keep_variable_variations(variations,
                             max_alleles,
                             filter_id='variable_variations'):
    """Keep only the variations in which more than one allele is observed.

    A variation is selected when both conditions hold:

    * at least one of its genotype calls holds a value different from
      ``MISSING_INT``;
    * ``count_alleles`` (called with ``count_missing=False``) reports more
      than one allele with a count above zero.

    Args:
        variations: Object indexable by ``GT_FIELD`` and providing
            ``get_vars(mask)``.
        max_alleles: Forwarded to ``count_alleles``.
        filter_id: Identifier stored under ``FLT_ID`` in the result.

    Returns:
        A dict with the selected variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the kept / filtered-out counts (``FLT_STATS``).
    """
    genotypes = variations[GT_FIELD]

    # First condition: the variation has some non-missing genotype data.
    call_has_data = va.any(genotypes != MISSING_INT, axis=2)
    var_has_data = va.any(call_has_data, axis=1)

    # Second condition: more than one allele was actually counted.
    counts_per_allele = count_alleles(genotypes,
                                      max_alleles=max_alleles,
                                      count_missing=False)
    var_is_variable = va.sum(counts_per_allele > 0, axis=1) > 1

    keep_mask = va.logical_and(var_has_data, var_is_variable)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
예제 #3
0
def calc_diversities(variations,
                     max_alleles,
                     min_num_genotypes,
                     min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                     polymorphic_threshold=0.95):
    """Collect several diversity statistics for a set of variations.

    Args:
        variations: Object passed to ``calc_maf_by_gt``,
            ``calc_expected_het`` and ``calc_obs_het``; it must expose
            ``num_variations``.
        max_alleles: Forwarded to ``calc_maf_by_gt`` and
            ``calc_expected_het``.
        min_num_genotypes: Forwarded to the three calculations above.
        min_call_dp_for_het_call: Forwarded to ``calc_obs_het``.
        polymorphic_threshold: A variation is counted as polymorphic when
            its (non-NaN) MAF is ``<=`` this value.

    Returns:
        A dict with the keys ``num_variable_vars``,
        ``num_polymorphic_vars``, ``exp_het``, ``obs_het`` and
        ``num_total_variations``.
    """
    mafs = calc_maf_by_gt(variations,
                          max_alleles,
                          min_num_genotypes=min_num_genotypes)
    # NaN MAFs are left out of both counts.
    valid_mafs = mafs[va.logical_not(va.isnan(mafs))]

    # Any MAF below (almost) 1 means that more than one allele was seen.
    n_variable = va.sum(valid_mafs < 0.9999999999)
    n_polymorphic = va.sum(valid_mafs <= polymorphic_threshold)

    mean_exp_het = va.nanmean(
        calc_expected_het(variations,
                          max_alleles=max_alleles,
                          min_num_genotypes=min_num_genotypes))
    mean_obs_het = va.nanmean(
        calc_obs_het(variations,
                     min_call_dp_for_het_call=min_call_dp_for_het_call,
                     min_num_genotypes=min_num_genotypes))

    return {
        'num_variable_vars': n_variable,
        'num_polymorphic_vars': n_polymorphic,
        'exp_het': mean_exp_het,
        'obs_het': mean_obs_het,
        'num_total_variations': variations.num_variations,
    }
예제 #4
0
def remove_low_call_rate_vars(variations,
                              min_call_rate,
                              rates=True,
                              filter_id='call_rate',
                              calc_histogram=False,
                              n_bins=DEF_NUM_BINS,
                              limits=None):
    """Remove the variations whose amount of called genotypes is too low.

    The missing genotypes per variation are obtained from
    ``calc_missing_gt``. A variation is kept when its called value
    (``1 - missing`` when ``rates`` is true, ``variations.gt.shape[1] -
    missing`` otherwise) is ``>= min_call_rate``.

    Args:
        variations: Object exposing a ``gt`` array and ``get_vars(mask)``.
        min_call_rate: Threshold for the called value. It must be given in
            the same unit selected by ``rates``.
        rates: Forwarded to ``calc_missing_gt``; selects whether the
            threshold is compared against ``1 - missing`` or against a
            count.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        calc_histogram: When true, a histogram of the called values of all
            the variations (kept and removed) is added to the statistics.
        n_bins: Number of histogram bins.
        limits: Optional ``(low, high)`` range for the histogram. When it
            is None the range is ``(0, 1)`` for rates and
            ``(0, variations.gt.shape[1])`` for counts.

    Returns:
        A dict with the filtered variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the statistics (``FLT_STATS``).
    """
    num_missing_gts = calc_missing_gt(variations, rates=rates)
    if rates:
        max_called = 1
        num_called = 1 - num_missing_gts
    else:
        # Second dimension of the genotype array; also the largest value
        # that ``num_called`` can take, so it is reused as histogram limit.
        max_called = variations.gt.shape[1]
        num_called = max_called - num_missing_gts

    selected_vars = num_called >= min_call_rate
    variations = variations.get_vars(selected_vars)

    num_selected_vars = va.count_nonzero(selected_vars)
    num_filtered = va.count_nonzero(va.logical_not(selected_vars))

    flt_stats = {N_KEPT: num_selected_vars, N_FILTERED_OUT: num_filtered}

    if calc_histogram:
        # Honour the limits given by the caller; only fall back to the
        # default range when none were provided.
        if limits is None:
            limits = (0, max_called)
        counts, bin_edges = va.histogram(num_called,
                                         n_bins=n_bins,
                                         limits=limits)
        flt_stats[COUNT] = counts
        flt_stats[BIN_EDGES] = bin_edges
        flt_stats['limits'] = [min_call_rate]

    return {FLT_VARS: variations, FLT_ID: filter_id, FLT_STATS: flt_stats}
예제 #5
0
def _filter_by_snp_position(variations, regions, filter_id, reverse=False):
    """Keep the variations located inside (or outside) the given regions.

    Args:
        variations: Object passed to ``_select_variations_in_region`` and
            providing ``get_vars(mask)``.
        regions: Regions forwarded to ``_select_variations_in_region``.
        filter_id: Identifier stored under ``FLT_ID`` in the result.
        reverse: When true the selection mask is negated, so the
            variations NOT selected by the regions are the ones kept.

    Returns:
        A dict with the selected variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the kept / filtered-out counts (``FLT_STATS``).
    """
    in_regions = _select_variations_in_region(variations, regions)
    keep_mask = va.logical_not(in_regions) if reverse else in_regions

    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
예제 #6
0
def _call_is_het(variations, is_missing=None):
    """Return a boolean mask of the genotype calls that are heterozygous.

    A call is considered heterozygous when ``_call_is_hom`` does not flag
    it as homozygous. If ``is_missing`` is given, the positions selected by
    it are forced to False in the result.
    """
    het_mask = va.logical_not(_call_is_hom(variations, is_missing=is_missing))
    if is_missing is None:
        return het_mask

    # A missing call is neither homozygous nor heterozygous.
    het_mask[is_missing] = False
    return het_mask
예제 #7
0
def hmean(array, axis=0, dtype=None):
    """Compute the harmonic mean of ``array`` along ``axis``.

    The result is ``size / sum(1 / array)``. Positions where the sum of
    the reciprocals is not finite are set to NaN.

    Args:
        array: Array with the values to average.
        axis: Axis along which the mean is computed. With None the array
            is flattened first and all its items are used.
        dtype: Forwarded to ``va.sum``.

    Returns:
        The harmonic mean(s), with NaN where the reciprocal sum is not
        finite.
    """
    if axis is None:
        array = array.ravel()
    n_items = array.shape[0 if axis is None else axis]

    # Zeros yield infinite reciprocals; the division warning is silenced
    # because those positions are replaced by NaN below.
    with np.errstate(divide='ignore'):
        reciprocal_sum = va.sum(1.0 / array, axis=axis, dtype=dtype)

    not_finite = va.logical_not(va.isfinite(reciprocal_sum))
    result = n_items / reciprocal_sum
    result[not_finite] = np.nan
    return result
예제 #8
0
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    """Count the heterozygous and the usable genotype calls along ``axis``.

    A call is treated as missing when any of its alleles equals
    ``MISSING_INT`` or, if depth limits are given, when its ``DP_FIELD``
    value is below the minimum or above the maximum.

    Args:
        variations: Object indexable by ``GT_FIELD`` and ``DP_FIELD``.
        axis: Axis along which the counts are summed.
        min_call_dp_for_het_call: Minimum depth for a call to be used, or
            None to skip the check.
        max_call_dp_for_het_call: Maximum depth for a call to be used, or
            None to skip the check.

    Returns:
        A ``(number of het calls, number of non-missing calls)`` tuple.
    """
    min_dp = min_call_dp_for_het_call
    max_dp = max_call_dp_for_het_call

    is_missing = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    # The depth field is only read when some depth limit was requested.
    if not (min_dp is None and max_dp is None):
        depths = variations[DP_FIELD]
        if min_dp is not None:
            is_missing = va.logical_or(is_missing, depths < min_dp)
        if max_dp is not None:
            is_missing = va.logical_or(is_missing, depths > max_dp)

    is_het = _call_is_het(variations, is_missing=is_missing)

    num_het = va.sum(is_het, axis=axis)
    num_called = va.sum(va.logical_not(is_missing), axis=axis)
    return (num_het, num_called)
예제 #9
0
def _get_gts_non_missing_in_both(vars1, vars2):
    """Return the genotypes at index 0 for the rows called in both inputs.

    A row is discarded as soon as ``calc_missing_gt(..., rates=True)``
    returns a non-zero value for it in either input.

    Args:
        vars1: First object, indexable by ``GT_FIELD``.
        vars2: Second object, indexable by ``GT_FIELD``.

    Returns:
        A tuple with the ``[:, 0]`` slice of the retained genotypes of
        ``vars1`` and of ``vars2``.
    """
    missing1 = calc_missing_gt(vars1, rates=True)
    missing2 = calc_missing_gt(vars2, rates=True)

    # Any non-zero missing value counts as "missing" for the row.
    missing_in_some = va.logical_or(missing1, missing2)
    called_in_both = va.logical_not(missing_in_some)

    kept_gts1 = vars1[GT_FIELD][called_in_both]
    kept_gts2 = vars2[GT_FIELD][called_in_both]

    return kept_gts1[:, 0], kept_gts2[:, 0]
예제 #10
0
def _select_vars(variations, stats, min_allowable=None, max_allowable=None):
    """Keep the variations whose statistic lies within the given bounds.

    Args:
        variations: Object providing ``get_vars(mask)``.
        stats: Values compared against the bounds.
        min_allowable: Inclusive lower bound, or None for no lower bound.
        max_allowable: Inclusive upper bound, or None for no upper bound.

    Returns:
        A dict with the selected variations (``FLT_VARS``) and the kept /
        filtered-out counts (``FLT_STATS``). When no bound is given the
        mask comes from ``_filter_no_row``.
    """
    # Invalid-value warnings raised by the comparisons are silenced.
    with np.errstate(invalid='ignore'):
        below_max = stats <= max_allowable if max_allowable is not None else None
        above_min = stats >= min_allowable if min_allowable is not None else None

    if below_max is None and above_min is None:
        keep_mask = _filter_no_row(variations)
    elif below_max is None:
        keep_mask = above_min
    elif above_min is None:
        keep_mask = below_max
    else:
        keep_mask = above_min & below_max

    variations = variations.get_vars(keep_mask)

    stats_summary = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: variations, FLT_STATS: stats_summary}