示例#1
0
def keep_variable_variations(variations,
                             max_alleles,
                             filter_id='variable_variations'):
    '''Keep the variations in which more than one allele is observed.

    A variation is selected when both conditions hold:

    * at least one allele in its genotype matrix differs from
      ``MISSING_INT``;
    * ``count_alleles`` (called with ``count_missing=False``) reports a
      count above zero for more than one entry along axis 1
      (presumably one entry per allele -- confirm against
      ``count_alleles``).

    Args:
        variations: object indexable by ``GT_FIELD`` that also provides
            ``get_vars(mask)``.
        max_alleles: passed through unchanged to ``count_alleles``.
        filter_id: value stored under ``FLT_ID`` in the result.

    Returns:
        A dict holding the selected variations (``FLT_VARS``), the filter
        id (``FLT_ID``) and a stats dict (``FLT_STATS``) with the number of
        kept (``N_KEPT``) and discarded (``N_FILTERED_OUT``) variations.
    '''
    genotypes = variations[GT_FIELD]

    # Variations for which more than one allele has a non-zero count.
    counts_per_allele = count_alleles(genotypes,
                                      max_alleles=max_alleles,
                                      count_missing=False)
    is_variable = va.sum(counts_per_allele > 0, axis=1) > 1

    # Variations with at least one allele that is not missing: reduce
    # first over axis 2, then over axis 1.
    call_has_allele = va.any(genotypes != MISSING_INT, axis=2)
    has_any_call = va.any(call_has_allele, axis=1)

    keep_mask = va.logical_and(has_any_call, is_variable)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }

    result = {FLT_VARS: kept_variations}
    result[FLT_ID] = filter_id
    result[FLT_STATS] = stats
    return result
示例#2
0
def gts_as_mat012(gts):
    '''Recode a genotype array into one code per call.

    The alleles of every call are summed along axis 2 and the sums are
    then rewritten in place, in this order:

    1. calls containing ``MISSING_INT`` in any allele -> ``MISSING_INT``
    2. sums of 1 or more                              -> 2
    3. calls coded 2 that carry at least one 0 allele -> 1

    Calls whose sum is 0 keep the code 0.  A call made of two different
    non-zero alleles (e.g. 1/2) therefore ends up coded as 2, not 1.

    NOTE(review): step 2 assumes ``MISSING_INT`` is lower than 1,
    otherwise the missing marker would be overwritten -- confirm.
    '''
    # Both masks depend only on the input, so they can be built up front.
    call_is_missing = va.any(gts == MISSING_INT, axis=2)
    call_has_zero_allele = va.any(gts == 0, axis=2)

    coded = va.sum(gts, axis=2)
    coded[call_is_missing] = MISSING_INT
    coded[coded >= 1] = 2

    het_with_zero = va.logical_and(coded == 2, call_has_zero_allele)
    coded[het_with_zero] = 1

    return coded
示例#3
0
def calc_ld_random_pairs_from_different_chroms(variations, num_pairs,
                                               max_maf=0.95, min_num_gts=10,
                                               silence_runtime_warnings=False):
    '''Yield LD values for random pairs of variations on different chroms.

    This is a generator: all the checks below run lazily, when the first
    item is requested.

    Args:
        variations: object indexable by ``CHROM_FIELD``, ``ALT_FIELD`` and
            ``GT_FIELD``.
        num_pairs: number of pairs to yield.  Zero or a negative number
            yields nothing, and the data is not read or validated.
        max_maf: upper limit for the values returned by
            ``calc_maf_by_gt``; any value above it aborts the calculation.
        min_num_gts: forwarded to ``calc_maf_by_gt`` and to
            ``_calc_rogers_huff_r_for_snp_pair``.
        silence_runtime_warnings: forwarded to
            ``va.make_sure_array_is_in_memory``.

    Yields:
        Tuples ``(chrom1, snp_idx1, chrom2, snp_idx2, ld)`` where the
        indexes are positions along the first axis of the genotype array
        and ``ld`` is the value returned by
        ``_calc_rogers_huff_r_for_snp_pair`` (r or r squared -- TODO
        confirm).  Pairs for which that value is NaN are skipped and do
        not count towards ``num_pairs``.

    Raises:
        ValueError: if fewer than two different chroms are found.
        RuntimeError: if any MAF is NaN or is above ``max_maf``.

    NOTE(review): pairs are drawn with replacement and the loop only ends
    once ``num_pairs`` non-NaN values have been produced, so it does not
    terminate if no pair from different chroms ever gives a non-NaN value.
    '''
    # Nothing was requested: the original loop checked the limit only
    # after yielding, so it always produced at least one pair.
    if num_pairs <= 0:
        return

    chroms = va.make_sure_array_is_in_memory(variations[CHROM_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)

    different_chroms = np.unique(chroms)
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')
    max_alleles = variations[ALT_FIELD].shape[1]

    mafs = calc_maf_by_gt(variations, max_alleles, min_num_gts)
    mafs = va.make_sure_array_is_in_memory(mafs,
        silence_runtime_warnings=silence_runtime_warnings)

    if va.any(va.isnan(mafs)) or va.any(mafs > max_maf):
        # The check fails for MAFs *above* max_maf, so the message says so.
        msg = 'Not enough genotypes or MAF above allowed maximum, Rogers Huff calculations known to go wrong for very high maf'
        raise RuntimeError(msg)

    gts = va.make_sure_array_is_in_memory(variations[GT_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)

    num_variations = gts.shape[0]

    pairs_computed = 0
    while pairs_computed < num_pairs:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            # Only pairs located in different chroms are of interest.
            continue

        r2_ld = _calc_rogers_huff_r_for_snp_pair(gts[snp_idx1],
                                                 gts[snp_idx2],
                                                 min_num_gts=min_num_gts)
        if math.isnan(r2_ld):
            # Undefined result: draw another pair instead.
            continue

        yield chrom1, snp_idx1, chrom2, snp_idx2, r2_ld
        pairs_computed += 1
示例#4
0
def _kosman(vars1, vars2):
    '''Return a per-position distance between two diploid genotype sets.

    The genotypes are first reduced by ``_get_gts_non_missing_in_both``.
    For every position the alleles of each individual that are also found
    in the other individual are counted (0 to 2 per individual, added
    together) and the total is mapped to a distance:

    * total 0 (nothing shared)    -> 1
    * total 4 (everything shared) -> 0
    * any other total             -> 0.5

    Raises:
        ValueError: if the second dimension of the genotypes is not 2.
    '''
    gts_a, gts_b = _get_gts_non_missing_in_both(vars1, vars2)

    ploidy = gts_a.shape[1]
    if ploidy != 2:
        raise ValueError('Only diploid are allowed')

    # Broadcasting compares both alleles of one individual against each
    # allele of the other one; the new leading axis indexes the latter.
    a_vs_each_b = gts_a == gts_b.transpose()[:, :, None]
    b_vs_each_a = gts_b == gts_a.transpose()[:, :, None]

    a_alleles_found_in_b = va.any(b_vs_each_a, axis=2).sum(axis=0)
    b_alleles_found_in_a = va.any(a_vs_each_b, axis=2).sum(axis=0)
    distances = va.add(a_alleles_found_in_b,
                       b_alleles_found_in_a,
                       dtype=np.float64)

    # The order of these in-place rewrites matters: the values written by
    # one step must not be picked up again by the following ones.
    distances[distances == 0] = 1
    distances[distances == 4] = 0

    partially_shared = va.logical_and(distances != 1, distances != 0)
    distances[partially_shared] = 0.5

    return distances
示例#5
0
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    '''Count heterozygous calls and usable calls along ``axis``.

    A call is left out when any of its alleles equals ``MISSING_INT`` or,
    if the corresponding limit is not None, when its ``DP_FIELD`` value is
    below ``min_call_dp_for_het_call`` or above
    ``max_call_dp_for_het_call``.  ``DP_FIELD`` is only read when at least
    one of the limits is given.

    Returns:
        A tuple with the sum along ``axis`` of the mask returned by
        ``_call_is_het`` and the sum along ``axis`` of the calls that were
        not left out.
    '''
    excluded = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    use_min_dp = min_call_dp_for_het_call is not None
    use_max_dp = max_call_dp_for_het_call is not None

    if use_min_dp or use_max_dp:
        depths = variations[DP_FIELD]
        if use_min_dp:
            excluded = va.logical_or(excluded,
                                     depths < min_call_dp_for_het_call)
        if use_max_dp:
            excluded = va.logical_or(excluded,
                                     depths > max_call_dp_for_het_call)

    het_calls = _call_is_het(variations, is_missing=excluded)

    num_het = va.sum(het_calls, axis=axis)
    num_usable = va.sum(va.logical_not(excluded), axis=axis)
    return (num_het, num_usable)