示例#1
0
def locate_unlinked(gn, size=100, step=20, threshold=.1, chunked=False,
                    blen=None):
    """Flag variants that are in approximate linkage equilibrium, i.e.,
    whose r**2 stays below `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, given as the count of
        alternate alleles per call (0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Number of variants in each window.
    step : int
        Number of variants by which the window is advanced.
    threshold : float
        Largest r**2 value at which variants are still included.
    chunked : bool, optional
        Accepted in the signature but never read by this function body.
    blen : int, optional
        Block length for the chunked computation. The effective block
        length is never smaller than ``10 * size``.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array in which True marks variants in approximate linkage
        equilibrium.

    Raises
    ------
    ValueError
        If `gn` does not have exactly two dimensions.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """

    from allel.opt.stats import gn_locate_unlinked_int8

    # inputs without array attributes are coerced to an int8 ndarray;
    # anything already array-like is kept as is and sliced lazily below
    looks_like_array = hasattr(gn, 'shape') and hasattr(gn, 'dtype')
    if not looks_like_array:
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    # every variant starts out flagged (1); the result is cast to bool at
    # the end
    n_variants = gn.shape[0]
    loc = np.ones(n_variants, dtype='u1')

    # process block by block so large inputs are never fully materialised;
    # the block length is floored at ten windows to avoid tiny chunks
    chunk_len = max(get_blen_array(gn, blen), 10*size)
    for start in range(0, n_variants, chunk_len):
        # extend each block by one window so it overlaps the next block
        stop = min(n_variants, start+chunk_len+size)
        block = np.asarray(gn[start:stop], dtype='i1')
        # `flags` is a view onto `loc`; the helper's return value is not
        # used, so its results are expected to arrive through this view
        flags = loc[start:stop]
        gn_locate_unlinked_int8(block, flags, size, step, threshold)

    return loc.astype('b1')
示例#2
0
def locate_unlinked(gn, size=100, step=20, threshold=.1, blen=None):
    """Flag variants that are in approximate linkage equilibrium, i.e.,
    whose r**2 stays below `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, given as the count of
        alternate alleles per call (0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Number of variants in each window.
    step : int
        Number of variants by which the window is advanced.
    threshold : float
        Largest r**2 value at which variants are still included.
    blen : int, optional
        Block length for the chunked computation. The effective block
        length is never smaller than ``10 * size``.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array in which True marks variants in approximate linkage
        equilibrium.

    Raises
    ------
    ValueError
        If `gn` does not have exactly two dimensions.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """

    # inputs without array attributes are coerced to an int8 ndarray;
    # anything already array-like is kept as is and sliced lazily below
    looks_like_array = hasattr(gn, 'shape') and hasattr(gn, 'dtype')
    if not looks_like_array:
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    # every variant starts out flagged (1); the result is cast to bool at
    # the end
    n_variants = gn.shape[0]
    loc = np.ones(n_variants, dtype='u1')

    # process block by block so large inputs are never fully materialised;
    # the block length is floored at ten windows to avoid tiny chunks
    chunk_len = max(get_blen_array(gn, blen), 10 * size)
    for start in range(0, n_variants, chunk_len):
        # extend each block by one window so it overlaps the next block
        stop = min(n_variants, start + chunk_len + size)
        # NOTE(review): memoryview_safe presumably makes the block usable
        # by the compiled helper - confirm against its definition
        block = memoryview_safe(np.asarray(gn[start:stop], dtype='i1'))
        # `flags` is a view onto `loc`; the helper's return value is not
        # used, so its results are expected to arrive through this view
        flags = loc[start:stop]
        gn_locate_unlinked_int8(block, flags, size, step, threshold)

    return loc.astype('b1')