Exemplos de memoryview_safe em Python, exemplos de allel.compat.memoryview_safe em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: ld.py Projeto: Minhui-Chen/scikit-allel

def rogers_huff_r_between(gna, gnb, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* between every variant
    in `gna` and every variant in `gnb`, using the method of Rogers and Huff
    (2008).

    Parameters
    ----------
    gna, gnb : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Value to use where r cannot be calculated.

    Returns
    -------
    r : ndarray, float, shape (m_variants, n_variants)
        Matrix in rectangular form. When the result holds exactly one
        element, that element (``r[0, 0]``) is returned instead.

    """

    # validate both inputs first (2 dimensions, int8) ...
    checked = [asarray_ndim(gn, 2, dtype='i1') for gn in (gna, gnb)]
    # ... then make each one safe to hand over as a memoryview
    safe_a, safe_b = [memoryview_safe(gn) for gn in checked]

    # pairwise correlation between the rows of the two arrays
    corr = gn_pairwise2_corrcoef_int8(safe_a, safe_b, fill)

    # convenience for singletons: unwrap a one-element result
    return corr[0, 0] if corr.size == 1 else corr

Exemplo n.º 2

0

Exibir arquivo

def voight_painting(h):
    """Paint haplotypes, giving every shared haplotype prefix its own unique
    integer.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    painting : ndarray, int, shape (n_variants, n_haplotypes)
        Painting array.
    indices : ndarray, int, shape (n_haplotypes,)
        Haplotype indices after sorting by prefix.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    # wrap without copying so the HaplotypeArray helpers are available
    # NOTE(review): the cython routine presumably wants int8 data, but no
    # dtype conversion is performed here -- confirm against callers
    haps = HaplotypeArray(np.asarray(h), copy=False)
    if haps.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if haps.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # reorder the haplotype columns by prefix
    indices = haps.prefix_argsort()
    sorted_haps = np.asarray(np.take(haps, indices, axis=1))

    # paint the prefix-sorted haplotypes
    painting = paint_shared_prefixes(memoryview_safe(sorted_haps))

    return painting, indices

Exemplo n.º 3

0

Exibir arquivo

Arquivo: ld.py Projeto: Minhui-Chen/scikit-allel

def rogers_huff_r(gn, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* for every pair of
    variants in `gn`, using the method of Rogers and Huff (2008).

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Value to use where r cannot be calculated.

    Returns
    -------
    r : ndarray, float, shape (n_variants * (n_variants - 1) // 2,)
        Matrix in condensed form. When the result holds exactly one element,
        that element (``r[0]``) is returned instead.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [1, 1], [0, 0]],
    ...                          [[0, 0], [1, 1], [0, 0]],
    ...                          [[1, 1], [0, 0], [1, 1]],
    ...                          [[0, 0], [0, 1], [-1, -1]]], dtype='i1')
    >>> gn = g.to_n_alt(fill=-1)
    >>> gn
    array([[ 0,  2,  0],
           [ 0,  2,  0],
           [ 2,  0,  2],
           [ 0,  1, -1]], dtype=int8)
    >>> r = allel.stats.rogers_huff_r(gn)
    >>> r  # doctest: +ELLIPSIS
    array([ 1.        , -1.00000012,  1.        , -1.00000012,  1.        , -1.        ], ...
    >>> r ** 2  # doctest: +ELLIPSIS
    array([ 1.        ,  1.00000024,  1.        ,  1.00000024,  1.        ,  1.        ], ...
    >>> from scipy.spatial.distance import squareform
    >>> squareform(r ** 2)
    array([[ 0.        ,  1.        ,  1.00000024,  1.        ],
           [ 1.        ,  0.        ,  1.00000024,  1.        ],
           [ 1.00000024,  1.00000024,  0.        ,  1.        ],
           [ 1.        ,  1.        ,  1.        ,  0.        ]], dtype=float32)

    """

    # validate (2 dimensions, int8) and make safe to pass as a memoryview
    safe_gn = memoryview_safe(asarray_ndim(gn, 2, dtype='i1'))

    # correlation coefficient for every pair of rows
    corr = gn_pairwise_corrcoef_int8(safe_gn, fill)

    # convenience for singletons: unwrap a one-element result
    return corr[0] if corr.size == 1 else corr

Exemplo n.º 4

0

Exibir arquivo

Arquivo: ld.py Projeto: Minhui-Chen/scikit-allel

def locate_unlinked(gn, size=100, step=20, threshold=.1, blen=None):
    """Locate variants in approximate linkage equilibrium, i.e. where r**2 is
    below the given `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Window size (number of variants).
    step : int
        Number of variants to advance to the next window.
    threshold : float
        Maximum value of r**2 to include variants.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array where True items locate variants in approximate
        linkage equilibrium.

    Raises
    ------
    ValueError
        If `gn` does not have exactly two dimensions.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """

    # anything that does not already look like an array is converted up
    # front; array-likes with shape and dtype are left alone so they can be
    # read block by block below
    if not (hasattr(gn, 'shape') and hasattr(gn, 'dtype')):
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    # output flags start at 1 for every variant; the kernel receives a view
    # of this array for each block
    flags = np.ones(gn.shape[0], dtype='u1')

    # block length: never smaller than ten windows
    block_len = max(get_blen_array(gn, blen), 10 * size)
    n_variants = gn.shape[0]

    for start in range(0, n_variants, block_len):
        # N.B., extend each block by one window so it overlaps the next
        stop = min(n_variants, start + block_len + size)
        block = memoryview_safe(np.asarray(gn[start:stop], dtype='i1'))
        gn_locate_unlinked_int8(block, flags[start:stop], size, step,
                                threshold)

    return flags.astype('b1')

Exemplo n.º 5

0

Exibir arquivo

def phase_parents_by_transmission(g, window_size):
    """Phase parent genotypes from a trio or cross, given progeny genotypes
    already phased by Mendelian transmission.

    Parameters
    ----------
    g : GenotypeArray
        Genotype array, with parents as first two columns and progeny as
        remaining columns, where progeny genotypes are already phased.
        Must have int8 values, ploidy 2 and at least 3 samples.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.

    Returns
    -------
    g : GenotypeArray
        Genotype array with parents phased where possible. This is the same
        object that was passed in; its ``_values`` and ``_is_phased``
        attributes are reassigned by this function.

    Raises
    ------
    ValueError
        If ``g.is_phased`` is None.

    """

    # setup: validate type, dtype, ploidy, phasing state and sample count
    # (the check_* helpers are defined elsewhere; presumably they raise on
    # failure -- confirm)
    check_type(g, GenotypeArray)
    check_dtype(g.values, 'i1')
    check_ploidy(g.ploidy, 2)
    if g.is_phased is None:
        raise ValueError(
            'genotype array must first have progeny phased by transmission')
    check_min_samples(g.n_samples, 3)

    # run the phasing
    # replace the wrapped arrays with memoryview-safe versions before they
    # are handed to the optimised routine
    g._values = memoryview_safe(g.values)
    g._is_phased = memoryview_safe(g.is_phased)
    # the phasing flags are passed as a uint8 view
    # NOTE(review): the routine's return value is ignored, so it presumably
    # updates both arrays in place -- confirm
    _opt_phase_parents_by_transmission(g.values, g.is_phased.view('u1'),
                                       window_size)

    # outputs
    return g

Exemplo n.º 6

0

Exibir arquivo

def ehh_decay(h, truncate=False):
    """Compute the decay of extended haplotype homozygosity (EHH)
    moving away from the first variant.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    truncate : bool, optional
        If True, the return array will exclude trailing zeros.

    Returns
    -------
    ehh : ndarray, float, shape (n_variants, )
        EHH at successive variants from the first variant. May be shorter
        than `n_variants` when `truncate` is True.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    # check inputs
    # NOTE(review): the cython routine presumably wants int8 data, but no
    # dtype conversion is performed here -- confirm against callers
    h = HaplotypeArray(np.asarray(h), copy=False)
    if h.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if h.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # initialise
    n_variants = h.n_variants  # number of rows, i.e., variants
    n_haplotypes = h.n_haplotypes  # number of columns, i.e., haplotypes
    n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2

    # compute the shared prefix length between all pairs of haplotypes
    spl = pairwise_shared_prefix_lengths(memoryview_safe(np.asarray(h)))

    # compute EHH by counting the number of shared prefixes extending beyond
    # each variant
    # N.B., use 0 (not None) for "no minimum length": minlength=None was
    # deprecated in NumPy 1.14 and is rejected by later releases, whereas 0
    # gives the same bins on every version
    minlength = 0 if truncate else n_variants + 1
    b = np.bincount(spl, minlength=minlength)
    # reverse cumulative sum gives, for each length, the number of pairs
    # whose shared prefix is at least that long; the final total is dropped
    c = np.cumsum(b[::-1])[:-1]
    ehh = (c / n_pairs)[::-1]

    return ehh

Exemplo n.º 7

0

Exibir arquivo

Arquivo: mendel.py Projeto: yangmqglobe/scikit-allel

def phase_by_transmission(g, window_size, copy=True):
    """Phase genotypes in a trio or cross where possible using Mendelian
    transmission.

    Progeny are phased first, then the parents are phased using the
    resulting progeny phasing flags.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.
    copy : bool, optional
        If False, attempt to phase genotypes in-place. Note that this is
        only possible if the input array has int8 dtype, otherwise a copy is
        always made regardless of this parameter.

    Returns
    -------
    g : GenotypeArray
        Genotype array with progeny phased where possible.

    """

    # setup
    # coerce to int8 (this is the step that copies when the input has a
    # different dtype), then wrap as a GenotypeArray
    g = np.asarray(g, dtype='i1')
    g = GenotypeArray(g, copy=copy)
    # make the wrapped values safe to hand over as a memoryview
    g._values = memoryview_safe(g.values)
    # N.B., ploidy and sample count are only checked after wrapping
    check_ploidy(g.ploidy, 2)
    check_min_samples(g.n_samples, 3)

    # phase the progeny
    # the routine returns the phasing flags, which are stored on the array
    # as a boolean view
    is_phased = _opt_phase_progeny_by_transmission(g.values)
    g.is_phased = np.asarray(is_phased).view(bool)

    # phase the parents
    # the flags object returned above is passed through unchanged
    # NOTE(review): the return value is ignored, so this presumably updates
    # its arguments in place -- confirm
    _opt_phase_parents_by_transmission(g.values, is_phased, window_size)

    return g

Exemplo n.º 8

0

Exibir arquivo

def tabulate_state_blocks(x, states, pos=None):
    """Build a dataframe with one row per continuous block of a state.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame
        Columns, in order: state, support, start_lidx, start_ridx, stop_lidx,
        stop_ridx, size_min, size_max, is_marginal; when `pos` is given also
        start_lpos, start_rpos, stop_lpos, stop_rpos, length_min, length_max.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx     ...       size_min  size_max  is_marginal
    0      1        4          -1     ...              5        -1         True
    1      2        3           4     ...              4         4        False
    2      1        2           8     ...              2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx     ...      stop_rpos  length_min  length_max
    0      1        4          -1     ...             14           9          -1
    1      2        3           4     ...             30          15          19
    2      1        2           8     ...             -1           2          -1
    [3 rows x 15 columns]

    """

    # validate: 1-dimensional, integer dtype, safe to pass as a memoryview
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate the state transitions
    switch_points, transitions, observations = state_transitions(x, states)

    # each block runs from one switch point to the next
    block_start = switch_points[:-1]
    block_stop = switch_points[1:]

    # a block is marginal when either end has a negative index
    is_marginal = (block_start[:, 0] < 0) | (block_stop[:, 1] < 0)
    size_min = block_stop[:, 0] - block_start[:, 1] + 1
    size_max = block_stop[:, 1] - block_start[:, 0] - 1
    # maximum size is reported as -1 for marginal blocks
    size_max[is_marginal] = -1

    # assemble the columns in output order
    columns = OrderedDict()
    columns['state'] = transitions[1:, 0]
    columns['support'] = observations[1:]
    columns['start_lidx'] = block_start[:, 0]
    columns['start_ridx'] = block_start[:, 1]
    columns['stop_lidx'] = block_stop[:, 0]
    columns['stop_ridx'] = block_stop[:, 1]
    columns['size_min'] = size_min
    columns['size_max'] = size_max
    columns['is_marginal'] = is_marginal

    # optional positional columns
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions
        switch_positions = np.take(pos, switch_points)
        # the first left and last right entries are forced to -1
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        pos_start = switch_positions[:-1]
        pos_stop = switch_positions[1:]
        length_min = pos_stop[:, 0] - pos_start[:, 1] + 1
        length_max = pos_stop[:, 1] - pos_start[:, 0] - 1
        length_max[is_marginal] = -1

        columns['start_lpos'] = pos_start[:, 0]
        columns['start_rpos'] = pos_start[:, 1]
        columns['stop_lpos'] = pos_stop[:, 0]
        columns['stop_rpos'] = pos_stop[:, 1]
        columns['length_min'] = length_min
        columns['length_max'] = length_max

    import pandas
    return pandas.DataFrame.from_dict(columns)

Exemplo n.º 9

0

Exibir arquivo

def tabulate_state_transitions(x, states, pos=None):
    """Build a dataframe with one row per state transition.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame
        Columns, in order: lstate, rstate, lidx, ridx; when `pos` is given
        also lpos, rpos.

    Notes
    -----
    The resulting dataframe includes one row at the start representing the first state
    observation and one row at the end representing the last state observation.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # validate: 1-dimensional, integer dtype, safe to pass as a memoryview
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate the state transitions (observation counts are not needed here)
    switch_points, transitions, _ = state_transitions(x, states)

    # assemble the columns in output order
    columns = OrderedDict()
    columns['lstate'] = transitions[:, 0]
    columns['rstate'] = transitions[:, 1]
    columns['lidx'] = switch_points[:, 0]
    columns['ridx'] = switch_points[:, 1]

    # optional positional columns
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions
        switch_positions = np.take(pos, switch_points)
        # the first left and last right entries are forced to -1
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        columns['lpos'] = switch_positions[:, 0]
        columns['rpos'] = switch_positions[:, 1]

    import pandas
    return pandas.DataFrame.from_dict(columns)

Exemplo n.º 10

0

Exibir arquivo

def xpnsl(h1, h2, use_threads=True):
    """Cross-population version of the NSL statistic.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True use multiple threads to compute (only when more than one
        CPU is reported).

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # validate both arrays (2 dimensions, integer dtype, same first
    # dimension) before making them memoryview-safe
    hap1 = asarray_ndim(h1, 2)
    check_integer_dtype(hap1)
    hap2 = asarray_ndim(h2, 2)
    check_integer_dtype(hap2)
    check_dim0_aligned(hap1, hap2)
    hap1 = memoryview_safe(hap1)
    hap2 = memoryview_safe(hap2)

    # the four scans to run: forward for each population, then backward
    # (reversed along the first axis) for each population
    scan_inputs = (hap1, hap2, hap1[::-1], hap2[::-1])

    if use_threads and multiprocessing.cpu_count() > 1:
        # threaded: submit every scan, wait for all, then collect
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        pending = [pool.apply_async(nsl_scan, args=(arr, ))
                   for arr in scan_inputs]
        pool.close()
        pool.join()
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [job.get()
                                                  for job in pending]
        pool.terminate()
    else:
        # serial: run the scans one after another in the same order
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [nsl_scan(arr)
                                                  for arr in scan_inputs]

    # flip the backward results back into variant order and combine
    total1 = nsl1_fwd + nsl1_rev[::-1]
    total2 = nsl2_fwd + nsl2_rev[::-1]

    # unstandardized score is the log ratio between populations
    return np.log(total1 / total2)

Exemplo n.º 11

0

Exibir arquivo

def nsl(h, use_threads=True):
    """Compute the unstandardized number of segregating sites by length (nSl)
    for each variant, comparing the reference and alternate alleles,
    after Ferrer-Admetlla et al. (2014).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    use_threads : bool, optional
        If True use multiple threads to compute (only when more than one
        CPU is reported).

    Returns
    -------
    score : ndarray, float, shape (n_variants,)

    Notes
    -----
    This function will calculate nSl for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes nSl by comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant where
    the reference allele is derived.

    This function does nothing about nSl calculations where haplotype
    homozygosity extends up to the first or last variant. There may be edge
    effects.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # validate (2 dimensions, integer dtype) and make memoryview-safe
    haps = asarray_ndim(h, 2)
    check_integer_dtype(haps)
    haps = memoryview_safe(haps)

    # N.B., invariant sites are not checked for or rejected here

    # forward scan input, then backward (reversed along the first axis)
    scan_inputs = (haps, haps[::-1])

    if use_threads and multiprocessing.cpu_count() > 1:
        # threaded: one worker per direction
        pool = ThreadPool(2)
        pending = [pool.apply_async(nsl01_scan, args=(arr, ))
                   for arr in scan_inputs]
        pool.close()
        pool.join()
        fwd, rev = [job.get() for job in pending]
    else:
        # serial: forward first, then backward
        fwd, rev = [nsl01_scan(arr) for arr in scan_inputs]

    # each scan yields a pair: (allele 0 result, allele 1 result)
    nsl0_fwd, nsl1_fwd = fwd
    nsl0_rev, nsl1_rev = rev

    # flip the backward results back into variant order and combine
    total0 = nsl0_fwd + nsl0_rev[::-1]
    total1 = nsl1_fwd + nsl1_rev[::-1]

    # unstandardized score is the log ratio of allele 1 over allele 0
    return np.log(total1 / total0)

Exemplo n.º 12

0

Exibir arquivo

def xpehh(h1,
          h2,
          pos,
          map_pos=None,
          min_ehh=0.05,
          include_edges=False,
          gap_scale=20000,
          max_gap=200000,
          is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute (only when more than one
        CPU is reported).

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # validate the haplotype arrays and positions, check they share the
    # same first dimension, then make all three memoryview-safe
    hap1 = asarray_ndim(h1, 2)
    check_integer_dtype(hap1)
    hap2 = asarray_ndim(h2, 2)
    check_integer_dtype(hap2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(hap1, hap2, pos)
    hap1 = memoryview_safe(hap1)
    hap2 = memoryview_safe(hap2)
    pos = memoryview_safe(pos)

    # gaps between variants, used for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # options shared by every scan
    scan_opts = {'min_ehh': min_ehh, 'include_edges': include_edges}

    # the four scans to run: forward for each population, then backward
    # (haplotypes and gaps both reversed) for each population
    scan_args = [
        (hap1, gaps),
        (hap2, gaps),
        (hap1[::-1], gaps[::-1]),
        (hap2[::-1], gaps[::-1]),
    ]

    if use_threads and multiprocessing.cpu_count() > 1:
        # threaded: submit every scan, wait for all, then collect
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        pending = [pool.apply_async(ihh_scan, args, scan_opts)
                   for args in scan_args]
        pool.close()
        pool.join()
        ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [job.get()
                                                  for job in pending]
        pool.terminate()
    else:
        # serial: run the scans one after another in the same order
        ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [
            ihh_scan(*args, **scan_opts) for args in scan_args
        ]

    # flip the backward results back into variant order and combine
    total1 = ihh1_fwd + ihh1_rev[::-1]
    total2 = ihh2_fwd + ihh2_rev[::-1]

    # unstandardized score is the log ratio between populations
    return np.log(total1 / total2)

Exemplo n.º 13

0

Exibir arquivo

def ihs(h,
        pos,
        map_pos=None,
        min_ehh=0.05,
        min_maf=0.05,
        include_edges=False,
        gap_scale=20000,
        max_gap=200000,
        is_accessible=None,
        use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozygosity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute (only when more than one
        CPU is reported).

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----

    This function will calculate IHS for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # validate haplotypes and positions, check they share the same first
    # dimension, then make both memoryview-safe
    haps = asarray_ndim(h, 2)
    check_integer_dtype(haps)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(haps, pos)
    haps = memoryview_safe(haps)
    pos = memoryview_safe(pos)

    # gaps between variants, used for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # options shared by both scans
    scan_opts = {
        'min_ehh': min_ehh,
        'min_maf': min_maf,
        'include_edges': include_edges,
    }

    # forward scan arguments, then backward (haplotypes and gaps reversed)
    scan_args = [(haps, gaps), (haps[::-1], gaps[::-1])]

    if use_threads and multiprocessing.cpu_count() > 1:
        # threaded: one worker per direction
        pool = ThreadPool(2)
        pending = [pool.apply_async(ihh01_scan, args, scan_opts)
                   for args in scan_args]
        pool.close()
        pool.join()
        fwd, rev = [job.get() for job in pending]
        pool.terminate()
    else:
        # serial: forward first, then backward
        fwd, rev = [ihh01_scan(*args, **scan_opts) for args in scan_args]

    # each scan yields a pair: (allele 0 result, allele 1 result)
    ihh0_fwd, ihh1_fwd = fwd
    ihh0_rev, ihh1_rev = rev

    # flip the backward results back into variant order and combine
    total0 = ihh0_fwd + ihh0_rev[::-1]
    total1 = ihh1_fwd + ihh1_rev[::-1]

    # unstandardized score is the log ratio of allele 1 over allele 0
    return np.log(total1 / total0)