def rogers_huff_r_between(gna, gnb, fill=np.nan): """Estimate the linkage disequilibrium parameter *r* for each pair of variants between the two input arrays, using the method of Rogers and Huff (2008). Parameters ---------- gna, gnb : array_like, int8, shape (n_variants, n_samples) Diploid genotypes at biallelic variants, coded as the number of alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). fill : float, optional Value to use where r cannot be calculated. Returns ------- r : ndarray, float, shape (m_variants, n_variants ) Matrix in rectangular form. """ # check inputs gna = asarray_ndim(gna, 2, dtype='i1') gnb = asarray_ndim(gnb, 2, dtype='i1') gna = memoryview_safe(gna) gnb = memoryview_safe(gnb) # compute correlation coefficients r = gn_pairwise2_corrcoef_int8(gna, gnb, fill) # convenience for singletons if r.size == 1: r = r[0, 0] return r
def voight_painting(h): """Paint haplotypes, assigning a unique integer to each shared haplotype prefix. Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. Returns ------- painting : ndarray, int, shape (n_variants, n_haplotypes) Painting array. indices : ndarray, int, shape (n_hapotypes,) Haplotype indices after sorting by prefix. """ # check inputs # N.B., ensure int8 so we can use cython optimisation h = HaplotypeArray(np.asarray(h), copy=False) if h.max() > 1: raise NotImplementedError('only biallelic variants are supported') if h.min() < 0: raise NotImplementedError('missing calls are not supported') # sort by prefix indices = h.prefix_argsort() h = np.take(h, indices, axis=1) # paint painting = paint_shared_prefixes(memoryview_safe(np.asarray(h))) return painting, indices
def rogers_huff_r(gn, fill=np.nan): """Estimate the linkage disequilibrium parameter *r* for each pair of variants using the method of Rogers and Huff (2008). Parameters ---------- gn : array_like, int8, shape (n_variants, n_samples) Diploid genotypes at biallelic variants, coded as the number of alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). fill : float, optional Value to use where r cannot be calculated. Returns ------- r : ndarray, float, shape (n_variants * (n_variants - 1) // 2,) Matrix in condensed form. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [1, 1], [0, 0]], ... [[0, 0], [1, 1], [0, 0]], ... [[1, 1], [0, 0], [1, 1]], ... [[0, 0], [0, 1], [-1, -1]]], dtype='i1') >>> gn = g.to_n_alt(fill=-1) >>> gn array([[ 0, 2, 0], [ 0, 2, 0], [ 2, 0, 2], [ 0, 1, -1]], dtype=int8) >>> r = allel.stats.rogers_huff_r(gn) >>> r # doctest: +ELLIPSIS array([ 1. , -1.00000012, 1. , -1.00000012, 1. , -1. ], ... >>> r ** 2 # doctest: +ELLIPSIS array([ 1. , 1.00000024, 1. , 1.00000024, 1. , 1. ], ... >>> from scipy.spatial.distance import squareform >>> squareform(r ** 2) array([[ 0. , 1. , 1.00000024, 1. ], [ 1. , 0. , 1.00000024, 1. ], [ 1.00000024, 1.00000024, 0. , 1. ], [ 1. , 1. , 1. , 0. ]], dtype=float32) """ # check inputs gn = asarray_ndim(gn, 2, dtype='i1') gn = memoryview_safe(gn) # compute correlation coefficients r = gn_pairwise_corrcoef_int8(gn, fill) # convenience for singletons if r.size == 1: r = r[0] return r
def locate_unlinked(gn, size=100, step=20, threshold=.1, blen=None): """Locate variants in approximate linkage equilibrium, where r**2 is below the given `threshold`. Parameters ---------- gn : array_like, int8, shape (n_variants, n_samples) Diploid genotypes at biallelic variants, coded as the number of alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). size : int Window size (number of variants). step : int Number of variants to advance to the next window. threshold : float Maximum value of r**2 to include variants. blen : int, optional Block length to use for chunked computation. Returns ------- loc : ndarray, bool, shape (n_variants) Boolean array where True items locate variants in approximate linkage equilibrium. Notes ----- The value of r**2 between each pair of variants is calculated using the method of Rogers and Huff (2008). """ # check inputs if not hasattr(gn, 'shape') or not hasattr(gn, 'dtype'): gn = np.asarray(gn, dtype='i1') if gn.ndim != 2: raise ValueError('gn must have two dimensions') # setup output loc = np.ones(gn.shape[0], dtype='u1') # compute in chunks to avoid loading big arrays into memory blen = get_blen_array(gn, blen) blen = max(blen, 10 * size) # avoid too small chunks n_variants = gn.shape[0] for i in range(0, n_variants, blen): # N.B., ensure overlap with next window j = min(n_variants, i + blen + size) gnb = np.asarray(gn[i:j], dtype='i1') gnb = memoryview_safe(gnb) locb = loc[i:j] gn_locate_unlinked_int8(gnb, locb, size, step, threshold) return loc.astype('b1')
def phase_parents_by_transmission(g, window_size): """Phase parent genotypes from a trio or cross, given progeny genotypes already phased by Mendelian transmission. Parameters ---------- g : GenotypeArray Genotype array, with parents as first two columns and progeny as remaining columns, where progeny genotypes are already phased. window_size : int Number of previous heterozygous sites to include when phasing each parent. A number somewhere between 10 and 100 may be appropriate, depending on levels of heterozygosity and quality of data. Returns ------- g : GenotypeArray Genotype array with parents phased where possible. """ # setup check_type(g, GenotypeArray) check_dtype(g.values, 'i1') check_ploidy(g.ploidy, 2) if g.is_phased is None: raise ValueError( 'genotype array must first have progeny phased by transmission') check_min_samples(g.n_samples, 3) # run the phasing g._values = memoryview_safe(g.values) g._is_phased = memoryview_safe(g.is_phased) _opt_phase_parents_by_transmission(g.values, g.is_phased.view('u1'), window_size) # outputs return g
def ehh_decay(h, truncate=False): """Compute the decay of extended haplotype homozygosity (EHH) moving away from the first variant. Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. truncate : bool, optional If True, the return array will exclude trailing zeros. Returns ------- ehh : ndarray, float, shape (n_variants, ) EHH at successive variants from the first variant. """ # check inputs # N.B., ensure int8 so we can use cython optimisation h = HaplotypeArray(np.asarray(h), copy=False) if h.max() > 1: raise NotImplementedError('only biallelic variants are supported') if h.min() < 0: raise NotImplementedError('missing calls are not supported') # initialise n_variants = h.n_variants # number of rows, i.e., variants n_haplotypes = h.n_haplotypes # number of columns, i.e., haplotypes n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2 # compute the shared prefix length between all pairs of haplotypes spl = pairwise_shared_prefix_lengths(memoryview_safe(np.asarray(h))) # compute EHH by counting the number of shared prefixes extending beyond # each variant minlength = None if truncate else n_variants + 1 b = np.bincount(spl, minlength=minlength) c = np.cumsum(b[::-1])[:-1] ehh = (c / n_pairs)[::-1] return ehh
def phase_by_transmission(g, window_size, copy=True): """Phase genotypes in a trio or cross where possible using Mendelian transmission. Parameters ---------- g : array_like, int, shape (n_variants, n_samples, 2) Genotype array, with parents as first two columns and progeny as remaining columns. window_size : int Number of previous heterozygous sites to include when phasing each parent. A number somewhere between 10 and 100 may be appropriate, depending on levels of heterozygosity and quality of data. copy : bool, optional If False, attempt to phase genotypes in-place. Note that this is only possible if the input array has int8 dtype, otherwise a copy is always made regardless of this parameter. Returns ------- g : GenotypeArray Genotype array with progeny phased where possible. """ # setup g = np.asarray(g, dtype='i1') g = GenotypeArray(g, copy=copy) g._values = memoryview_safe(g.values) check_ploidy(g.ploidy, 2) check_min_samples(g.n_samples, 3) # phase the progeny is_phased = _opt_phase_progeny_by_transmission(g.values) g.is_phased = np.asarray(is_phased).view(bool) # phase the parents _opt_phase_parents_by_transmission(g.values, is_phased, window_size) return g
def tabulate_state_blocks(x, states, pos=None): """Construct a dataframe where each row provides information about continuous state blocks. Parameters ---------- x : array_like, int 1-dimensional array of state values. states : set Set of states of interest. Any state value not in this set will be ignored. pos : array_like, int, optional Array of positions corresponding to values in `x`. Returns ------- df : DataFrame Examples -------- >>> import allel >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1] >>> df = allel.tabulate_state_blocks(x, states={1, 2}) >>> df state support start_lidx ... size_min size_max is_marginal 0 1 4 -1 ... 5 -1 True 1 2 3 4 ... 4 4 False 2 1 2 8 ... 2 -1 True [3 rows x 9 columns] >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31] >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos) >>> df state support start_lidx ... stop_rpos length_min length_max 0 1 4 -1 ... 14 9 -1 1 2 3 4 ... 30 15 19 2 1 2 8 ... -1 2 -1 [3 rows x 15 columns] """ # check inputs x = asarray_ndim(x, 1) check_integer_dtype(x) x = memoryview_safe(x) # find state transitions switch_points, transitions, observations = state_transitions(x, states) # setup some helpers t = transitions[1:, 0] o = observations[1:] s1 = switch_points[:-1] s2 = switch_points[1:] is_marginal = (s1[:, 0] < 0) | (s2[:, 1] < 0) size_min = s2[:, 0] - s1[:, 1] + 1 size_max = s2[:, 1] - s1[:, 0] - 1 size_max[is_marginal] = -1 # start to build a dataframe items = [ ('state', t), ('support', o), ('start_lidx', s1[:, 0]), ('start_ridx', s1[:, 1]), ('stop_lidx', s2[:, 0]), ('stop_ridx', s2[:, 1]), ('size_min', size_min), ('size_max', size_max), ('is_marginal', is_marginal) ] # deal with optional positions if pos is not None: pos = asarray_ndim(pos, 1) check_dim0_aligned(x, pos) check_integer_dtype(pos) # obtain switch positions switch_positions = np.take(pos, switch_points) # deal with boundary transitions switch_positions[0, 0] = -1 switch_positions[-1, 1] = -1 # setup helpers p1 = switch_positions[:-1] p2 = switch_positions[1:] length_min = p2[:, 0] - p1[:, 1] + 1 length_max = p2[:, 1] - p1[:, 0] - 1 length_max[is_marginal] = -1 items += [ ('start_lpos', p1[:, 0]), ('start_rpos', p1[:, 1]), ('stop_lpos', p2[:, 0]), ('stop_rpos', p2[:, 1]), ('length_min', length_min), ('length_max', length_max), ] import pandas return pandas.DataFrame.from_dict(OrderedDict(items))
def tabulate_state_transitions(x, states, pos=None): """Construct a dataframe where each row provides information about a state transition. Parameters ---------- x : array_like, int 1-dimensional array of state values. states : set Set of states of interest. Any state value not in this set will be ignored. pos : array_like, int, optional Array of positions corresponding to values in `x`. Returns ------- df : DataFrame Notes ----- The resulting dataframe includes one row at the start representing the first state observation and one row at the end representing the last state observation. Examples -------- >>> import allel >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1] >>> df = allel.tabulate_state_transitions(x, states={1, 2}) >>> df lstate rstate lidx ridx 0 -1 1 -1 0 1 1 2 4 5 2 2 1 8 9 3 1 -1 10 -1 >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31] >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos) >>> df lstate rstate lidx ridx lpos rpos 0 -1 1 -1 0 -1 2 1 1 2 4 5 10 14 2 2 1 8 9 28 30 3 1 -1 10 -1 31 -1 """ # check inputs x = asarray_ndim(x, 1) check_integer_dtype(x) x = memoryview_safe(x) # find state transitions switch_points, transitions, _ = state_transitions(x, states) # start to build a dataframe items = [('lstate', transitions[:, 0]), ('rstate', transitions[:, 1]), ('lidx', switch_points[:, 0]), ('ridx', switch_points[:, 1])] # deal with optional positions if pos is not None: pos = asarray_ndim(pos, 1) check_dim0_aligned(x, pos) check_integer_dtype(pos) # find switch positions switch_positions = np.take(pos, switch_points) # deal with boundary transitions switch_positions[0, 0] = -1 switch_positions[-1, 1] = -1 # add columns into dataframe items += [('lpos', switch_positions[:, 0]), ('rpos', switch_positions[:, 1])] import pandas return pandas.DataFrame.from_dict(OrderedDict(items))
def xpnsl(h1, h2, use_threads=True): """Cross-population version of the NSL statistic. Parameters ---------- h1 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the first population. h2 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the second population. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized XPNSL scores. """ # check inputs h1 = asarray_ndim(h1, 2) check_integer_dtype(h1) h2 = asarray_ndim(h2, 2) check_integer_dtype(h2) check_dim0_aligned(h1, h2) h1 = memoryview_safe(h1) h2 = memoryview_safe(h2) if use_threads and multiprocessing.cpu_count() > 1: # use multiple threads # setup threadpool pool = ThreadPool(min(4, multiprocessing.cpu_count())) # scan forward res1_fwd = pool.apply_async(nsl_scan, args=(h1, )) res2_fwd = pool.apply_async(nsl_scan, args=(h2, )) # scan backward res1_rev = pool.apply_async(nsl_scan, args=(h1[::-1], )) res2_rev = pool.apply_async(nsl_scan, args=(h2[::-1], )) # wait for both to finish pool.close() pool.join() # obtain results nsl1_fwd = res1_fwd.get() nsl2_fwd = res2_fwd.get() nsl1_rev = res1_rev.get() nsl2_rev = res2_rev.get() # cleanup pool.terminate() else: # compute without threads # scan forward nsl1_fwd = nsl_scan(h1) nsl2_fwd = nsl_scan(h2) # scan backward nsl1_rev = nsl_scan(h1[::-1]) nsl2_rev = nsl_scan(h2[::-1]) # handle reverse scans nsl1_rev = nsl1_rev[::-1] nsl2_rev = nsl2_rev[::-1] # compute unstandardized score nsl1 = nsl1_fwd + nsl1_rev nsl2 = nsl2_fwd + nsl2_rev score = np.log(nsl1 / nsl2) return score
def nsl(h, use_threads=True): """Compute the unstandardized number of segregating sites by length (nSl) for each variant, comparing the reference and alternate alleles, after Ferrer-Admetlla et al. (2014). Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Notes ----- This function will calculate nSl for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype array before passing to this function. This function computes nSl by comparing the reference and alternate alleles. These can be polarised by switching the sign for any variant where the reference allele is derived. This function does nothing about nSl calculations where haplotype homozygosity extends up to the first or last variant. There may be edge effects. Note that the unstandardized score is returned. Usually these scores are then standardized in different allele frequency bins. See Also -------- standardize_by_allele_count """ # check inputs h = asarray_ndim(h, 2) check_integer_dtype(h) h = memoryview_safe(h) # # check there are no invariant sites # ac = h.count_alleles() # assert np.all(ac.is_segregating()), 'please remove non-segregating sites' if use_threads and multiprocessing.cpu_count() > 1: # create pool pool = ThreadPool(2) # scan forward result_fwd = pool.apply_async(nsl01_scan, args=(h, )) # scan backward result_rev = pool.apply_async(nsl01_scan, args=(h[::-1], )) # wait for both to finish pool.close() pool.join() # obtain results nsl0_fwd, nsl1_fwd = result_fwd.get() nsl0_rev, nsl1_rev = result_rev.get() else: # scan forward nsl0_fwd, nsl1_fwd = nsl01_scan(h) # scan backward nsl0_rev, nsl1_rev = nsl01_scan(h[::-1]) # handle backwards nsl0_rev = nsl0_rev[::-1] nsl1_rev = nsl1_rev[::-1] # compute unstandardized score nsl0 = nsl0_fwd + nsl0_rev nsl1 = nsl1_fwd + nsl1_rev score = np.log(nsl1 / nsl0) return score
def xpehh(h1, h2, pos, map_pos=None, min_ehh=0.05, include_edges=False, gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True): """Compute the unstandardized cross-population extended haplotype homozygosity score (XPEHH) for each variant. Parameters ---------- h1 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the first population. h2 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the second population. pos : array_like, int, shape (n_variants,) Variant positions on physical or genetic map. map_pos : array_like, float, shape (n_variants,) Variant positions (genetic map distance). min_ehh: float, optional Minimum EHH beyond which to truncate integrated haplotype homozygosity calculation. include_edges : bool, optional If True, report scores even if EHH does not decay below `min_ehh` before reaching the edge of the data. gap_scale : int, optional Rescale distance between variants if gap is larger than this value. max_gap : int, optional Do not report scores if EHH spans a gap larger than this number of base pairs. is_accessible : array_like, bool, optional Genome accessibility array. If provided, distance between variants will be computed as the number of accessible bases between them. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized XPEHH scores. Notes ----- This function will calculate XPEHH for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype arrays before passing to this function. This function returns NaN for any EHH calculations where haplotype homozygosity does not decay below `min_ehh` before reaching the first or last variant. To disable this behaviour, set `include_edges` to True. Note that the unstandardized score is returned. Usually these scores are then standardized genome-wide. Haplotype arrays from the two populations may have different numbers of haplotypes. See Also -------- standardize """ # check inputs h1 = asarray_ndim(h1, 2) check_integer_dtype(h1) h2 = asarray_ndim(h2, 2) check_integer_dtype(h2) pos = asarray_ndim(pos, 1) check_dim0_aligned(h1, h2, pos) h1 = memoryview_safe(h1) h2 = memoryview_safe(h2) pos = memoryview_safe(pos) # compute gaps between variants for integration gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible) # setup kwargs kwargs = dict(min_ehh=min_ehh, include_edges=include_edges) if use_threads and multiprocessing.cpu_count() > 1: # use multiple threads # setup threadpool pool = ThreadPool(min(4, multiprocessing.cpu_count())) # scan forward res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs) res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs) # scan backward res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]), kwargs) res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]), kwargs) # wait for both to finish pool.close() pool.join() # obtain results ihh1_fwd = res1_fwd.get() ihh2_fwd = res2_fwd.get() ihh1_rev = res1_rev.get() ihh2_rev = res2_rev.get() # cleanup pool.terminate() else: # compute without threads # scan forward ihh1_fwd = ihh_scan(h1, gaps, **kwargs) ihh2_fwd = ihh_scan(h2, gaps, **kwargs) # scan backward ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs) ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs) # handle reverse scans ihh1_rev = ihh1_rev[::-1] ihh2_rev = ihh2_rev[::-1] # compute unstandardized score ihh1 = ihh1_fwd + ihh1_rev ihh2 = ihh2_fwd + ihh2_rev score = np.log(ihh1 / ihh2) return score
def ihs(h, pos, map_pos=None, min_ehh=0.05, min_maf=0.05, include_edges=False, gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True): """Compute the unstandardized integrated haplotype score (IHS) for each variant, comparing integrated haplotype homozygosity between the reference (0) and alternate (1) alleles. Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. pos : array_like, int, shape (n_variants,) Variant positions (physical distance). map_pos : array_like, float, shape (n_variants,) Variant positions (genetic map distance). min_ehh: float, optional Minimum EHH beyond which to truncate integrated haplotype homozygosity calculation. min_maf : float, optional Do not compute integrated haplotype homozogysity for variants with minor allele frequency below this value. include_edges : bool, optional If True, report scores even if EHH does not decay below `min_ehh` before reaching the edge of the data. gap_scale : int, optional Rescale distance between variants if gap is larger than this value. max_gap : int, optional Do not report scores if EHH spans a gap larger than this number of base pairs. is_accessible : array_like, bool, optional Genome accessibility array. If provided, distance between variants will be computed as the number of accessible bases between them. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized IHS scores. Notes ----- This function will calculate IHS for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype array before passing to this function. This function computes IHS comparing the reference and alternate alleles. These can be polarised by switching the sign for any variant where the reference allele is derived. This function returns NaN for any IHS calculations where haplotype homozygosity does not decay below `min_ehh` before reaching the first or last variant. To disable this behaviour, set `include_edges` to True. Note that the unstandardized score is returned. Usually these scores are then standardized in different allele frequency bins. See Also -------- standardize_by_allele_count """ # check inputs h = asarray_ndim(h, 2) check_integer_dtype(h) pos = asarray_ndim(pos, 1) check_dim0_aligned(h, pos) h = memoryview_safe(h) pos = memoryview_safe(pos) # compute gaps between variants for integration gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible) # setup kwargs kwargs = dict(min_ehh=min_ehh, min_maf=min_maf, include_edges=include_edges) if use_threads and multiprocessing.cpu_count() > 1: # run with threads # create pool pool = ThreadPool(2) # scan forward result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs) # scan backward result_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]), kwargs) # wait for both to finish pool.close() pool.join() # obtain results ihh0_fwd, ihh1_fwd = result_fwd.get() ihh0_rev, ihh1_rev = result_rev.get() # cleanup pool.terminate() else: # run without threads # scan forward ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs) # scan backward ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs) # handle reverse scan ihh0_rev = ihh0_rev[::-1] ihh1_rev = ihh1_rev[::-1] # compute unstandardized score ihh0 = ihh0_fwd + ihh0_rev ihh1 = ihh1_fwd + ihh1_rev score = np.log(ihh1 / ihh0) return score