def __init__(self, data, chunks=None, name=None, lock=False): super(AlleleCountsDaskArray, self).__init__(data, chunks=chunks, name=name, lock=lock) check_ndim(self.values, 2) check_integer_dtype(self.values)
def sfs(dac): """Compute the site frequency spectrum given derived allele counts at a set of biallelic variants. Parameters ---------- dac : array_like, int, shape (n_variants,) Array of derived allele counts. Returns ------- sfs : ndarray, int, shape (n_chromosomes,) Array where the kth element is the number of variant sites with k derived alleles. """ # check input dac = asarray_ndim(dac, 1) check_integer_dtype(dac) # need platform integer for bincount dac = dac.astype(int, copy=False) # compute site frequency spectrum s = np.bincount(dac) return s
def sfs_folded(ac): """Compute the folded site frequency spectrum given reference and alternate allele counts at a set of biallelic variants. Parameters ---------- ac : array_like, int, shape (n_variants, 2) Allele counts array. Returns ------- sfs_folded : ndarray, int, shape (n_chromosomes//2,) Array where the kth element is the number of variant sites with a minor allele count of k. """ # check input ac = asarray_ndim(ac, 2) assert ac.shape[1] == 2, 'only biallelic variants are supported' check_integer_dtype(ac) # compute minor allele counts mac = np.amin(ac, axis=1) # need platform integer for bincount mac = mac.astype(int, copy=False) # compute folded site frequency spectrum s = np.bincount(mac) return s
def joint_sfs(dac1, dac2): """Compute the joint site frequency spectrum between two populations. Parameters ---------- dac1 : array_like, int, shape (n_variants,) Derived allele counts for the first population. dac2 : array_like, int, shape (n_variants,) Derived allele counts for the second population. Returns ------- joint_sfs : ndarray, int, shape (m_chromosomes, n_chromosomes) Array where the (i, j)th element is the number of variant sites with i derived alleles in the first population and j derived alleles in the second population. """ # check inputs dac1 = asarray_ndim(dac1, 1) dac2 = asarray_ndim(dac2, 1) check_integer_dtype(dac1) check_integer_dtype(dac2) # compute site frequency spectrum n = int(np.max(dac1) + 1) m = int(np.max(dac2) + 1) # need platform integer for bincount tmp = (dac1 * m + dac2).astype(int, copy=False) s = np.bincount(tmp) s.resize(n, m) return s
def __init__(self, data, chunks=None, name=None, lock=False): super(GenotypesDask, self).__init__(data, chunks=chunks, name=name, lock=lock) check_integer_dtype(self.values) self._mask = None self._is_phased = None
def _check_dac_n(dac, n): dac = asarray_ndim(dac, 1) check_integer_dtype(dac) mx = np.max(dac) if n is None: n = mx elif n < mx: raise ValueError( 'number of chromosomes too small; expected {}, found {}'.format( n, mx)) return dac, int(n)
def _check_ac_n(ac, n): ac = asarray_ndim(ac, 2) if ac.shape[1] != 2: raise ValueError('only biallelic variants are supported') check_integer_dtype(ac) mx = np.max(np.sum(ac, axis=1)) if n is None: n = mx elif n < mx: raise ValueError( 'number of chromosomes too small; expected {}, found {}'.format( n, mx)) return ac, int(n)
def joint_sfs_folded(ac1, ac2): """Compute the joint folded site frequency spectrum between two populations. Parameters ---------- ac1 : array_like, int, shape (n_variants, 2) Allele counts for the first population. ac2 : array_like, int, shape (n_variants, 2) Allele counts for the second population. Returns ------- joint_sfs_folded : ndarray, int, shape (m_chromosomes//2, n_chromosomes//2) Array where the (i, j)th element is the number of variant sites with a minor allele count of i in the first population and j in the second population. """ # check inputs ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) assert ac1.shape[1] == ac2.shape[1] == 2, \ 'only biallelic variants are supported' check_integer_dtype(ac1) check_integer_dtype(ac2) # compute minor allele counts mac1 = np.amin(ac1, axis=1) mac2 = np.amin(ac2, axis=1) # compute site frequency spectrum m = int(np.max(mac1) + 1) n = int(np.max(mac2) + 1) tmp = (mac1 * n + mac2).astype(int, copy=False) s = np.bincount(tmp) s.resize(m, n) return s
def xpehh(h1, h2, pos, map_pos=None, min_ehh=0.05, include_edges=False, gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True): """Compute the unstandardized cross-population extended haplotype homozygosity score (XPEHH) for each variant. Parameters ---------- h1 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the first population. h2 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the second population. pos : array_like, int, shape (n_variants,) Variant positions on physical or genetic map. map_pos : array_like, float, shape (n_variants,) Variant positions (genetic map distance). min_ehh: float, optional Minimum EHH beyond which to truncate integrated haplotype homozygosity calculation. include_edges : bool, optional If True, report scores even if EHH does not decay below `min_ehh` before reaching the edge of the data. gap_scale : int, optional Rescale distance between variants if gap is larger than this value. max_gap : int, optional Do not report scores if EHH spans a gap larger than this number of base pairs. is_accessible : array_like, bool, optional Genome accessibility array. If provided, distance between variants will be computed as the number of accessible bases between them. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized XPEHH scores. Notes ----- This function will calculate XPEHH for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype arrays before passing to this function. This function returns NaN for any EHH calculations where haplotype homozygosity does not decay below `min_ehh` before reaching the first or last variant. To disable this behaviour, set `include_edges` to True. Note that the unstandardized score is returned. Usually these scores are then standardized genome-wide. Haplotype arrays from the two populations may have different numbers of haplotypes. See Also -------- standardize """ # check inputs h1 = asarray_ndim(h1, 2) check_integer_dtype(h1) h2 = asarray_ndim(h2, 2) check_integer_dtype(h2) pos = asarray_ndim(pos, 1) check_dim0_aligned(h1, h2, pos) # compute gaps between variants for integration gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible) # setup kwargs kwargs = dict(min_ehh=min_ehh, include_edges=include_edges) if use_threads and multiprocessing.cpu_count() > 1: # use multiple threads # setup threadpool pool = ThreadPool(min(4, multiprocessing.cpu_count())) # scan forward res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs) res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs) # scan backward res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]), kwargs) res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]), kwargs) # wait for both to finish pool.close() pool.join() # obtain results ihh1_fwd = res1_fwd.get() ihh2_fwd = res2_fwd.get() ihh1_rev = res1_rev.get() ihh2_rev = res2_rev.get() # cleanup pool.terminate() else: # compute without threads # scan forward ihh1_fwd = ihh_scan(h1, gaps, **kwargs) ihh2_fwd = ihh_scan(h2, gaps, **kwargs) # scan backward ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs) ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs) # handle reverse scans ihh1_rev = ihh1_rev[::-1] ihh2_rev = ihh2_rev[::-1] # compute unstandardized score ihh1 = ihh1_fwd + ihh1_rev ihh2 = ihh2_fwd + ihh2_rev score = np.log(ihh1 / ihh2) return score
def ihs(h, pos, map_pos=None, min_ehh=0.05, min_maf=0.05, include_edges=False, gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True): """Compute the unstandardized integrated haplotype score (IHS) for each variant, comparing integrated haplotype homozygosity between the reference (0) and alternate (1) alleles. Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. pos : array_like, int, shape (n_variants,) Variant positions (physical distance). map_pos : array_like, float, shape (n_variants,) Variant positions (genetic map distance). min_ehh: float, optional Minimum EHH beyond which to truncate integrated haplotype homozygosity calculation. min_maf : float, optional Do not compute integrated haplotype homozogysity for variants with minor allele frequency below this value. include_edges : bool, optional If True, report scores even if EHH does not decay below `min_ehh` before reaching the edge of the data. gap_scale : int, optional Rescale distance between variants if gap is larger than this value. max_gap : int, optional Do not report scores if EHH spans a gap larger than this number of base pairs. is_accessible : array_like, bool, optional Genome accessibility array. If provided, distance between variants will be computed as the number of accessible bases between them. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized IHS scores. Notes ----- This function will calculate IHS for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype array before passing to this function. This function computes IHS comparing the reference and alternate alleles. These can be polarised by switching the sign for any variant where the reference allele is derived. This function returns NaN for any IHS calculations where haplotype homozygosity does not decay below `min_ehh` before reaching the first or last variant. To disable this behaviour, set `include_edges` to True. Note that the unstandardized score is returned. Usually these scores are then standardized in different allele frequency bins. See Also -------- standardize_by_allele_count """ # check inputs h = asarray_ndim(h, 2) check_integer_dtype(h) pos = asarray_ndim(pos, 1) check_dim0_aligned(h, pos) # compute gaps between variants for integration gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible) # setup kwargs kwargs = dict(min_ehh=min_ehh, min_maf=min_maf, include_edges=include_edges) if use_threads and multiprocessing.cpu_count() > 1: # run with threads # create pool pool = ThreadPool(2) # scan forward result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs) # scan backward result_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]), kwargs) # wait for both to finish pool.close() pool.join() # obtain results ihh0_fwd, ihh1_fwd = result_fwd.get() ihh0_rev, ihh1_rev = result_rev.get() # cleanup pool.terminate() else: # run without threads # scan forward ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs) # scan backward ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs) # handle reverse scan ihh0_rev = ihh0_rev[::-1] ihh1_rev = ihh1_rev[::-1] # compute unstandardized score ihh0 = ihh0_fwd + ihh0_rev ihh1 = ihh1_fwd + ihh1_rev score = np.log(ihh1 / ihh0) return score
def __init__(self, data): super(GenotypeAlleleCountsChunkedArray, self).__init__(data) check_ndim(self.values, 3) check_integer_dtype(self.values)
def __init__(self, data): super(HaplotypeChunkedArray, self).__init__(data) check_ndim(self.values, 2) check_integer_dtype(self.values)
def __init__(self, data): super(GenotypeChunkedArray, self).__init__(data) check_ndim(self.values, 3) check_integer_dtype(self.values) self._mask = None self._is_phased = None
def tabulate_state_blocks(x, states, pos=None): """Construct a dataframe where each row provides information about continuous state blocks. Parameters ---------- x : array_like, int 1-dimensional array of state values. states : set Set of states of interest. Any state value not in this set will be ignored. pos : array_like, int, optional Array of positions corresponding to values in `x`. Returns ------- df : DataFrame Examples -------- >>> import allel >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1] >>> df = allel.tabulate_state_blocks(x, states={1, 2}) >>> df state support start_lidx ... size_min size_max is_marginal 0 1 4 -1 ... 5 -1 True 1 2 3 4 ... 4 4 False 2 1 2 8 ... 2 -1 True [3 rows x 9 columns] >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31] >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos) >>> df state support start_lidx ... stop_rpos length_min length_max 0 1 4 -1 ... 14 9 -1 1 2 3 4 ... 30 15 19 2 1 2 8 ... -1 2 -1 [3 rows x 15 columns] """ # check inputs x = asarray_ndim(x, 1) check_integer_dtype(x) x = memoryview_safe(x) # find state transitions switch_points, transitions, observations = state_transitions(x, states) # setup some helpers t = transitions[1:, 0] o = observations[1:] s1 = switch_points[:-1] s2 = switch_points[1:] is_marginal = (s1[:, 0] < 0) | (s2[:, 1] < 0) size_min = s2[:, 0] - s1[:, 1] + 1 size_max = s2[:, 1] - s1[:, 0] - 1 size_max[is_marginal] = -1 # start to build a dataframe items = [ ('state', t), ('support', o), ('start_lidx', s1[:, 0]), ('start_ridx', s1[:, 1]), ('stop_lidx', s2[:, 0]), ('stop_ridx', s2[:, 1]), ('size_min', size_min), ('size_max', size_max), ('is_marginal', is_marginal) ] # deal with optional positions if pos is not None: pos = asarray_ndim(pos, 1) check_dim0_aligned(x, pos) check_integer_dtype(pos) # obtain switch positions switch_positions = np.take(pos, switch_points) # deal with boundary transitions switch_positions[0, 0] = -1 switch_positions[-1, 1] = -1 # setup helpers p1 = switch_positions[:-1] p2 = switch_positions[1:] length_min = p2[:, 0] - p1[:, 1] + 1 length_max = p2[:, 1] - p1[:, 0] - 1 length_max[is_marginal] = -1 items += [ ('start_lpos', p1[:, 0]), ('start_rpos', p1[:, 1]), ('stop_lpos', p2[:, 0]), ('stop_rpos', p2[:, 1]), ('length_min', length_min), ('length_max', length_max), ] import pandas return pandas.DataFrame.from_dict(OrderedDict(items))
def __init__(self, data, chunks=None, name=None, lock=False): super(GenotypeAlleleCountsDask, self).__init__(data, chunks=chunks, name=name, lock=lock) check_integer_dtype(self.values)
def nsl(h, use_threads=True): """Compute the unstandardized number of segregating sites by length (nSl) for each variant, comparing the reference and alternate alleles, after Ferrer-Admetlla et al. (2014). Parameters ---------- h : array_like, int, shape (n_variants, n_haplotypes) Haplotype array. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Notes ----- This function will calculate nSl for all variants. To exclude variants below a given minor allele frequency, filter the input haplotype array before passing to this function. This function computes nSl by comparing the reference and alternate alleles. These can be polarised by switching the sign for any variant where the reference allele is derived. This function does nothing about nSl calculations where haplotype homozygosity extends up to the first or last variant. There may be edge effects. Note that the unstandardized score is returned. Usually these scores are then standardized in different allele frequency bins. See Also -------- standardize_by_allele_count """ # check inputs h = asarray_ndim(h, 2) check_integer_dtype(h) # # check there are no invariant sites # ac = h.count_alleles() # assert np.all(ac.is_segregating()), 'please remove non-segregating sites' if use_threads and multiprocessing.cpu_count() > 1: # create pool pool = ThreadPool(2) # scan forward result_fwd = pool.apply_async(nsl01_scan, args=(h, )) # scan backward result_rev = pool.apply_async(nsl01_scan, args=(h[::-1], )) # wait for both to finish pool.close() pool.join() # obtain results nsl0_fwd, nsl1_fwd = result_fwd.get() nsl0_rev, nsl1_rev = result_rev.get() else: # scan forward nsl0_fwd, nsl1_fwd = nsl01_scan(h) # scan backward nsl0_rev, nsl1_rev = nsl01_scan(h[::-1]) # handle backwards nsl0_rev = nsl0_rev[::-1] nsl1_rev = nsl1_rev[::-1] # compute unstandardized score nsl0 = nsl0_fwd + nsl0_rev nsl1 = nsl1_fwd + nsl1_rev score = np.log(nsl1 / nsl0) return score
def xpnsl(h1, h2, use_threads=True): """Cross-population version of the NSL statistic. Parameters ---------- h1 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the first population. h2 : array_like, int, shape (n_variants, n_haplotypes) Haplotype array for the second population. use_threads : bool, optional If True use multiple threads to compute. Returns ------- score : ndarray, float, shape (n_variants,) Unstandardized XPNSL scores. """ # check inputs h1 = asarray_ndim(h1, 2) check_integer_dtype(h1) h2 = asarray_ndim(h2, 2) check_integer_dtype(h2) check_dim0_aligned(h1, h2) if use_threads and multiprocessing.cpu_count() > 1: # use multiple threads # setup threadpool pool = ThreadPool(min(4, multiprocessing.cpu_count())) # scan forward res1_fwd = pool.apply_async(nsl_scan, args=(h1, )) res2_fwd = pool.apply_async(nsl_scan, args=(h2, )) # scan backward res1_rev = pool.apply_async(nsl_scan, args=(h1[::-1], )) res2_rev = pool.apply_async(nsl_scan, args=(h2[::-1], )) # wait for both to finish pool.close() pool.join() # obtain results nsl1_fwd = res1_fwd.get() nsl2_fwd = res2_fwd.get() nsl1_rev = res1_rev.get() nsl2_rev = res2_rev.get() # cleanup pool.terminate() else: # compute without threads # scan forward nsl1_fwd = nsl_scan(h1) nsl2_fwd = nsl_scan(h2) # scan backward nsl1_rev = nsl_scan(h1[::-1]) nsl2_rev = nsl_scan(h2[::-1]) # handle reverse scans nsl1_rev = nsl1_rev[::-1] nsl2_rev = nsl2_rev[::-1] # compute unstandardized score nsl1 = nsl1_fwd + nsl1_rev nsl2 = nsl2_fwd + nsl2_rev score = np.log(nsl1 / nsl2) return score
def tabulate_state_transitions(x, states, pos=None): """Construct a dataframe where each row provides information about a state transition. Parameters ---------- x : array_like, int 1-dimensional array of state values. states : set Set of states of interest. Any state value not in this set will be ignored. pos : array_like, int, optional Array of positions corresponding to values in `x`. Returns ------- df : DataFrame Notes ----- The resulting dataframe includes one row at the start representing the first state observation and one row at the end representing the last state observation. Examples -------- >>> import allel >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1] >>> df = allel.tabulate_state_transitions(x, states={1, 2}) >>> df lstate rstate lidx ridx 0 -1 1 -1 0 1 1 2 4 5 2 2 1 8 9 3 1 -1 10 -1 >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31] >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos) >>> df lstate rstate lidx ridx lpos rpos 0 -1 1 -1 0 -1 2 1 1 2 4 5 10 14 2 2 1 8 9 28 30 3 1 -1 10 -1 31 -1 """ # check inputs x = asarray_ndim(x, 1) check_integer_dtype(x) x = memoryview_safe(x) # find state transitions switch_points, transitions, _ = state_transitions(x, states) # start to build a dataframe items = [('lstate', transitions[:, 0]), ('rstate', transitions[:, 1]), ('lidx', switch_points[:, 0]), ('ridx', switch_points[:, 1])] # deal with optional positions if pos is not None: pos = asarray_ndim(pos, 1) check_dim0_aligned(x, pos) check_integer_dtype(pos) # find switch positions switch_positions = np.take(pos, switch_points) # deal with boundary transitions switch_positions[0, 0] = -1 switch_positions[-1, 1] = -1 # add columns into dataframe items += [('lpos', switch_positions[:, 0]), ('rpos', switch_positions[:, 1])] import pandas return pandas.DataFrame.from_dict(OrderedDict(items))