def xpehh(h1, h2, pos, min_ehh=0.05):
    """Return the unstandardized XPEHH score for every variant.

    XPEHH (cross-population extended haplotype homozygosity) is computed
    here as the natural log of the ratio between the integrated haplotype
    homozygosity of the first population and that of the second, where each
    integral is the sum of a forward scan and a backward scan.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----
    Every variant is scored. To exclude variants below a given minor allele
    frequency, filter the input haplotype arrays before calling this
    function.

    NaN is returned for any EHH calculation where haplotype homozygosity
    does not decay below `min_ehh` before reaching the first or last
    variant. To disable this behaviour, set `min_ehh` to None.

    Nothing is currently done to account for large gaps between variants,
    so there will be edge effects near any large gaps.

    The score is unstandardized. Usually these scores are then normalised
    in different allele frequency bins.

    The two populations may have different numbers of haplotypes.

    """
    from allel.opt.stats import ihh_scan_int8

    populations = (h1, h2)

    # Forward scans for both populations, first population first.
    forward = [
        ihh_scan_int8(hap, pos, min_ehh=min_ehh)
        for hap in populations
    ]

    # Backward scans: reverse the inputs along the variants axis, scan, then
    # flip the result so it lines up with the forward scan again.
    backward = [
        ihh_scan_int8(hap[::-1], pos[::-1], min_ehh=min_ehh)[::-1]
        for hap in populations
    ]

    # Integrated haplotype homozygosity per population is the sum of both
    # directions; the score is the log ratio between the populations.
    total1, total2 = (fwd + rev for fwd, rev in zip(forward, backward))
    return np.log(total1 / total2)
def test_ihh_scan_int8_d():
    """Edge case: the scan starts from zero haplotype homozygosity.

    The two haplotypes carry different alleles at both variants, and the
    expected result is 0 at every variant whether or not edges are included.
    """
    distances = np.array([10], dtype='f8')
    haplotypes = np.array([[0, 1],
                           [1, 0]], dtype='i1')
    expected = [0, 0]

    # Same expectation with and without edge inclusion.
    for with_edges in (False, True):
        result = ihh_scan_int8(haplotypes, distances, min_ehh=0,
                               include_edges=with_edges)
        assert_array_nanclose(expected, result)
def test_ihh_scan_int8_c():
    """Simple case: one haplotype pair whose homozygosity decays.

    The pair differs at the first variant and is identical at the other
    two; the expected values are the same whether or not edges are included.
    """
    distances = np.array([10, 10], dtype='f8')
    haplotypes = np.array([[0, 1],
                           [0, 0],
                           [0, 0]], dtype='i1')
    expected = [0, 5, 15]

    # First without edges, then with edges; the expectation does not change.
    for with_edges in (False, True):
        result = ihh_scan_int8(haplotypes, distances, min_ehh=0,
                               include_edges=with_edges)
        assert_array_nanclose(expected, result)
def test_ihh_scan_int8_b():
    """One haplotype pair, homozygous over all variants, with a large gap.

    The large gap is encoded as -1 in the gaps array. Without edges every
    variant is expected to be NaN; with edges only the variant after the
    large gap is expected to be NaN.
    """
    distances = np.array([10, -1], dtype='f8')
    haplotypes = np.array([[0, 0],
                           [0, 0],
                           [0, 0]], dtype='i1')

    # (include_edges, expected result) pairs, checked in this order.
    scenarios = (
        (False, [np.nan, np.nan, np.nan]),
        (True, [0, 10, np.nan]),
    )

    for with_edges, expected in scenarios:
        result = ihh_scan_int8(haplotypes, distances, min_ehh=0,
                               include_edges=with_edges)
        assert_array_nanclose(expected, result)
def xpehh(
    h1,
    h2,
    pos,
    map_pos=None,
    min_ehh=0.05,
    include_edges=False,
    gap_scale=20000,
    max_gap=200000,
    is_accessible=None,
    use_threads=True,
):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,), optional
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute. At most 4 threads are
        used, and threading is skipped when only one CPU is reported.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----
    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """
    from allel.opt.stats import ihh_scan_int8

    # check inputs
    h1 = HaplotypeArray(np.asarray(h1, dtype="i1"))
    h2 = HaplotypeArray(np.asarray(h2, dtype="i1"))
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs shared by every scan
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    # reversed views used by the backward scans
    h1_rev = h1[::-1]
    h2_rev = h2[::-1]
    gaps_rev = gaps[::-1]

    # only query the CPU count when threading was requested
    n_threads = min(4, multiprocessing.cpu_count()) if use_threads else 1

    if n_threads > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(n_threads)
        try:
            # scan forward
            res1_fwd = pool.apply_async(ihh_scan_int8, (h1, gaps), kwargs)
            res2_fwd = pool.apply_async(ihh_scan_int8, (h2, gaps), kwargs)

            # scan backward
            res1_rev = pool.apply_async(
                ihh_scan_int8, (h1_rev, gaps_rev), kwargs
            )
            res2_rev = pool.apply_async(
                ihh_scan_int8, (h2_rev, gaps_rev), kwargs
            )

            # wait for all scans to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a worker
            ihh1_fwd = res1_fwd.get()
            ihh2_fwd = res2_fwd.get()
            ihh1_rev = res1_rev.get()
            ihh2_rev = res2_rev.get()
        finally:
            # cleanup, even if submitting or collecting a scan failed, so
            # that worker threads are never left behind
            pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan_int8(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan_int8(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan_int8(h1_rev, gaps_rev, **kwargs)
        ihh2_rev = ihh_scan_int8(h2_rev, gaps_rev, **kwargs)

    # flip the backward scans so they align with the forward scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score