def voight_painting(h):
    """Paint haplotypes, giving each shared haplotype prefix its own integer.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    painting : ndarray, int, shape (n_variants, n_haplotypes)
        Painting array.
    indices : ndarray, int, shape (n_haplotypes,)
        Haplotype indices after sorting by prefix.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 or less than 0.

    """

    # wrap the input without copying
    # NOTE(review): no dtype is forced here, so the array keeps whatever
    # dtype np.asarray infers from the input
    haplotypes = HaplotypeArray(np.asarray(h), copy=False)

    # only 0/1 calls are handled
    if haplotypes.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if haplotypes.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # reorder the haplotype columns by prefix
    indices = haplotypes.prefix_argsort()
    reordered = np.take(haplotypes, indices, axis=1)

    # paint the reordered haplotypes
    painting = paint_shared_prefixes(np.asarray(reordered))

    return painting, indices
def haplotype_diversity(h):
    """Estimate haplotype diversity.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    hd : float
        Haplotype diversity, computed as ``(1 - sum(f**2)) * n / (n - 1)``
        where ``f`` are the distinct haplotype frequencies and ``n`` is the
        number of haplotypes.

    """

    # wrap the input without copying
    haplotypes = HaplotypeArray(h, copy=False)

    # sample size and frequencies of the distinct haplotypes
    n_hap = haplotypes.n_haplotypes
    freqs = haplotypes.distinct_frequencies()

    # sum of squared frequencies, then the n / (n - 1) rescaling
    sum_sq = np.sum(freqs ** 2)
    return (1 - sum_sq) * n_hap / (n_hap - 1)
def test_mean_pairwise_diversity(self):
    """Check mean_pairwise_difference on two- and four-haplotype inputs."""

    # simplest case: two haplotypes, hence a single pairwise comparison
    two_haps = HaplotypeArray([[0, 0],
                               [1, 1],
                               [0, 1],
                               [1, 2],
                               [0, -1],
                               [-1, -1]])
    counts = two_haps.count_alleles()
    observed = allel.stats.mean_pairwise_difference(counts, fill=-1)
    aeq([0, 0, 1, 1, -1, -1], observed)

    # four haplotypes, hence six pairwise comparisons
    four_haps = HaplotypeArray([[0, 0, 0, 0],
                                [0, 0, 0, 1],
                                [0, 0, 1, 1],
                                [0, 1, 1, 1],
                                [1, 1, 1, 1],
                                [0, 0, 1, 2],
                                [0, 1, 1, 2],
                                [0, 1, -1, -1],
                                [-1, -1, -1, -1]])
    counts = four_haps.count_alleles()
    observed = allel.stats.mean_pairwise_difference(counts, fill=-1)
    assert_array_close([0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1], observed)
def haplotype_diversity(h):
    """Estimate haplotype diversity.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    hd : float
        Haplotype diversity: one minus the sum of squared distinct
        haplotype frequencies, scaled by ``n / (n - 1)`` for ``n``
        haplotypes.

    """

    hap_array = HaplotypeArray(h, copy=False)

    # frequencies of the distinct haplotypes and the sample size
    frequencies = hap_array.distinct_frequencies()
    sample_size = hap_array.n_haplotypes

    # unscaled diversity first, then the n / (n - 1) rescaling
    unscaled = 1 - np.sum(frequencies**2)
    hd = unscaled * sample_size / (sample_size - 1)

    return hd
def plot_haplotype_frequencies(h, palette='Paired', singleton_color='w',
                               ax=None):
    """Plot haplotype frequencies.

    Each distinct haplotype is drawn as a vertical span whose width equals
    its count. Haplotypes seen more than once take successive colours from
    the palette; haplotypes seen once are drawn in `singleton_color`.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes

    """

    import matplotlib.pyplot as plt
    import seaborn as sns

    # check inputs
    h = HaplotypeArray(h, copy=False)

    # setup figure
    if ax is None:
        width = plt.rcParams['figure.figsize'][0]
        height = width / 10
        _, ax = plt.subplots(figsize=(width, height))
        sns.despine(ax=ax, left=True)

    # count distinct haplotypes
    hc = h.distinct_counts()

    # setup palette, one colour per non-singleton haplotype
    n_colors = np.count_nonzero(hc > 1)
    colors = iter(sns.color_palette(palette, n_colors))

    # paint frequencies
    # N.B., colours are drawn from an iterator rather than indexed by the
    # position of the haplotype in hc, so the palette can never be overrun
    # regardless of where singletons appear in hc
    x1 = 0
    for c in hc:
        x2 = x1 + c
        color = next(colors) if c > 1 else singleton_color
        ax.axvspan(x1, x2, color=color)
        x1 = x2

    # tidy up
    ax.set_xlim(0, h.shape[1])
    ax.set_yticks([])

    return ax
def test_slice_types(self):
    """Check which indexing operations preserve the HaplotypeArray type."""

    haps = HaplotypeArray(haplotype_data, dtype='i1')

    # slicing rows keeps the wrapper type
    row_slice = haps[1:]
    assert_is_instance(row_slice, HaplotypeArray)

    # slicing columns keeps the wrapper type
    col_slice = haps[:, 1:]
    assert_is_instance(col_slice, HaplotypeArray)

    # a single row is a plain ndarray
    row = haps[0]
    assert_is_instance(row, np.ndarray)
    assert_not_is_instance(row, HaplotypeArray)

    # a single column is a plain ndarray
    col = haps[:, 0]
    assert_is_instance(col, np.ndarray)
    assert_not_is_instance(col, HaplotypeArray)

    # a single item is an int8 scalar
    item = haps[0, 0]
    assert_is_instance(item, np.int8)
    assert_not_is_instance(item, HaplotypeArray)
def garud_h(h):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting
    signatures of soft sweeps, as defined in Garud et al. (2015).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    h1 : float
        H1 statistic (sum of squares of haplotype frequencies).
    h12 : float
        H12 statistic (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : float
        H123 statistic (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : float
        H2/H1 statistic, indicating the "softness" of a sweep.

    """

    # distinct haplotype frequencies
    # NOTE(review): the slicing below treats the leading entries as the most
    # common haplotypes - confirm distinct_frequencies() sorts descending
    freqs = HaplotypeArray(h, copy=False).distinct_frequencies()

    # H1: plain sum of squared frequencies
    h1 = np.sum(freqs ** 2)

    # H12: first two frequencies pooled before squaring
    pooled_two = np.sum(freqs[:2]) ** 2
    h12 = pooled_two + np.sum(freqs[2:] ** 2)

    # H123: first three frequencies pooled before squaring
    pooled_three = np.sum(freqs[:3]) ** 2
    h123 = pooled_three + np.sum(freqs[3:] ** 2)

    # H2/H1: H1 without the contribution of the first frequency, over H1
    h2_h1 = (h1 - freqs[0] ** 2) / h1

    return h1, h12, h123, h2_h1
def ehh_decay(h, truncate=False):
    """Compute the decay of extended haplotype homozygosity (EHH)
    moving away from the first variant.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    truncate : bool, optional
        If True, the return array will exclude trailing zeros.

    Returns
    -------
    ehh : ndarray, float, shape (n_variants, )
        EHH at successive variants from the first variant. May be shorter
        than n_variants if `truncate` is True.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 or less than 0.

    """

    from allel.opt.stats import pairwise_shared_prefix_lengths_int8

    # check inputs
    # N.B., ensure int8 so we can use cython optimisation
    h = HaplotypeArray(np.asarray(h, dtype="i1"), copy=False)
    if h.max() > 1:
        raise NotImplementedError("only biallelic variants are supported")
    if h.min() < 0:
        raise NotImplementedError("missing calls are not supported")

    # initialise
    n_variants = h.n_variants  # number of rows, i.e., variants
    n_haplotypes = h.n_haplotypes  # number of columns, i.e., haplotypes
    n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2

    # compute the shared prefix length between all pairs of haplotypes
    spl = pairwise_shared_prefix_lengths_int8(h)

    # compute EHH by counting the number of shared prefixes extending beyond
    # each variant
    # N.B., use 0 rather than None to request "no minimum" from bincount;
    # passing None is deprecated since NumPy 1.14 and an error in newer
    # releases
    minlength = 0 if truncate else n_variants + 1
    b = np.bincount(spl, minlength=minlength)
    c = np.cumsum(b[::-1])[:-1]
    ehh = (c / n_pairs)[::-1]

    return ehh
def ehh_decay(h, truncate=False):
    """Compute the decay of extended haplotype homozygosity (EHH)
    moving away from the first variant.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    truncate : bool, optional
        If True, the return array will exclude trailing zeros.

    Returns
    -------
    ehh : ndarray, float, shape (n_variants, )
        EHH at successive variants from the first variant. May be shorter
        than n_variants if `truncate` is True.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 or less than 0.

    """

    from allel.opt.stats import pairwise_shared_prefix_lengths_int8

    # check inputs
    # N.B., ensure int8 so we can use cython optimisation
    h = HaplotypeArray(np.asarray(h, dtype='i1'), copy=False)
    if h.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if h.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # initialise
    n_variants = h.n_variants  # number of rows, i.e., variants
    n_haplotypes = h.n_haplotypes  # number of columns, i.e., haplotypes
    n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2

    # compute the shared prefix length between all pairs of haplotypes
    spl = pairwise_shared_prefix_lengths_int8(h)

    # compute EHH by counting the number of shared prefixes extending beyond
    # each variant
    # N.B., bincount's minlength must be an integer: None is deprecated
    # since NumPy 1.14 and an error in newer releases, so pass 0 when no
    # padding is wanted
    minlength = 0 if truncate else n_variants + 1
    b = np.bincount(spl, minlength=minlength)
    c = np.cumsum(b[::-1])[:-1]
    ehh = (c / n_pairs)[::-1]

    return ehh
def garud_h(h):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting
    signatures of soft sweeps, as defined in Garud et al. (2015).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    h1 : float
        H1 statistic (sum of squares of haplotype frequencies).
    h12 : float
        H12 statistic (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : float
        H123 statistic (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : float
        H2/H1 statistic, indicating the "softness" of a sweep.

    """

    hap_array = HaplotypeArray(h, copy=False)

    # NOTE(review): the leading entries are taken to be the most common
    # haplotypes - confirm distinct_frequencies() returns descending order
    frequencies = hap_array.distinct_frequencies()

    def pooled(k):
        # square of the summed first k frequencies plus the squares of the
        # remaining frequencies
        head = np.sum(frequencies[:k])**2
        tail = np.sum(frequencies[k:]**2)
        return head + tail

    h1 = np.sum(frequencies**2)
    h12 = pooled(2)
    h123 = pooled(3)

    # H2 is H1 with the first frequency's squared contribution removed
    h2 = h1 - frequencies[0]**2
    h2_h1 = h2 / h1

    return h1, h12, h123, h2_h1
def test_windowed_diversity(self):
    """Check windowed_diversity over three windows of four haplotypes."""

    # four haplotypes, hence six pairwise comparisons per variant
    haps = HaplotypeArray([[0, 0, 0, 0],
                           [0, 0, 0, 1],
                           [0, 0, 1, 1],
                           [0, 1, 1, 1],
                           [1, 1, 1, 1],
                           [0, 0, 1, 2],
                           [0, 1, 1, 2],
                           [0, 1, -1, -1],
                           [-1, -1, -1, -1]])
    counts = haps.count_alleles()

    # for reference, per-variant mean pairwise diversity with fill=-1 is
    # [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
    positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])
    expected = [(7/6)/10, (13/6)/10, 1/11]

    observed, _, _, _ = allel.stats.windowed_diversity(
        positions, counts, size=10, start=1, stop=31
    )
    assert_array_close(expected, observed)
def test_mean_pairwise_divergence(self):
    """Check mean_pairwise_difference_between on two two-haplotype groups."""

    # simplest case: two haplotypes in each population
    haps = HaplotypeArray([[0, 0, 0, 0],
                           [0, 0, 0, 1],
                           [0, 0, 1, 1],
                           [0, 1, 1, 1],
                           [1, 1, 1, 1],
                           [0, 0, 1, 2],
                           [0, 1, 1, 2],
                           [0, 1, -1, -1],
                           [-1, -1, -1, -1]])

    # first population is columns 0-1, second is columns 2-3
    pop1_counts = haps.take([0, 1], axis=1).count_alleles()
    pop2_counts = haps.take([2, 3], axis=1).count_alleles()

    expected = [0/4, 2/4, 4/4, 2/4, 0/4, 4/4, 3/4, -1, -1]
    observed = allel.stats.mean_pairwise_difference_between(
        pop1_counts, pop2_counts, fill=-1
    )
    aeq(expected, observed)
def test_constructor(self):
    """Check HaplotypeArray rejects bad input and honours a given dtype."""

    # the data argument is required
    with self.assertRaises(TypeError):
        # noinspection PyArgumentList
        HaplotypeArray()

    # inputs that must be rejected, in turn: a string (wrong dtype),
    # floats (wrong dtype), a 1-d list (wrong dimensions), and diploid
    # genotype data (wrong dimensions - use GenotypeArray instead)
    rejected = (
        'foo bar',
        [4., 5., 3.7],
        [1, 2, 3],
        diploid_genotype_data,
    )
    for bad_data in rejected:
        with self.assertRaises(TypeError):
            HaplotypeArray(bad_data)

    # haploid data with an explicit dtype
    haps = HaplotypeArray(haplotype_data, dtype='i1')
    aeq(haplotype_data, haps)
    eq(np.int8, haps.dtype)
def test_windowed_divergence(self):
    """Check windowed_divergence over three windows of two populations."""

    # simplest case: two haplotypes in each population
    haps = HaplotypeArray([[0, 0, 0, 0],
                           [0, 0, 0, 1],
                           [0, 0, 1, 1],
                           [0, 1, 1, 1],
                           [1, 1, 1, 1],
                           [0, 0, 1, 2],
                           [0, 1, 1, 2],
                           [0, 1, -1, -1],
                           [-1, -1, -1, -1]])

    # first population is columns 0-1, second is columns 2-3
    pop1_counts = haps.take([0, 1], axis=1).count_alleles()
    pop2_counts = haps.take([2, 3], axis=1).count_alleles()

    # for reference, per-variant mean pairwise divergence with fill=-1 is
    # [0/4, 2/4, 4/4, 2/4, 0/4, 4/4, 3/4, -1, -1]
    positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])
    expected = [(6/4)/10, (9/4)/10, 0/11]

    observed, _, _, _ = allel.stats.windowed_divergence(
        positions, pop1_counts, pop2_counts, size=10, start=1, stop=31
    )
    assert_array_close(expected, observed)
def voight_painting(h):
    """Paint haplotypes, giving each shared haplotype prefix its own integer.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    painting : ndarray, int, shape (n_variants, n_haplotypes)
        Painting array.
    indices : ndarray, int, shape (n_haplotypes,)
        Haplotype indices after sorting by prefix.

    Raises
    ------
    NotImplementedError
        If any allele value is greater than 1 or less than 0.

    """

    from allel.opt.stats import paint_shared_prefixes_int8

    # coerce to int8, as required by the optimised painting routine
    haplotypes = HaplotypeArray(np.asarray(h, dtype="i1"), copy=False)

    # only 0/1 calls are handled
    if haplotypes.max() > 1:
        raise NotImplementedError("only biallelic variants are supported")
    if haplotypes.min() < 0:
        raise NotImplementedError("missing calls are not supported")

    # reorder the haplotype columns by prefix
    indices = haplotypes.prefix_argsort()
    reordered = np.take(haplotypes, indices, axis=1)

    # paint the reordered haplotypes
    painting = paint_shared_prefixes_int8(reordered)

    return painting, indices
def paint_transmission(parent_haplotypes, progeny_haplotypes):
    """Paint haplotypes inherited from a single diploid parent according to
    their allelic inheritance.

    Parameters
    ----------
    parent_haplotypes : array_like, int, shape (n_variants, 2)
        Both haplotypes from a single diploid parent.
    progeny_haplotypes : array_like, int, shape (n_variants, n_progeny)
        Haplotypes found in progeny of the given parent, inherited from the
        given parent. I.e., haplotypes from gametes of the given parent.

    Returns
    -------
    painting : ndarray, uint8, shape (n_variants, n_progeny)
        An array of integers coded as follows: 1 = allele inherited from
        first parental haplotype; 2 = allele inherited from second parental
        haplotype; 3 = reference allele, also carried by both parental
        haplotypes; 4 = non-reference allele, also carried by both parental
        haplotypes; 5 = non-parental allele; 6 = either or both parental
        alleles missing; 7 = missing allele; 0 = undetermined.

    Raises
    ------
    ValueError
        If `parent_haplotypes` does not contain exactly two haplotypes.

    Examples
    --------
    >>> import allel
    >>> haplotypes = allel.HaplotypeArray([
    ...     [0, 0, 0, 1, 2, -1],
    ...     [0, 1, 0, 1, 2, -1],
    ...     [1, 0, 0, 1, 2, -1],
    ...     [1, 1, 0, 1, 2, -1],
    ...     [0, 2, 0, 1, 2, -1],
    ...     [0, -1, 0, 1, 2, -1],
    ...     [-1, 1, 0, 1, 2, -1],
    ...     [-1, -1, 0, 1, 2, -1],
    ... ], dtype='i1')
    >>> painting = allel.paint_transmission(haplotypes[:, :2],
    ...                                     haplotypes[:, 2:])
    >>> painting
    array([[3, 5, 5, 7],
           [1, 2, 5, 7],
           [2, 1, 5, 7],
           [5, 4, 5, 7],
           [1, 5, 2, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7]], dtype=uint8)

    """

    # wrap and validate inputs
    parent_haplotypes = HaplotypeArray(parent_haplotypes)
    progeny_haplotypes = HaplotypeArray(progeny_haplotypes)
    if parent_haplotypes.n_haplotypes != 2:
        raise ValueError('exactly two parental haplotypes should be provided')

    # each parental haplotype as a column, for broadcasting against progeny
    hap1 = parent_haplotypes[:, 0, np.newaxis]
    hap2 = parent_haplotypes[:, 1, np.newaxis]

    # missingness; the per-variant parental mask is kept in 1-d form for the
    # row-wise assignment below and in column form for broadcasting
    progeny_missing = progeny_haplotypes < 0
    parent_missing = np.any(parent_haplotypes < 0, axis=1)
    parent_missing_col = parent_missing[:, np.newaxis]

    # zygosity of the parent, viewing its two haplotypes as one diploid call
    diplotype = GenotypeArray(parent_haplotypes[:, np.newaxis, :])
    hom_ref = diplotype.is_hom_ref()
    het = diplotype.is_het()
    hom_alt = diplotype.is_hom_alt()

    # calls where neither the progeny nor the parent is missing, and the
    # subset of those where the parent is heterozygous
    callable_calls = ~progeny_missing & ~parent_missing_col
    callable_het = callable_calls & het

    # inheritance states paired with their codes
    # N.B., later entries overwrite earlier ones, so the order matters
    states = [
        (callable_het & (progeny_haplotypes == hap1), INHERIT_PARENT1),
        (callable_het & (progeny_haplotypes == hap2), INHERIT_PARENT2),
        (callable_calls & hom_ref & (progeny_haplotypes == hap1),
         INHERIT_NONSEG_REF),
        (callable_calls & hom_alt & (progeny_haplotypes == hap1),
         INHERIT_NONSEG_ALT),
        (callable_calls
         & (progeny_haplotypes != hap1)
         & (progeny_haplotypes != hap2),
         INHERIT_NONPARENTAL),
        (parent_missing, INHERIT_PARENT_MISSING),
        (progeny_missing, INHERIT_MISSING),
    ]

    # record inheritance states; anything left untouched stays 0
    painting = np.zeros(progeny_haplotypes.shape, dtype='u1')
    for mask, code in states:
        painting[mask] = code

    return painting
def setup_instance(self, data):
    """Wrap `data` in a HaplotypeArray and return it."""
    instance = HaplotypeArray(data)
    return instance