def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, chunks):
    """Check windowed divergence against scikit-allel's moving_statistic.

    Simulates a tree sequence, splits the samples into cohorts, computes
    windowed divergence with sgkit, and compares the off-diagonal entries
    to a divergence computed directly from per-cohort allele counts with
    scikit-allel.
    """
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    div[:, np.arange(2), np.arange(2)] = np.nan

    # Calculate divergence using scikit-allel moving_statistic
    # (Don't use windowed_divergence, since it treats the last window differently)
    # Use the same cohort subsets as the sgkit computation above; the previous
    # [:1]/[1:] split compared different sample groupings, which is why the
    # two results disagreed.
    ds1 = count_variant_alleles(ts_to_dataset(
        ts, samples=subsets[0]))  # type: ignore[no-untyped-call]
    ds2 = count_variant_alleles(ts_to_dataset(
        ts, samples=subsets[1]))  # type: ignore[no-untyped-call]
    ac1 = ds1["variant_allele_count"].values
    ac2 = ds2["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
    np.testing.assert_allclose(
        div[:-1], ska_div)  # scikit-allel has final window missing
def test_mean_pairwise_divergence(self):
    """Simplest case: two haplotypes drawn from each of two populations."""
    haps = HaplotypeArray([[0, 0, 0, 0],
                           [0, 0, 0, 1],
                           [0, 0, 1, 1],
                           [0, 1, 1, 1],
                           [1, 1, 1, 1],
                           [0, 0, 1, 2],
                           [0, 1, 1, 2],
                           [0, 1, -1, -1],
                           [-1, -1, -1, -1]])
    pop1 = haps.take([0, 1], axis=1)
    pop2 = haps.take([2, 3], axis=1)
    counts1 = pop1.count_alleles()
    counts2 = pop2.count_alleles()
    # Per-variant expected values; -1 marks variants that are entirely
    # missing in one of the populations.
    expect = [0 / 4, 2 / 4, 4 / 4, 2 / 4, 0 / 4, 4 / 4, 3 / 4, -1, -1]
    actual = allel.mean_pairwise_difference_between(counts1, counts2, fill=-1)
    aeq(expect, actual)
def test_pairwise_distance_multidim(self):
    """Pairwise distance over multidimensional allele counts with a custom metric."""
    g = GenotypeArray([[[0, 0], [0, 0]],
                       [[1, 1], [1, 1]],
                       [[1, 1], [2, 2]],
                       [[0, 0], [0, 1]],
                       [[0, 0], [0, 2]],
                       [[1, 1], [1, 2]],
                       [[0, 1], [0, 1]],
                       [[0, 1], [1, 2]],
                       [[0, 0], [-1, -1]],
                       [[0, 1], [-1, -1]],
                       [[-1, -1], [-1, -1]]], dtype='i1')
    gac = g.to_allele_counts()

    def metric(ac1, ac2):
        # Distance = mean pairwise difference summed over variants.
        mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
        return mpd.sum()

    # With two samples there is exactly one pair, so the expected result is
    # the metric applied to that pair's allele counts.
    expect = [metric(gac[:, 0], gac[:, 1])]
    actual = allel.pairwise_distance(gac, metric)
    aeq(expect, actual)
def metric(ac1, ac2):
    """Distance between two allele-count arrays: mean pairwise difference
    summed across variants."""
    return allel.mean_pairwise_difference_between(ac1, ac2, fill=0).sum()
def RecombinationRepper(pooled_args):
    """Run `reps` simulation replicates at one recombination rate and
    summarise population-genetic statistics per replicate.

    Arguments are packed into a single list (for use with a multiprocessing
    pool's single-argument map):
        pooled_args[0]: r_rate - recombination rate passed to the simulator
        pooled_args[1]: model_function - NOTE(review): unpacked but never
            used anywhere in this function; confirm whether it was meant to
            replace the hard-coded migration_simulation_2patch call
        pooled_args[2]: reps - number of independent tree-sequence replicates
        pooled_args[3]: samples - number of mutational resamplings per replicate

    Returns a list:
        [r_rate, per-rep mean Fst, per-rep mean Fst SE, per-rep mean
        shuffled-Fst SE, per-rep Fst std-dev, per-rep marginal tree counts,
        per-rep mean Dxy, per-rep mean Tajima's D, per-rep mean diversity,
        per-rep mean H12-like statistic].
    """
    r_rate = pooled_args[0]
    model_function = pooled_args[1]  # NOTE(review): never referenced below
    reps = pooled_args[2]
    samples = pooled_args[3]
    # Per-replicate summaries; one entry appended per outer-loop iteration.
    mean_Fst_dists = []
    var_Fst_dists = []
    mean_SE_dists = []
    mean_SE_dists_shuf = []
    tree_counts = []
    mean_Dxy_dists = []
    mean_Tajima_dists = []
    mean_diversity_dists = []
    mean_H12_dists = []
    for t in range(reps):
        print(t)
        # Simulate a fresh tree sequence for this replicate (helper defined
        # elsewhere in the project).
        new_tree = migration_simulation_2patch(r_rate)
        # Count marginal trees by iterating over them.
        count = 0
        for r in new_tree.trees():
            count += 1
            # print(t, count)
        tree_counts.append(count)
        # Per-mutational-sample distributions for this replicate.
        new_tree_dist_Fst = []
        new_tree_dist_SE = []
        new_tree_dist_SE_shuf = []
        new_tree_dist_Dxy = []
        new_tree_dist_diversity = []
        new_tree_dist_Tajima = []
        new_tree_dist_H12 = []
        for i in range(samples):
            # Re-throw mutations onto the same tree until at least one
            # variant exists, so the statistics below are defined.
            muts = 0
            while muts == 0:
                mutated_tree = msprime.mutate(new_tree, 1.25e-7)
                muts = len([v for v in mutated_tree.variants()])
            # Haplotype matrix for scikit-allel (variants x haploid samples).
            msprime_genotype_matrix = mutated_tree.genotype_matrix()
            haplotype_array = allel.HaplotypeArray(msprime_genotype_matrix)
            # Pair consecutive chromosomes into diploid genotypes.
            genotype_array = haplotype_array.to_genotypes(ploidy=2)
            # NOTE(review): random_state=0 yields the identical permutation on
            # every iteration - confirm this is intended for the shuffled
            # control rather than a fresh shuffle per sample.
            shuffled_genotypes = shuffle(genotype_array, random_state=0)
            # Allele counts per deme: haplotypes 0-99 vs 100-199.
            ac1 = haplotype_array.count_alleles(
                subpop=[s for s in range(0, 100)])
            ac2 = haplotype_array.count_alleles(
                subpop=[s for s in range(100, 200)])
            ## Calculate Tajima's D (deme 1 only)
            Tajimas_D = allel.tajima_d(ac1)
            ## Calculate Dxy: summed mean pairwise difference between demes,
            ## divided by 10000. (presumably the simulated sequence length -
            ## TODO confirm against migration_simulation_2patch)
            dxy = sum(allel.mean_pairwise_difference_between(ac1, ac2)) / 10000.
            ## Calculate Garud's H statistics for the population
            ## Grab the haplotypes for 400 SNPs from deme 1 (columns 0-99)
            hapslice = haplotype_array[:400, 0:100]
            H_vector = allel.garud_h(hapslice)
            ## Calculate Diversity (deme 1), same 10000. length scaling
            pi = sum(allel.mean_pairwise_difference(ac1)) / 10000.
            # Diploid subpopulations: genotypes 0-49 vs 50-99 (the 200
            # haplotypes pair into 100 diploids).
            subpopulations = [[p for p in range(0, 50)],
                              [z for z in range(50, 100)]]
            # Block-averaged Weir & Cockerham Fst (blocks of 100 variants);
            # index [0] is read as the mean and [1] as its standard error.
            mean_fst = allel.average_weir_cockerham_fst(genotype_array,
                                                        blen=100,
                                                        subpops=subpopulations)
            mean_fst_shuf = allel.average_weir_cockerham_fst(
                shuffled_genotypes, blen=100, subpops=subpopulations)
            new_tree_dist_Fst.append(mean_fst[0])
            new_tree_dist_SE.append(mean_fst[1])
            new_tree_dist_SE_shuf.append(mean_fst_shuf[1])
            new_tree_dist_Tajima.append(Tajimas_D)
            new_tree_dist_Dxy.append(dxy)
            # H_vector[1] - presumably H12 from garud_h's return tuple; verify.
            new_tree_dist_H12.append(H_vector[1])
            new_tree_dist_diversity.append(pi)
        # Collapse this replicate's sample distributions to summary scalars.
        mean_Fst_dists.append(np.mean(new_tree_dist_Fst))
        var_Fst_dists.append(np.sqrt(np.var(new_tree_dist_Fst)))
        mean_SE_dists.append(np.mean(new_tree_dist_SE))
        mean_SE_dists_shuf.append(np.mean(new_tree_dist_SE_shuf))
        mean_Dxy_dists.append(np.mean(new_tree_dist_Dxy))
        mean_Tajima_dists.append(np.mean(new_tree_dist_Tajima))
        mean_H12_dists.append(np.mean(new_tree_dist_H12))
        mean_diversity_dists.append(np.mean(new_tree_dist_diversity))
    return [
        r_rate, mean_Fst_dists, mean_SE_dists, mean_SE_dists_shuf,
        var_Fst_dists, tree_counts, mean_Dxy_dists, mean_Tajima_dists,
        mean_diversity_dists, mean_H12_dists
    ]