示例#1
0
    def verify_matrix(self, ts):
        """
        Checks that the r2 matrix, the per-site r2 arrays and the pairwise
        r2 values produced by an LdCalculator for ``ts`` all agree.
        """
        num_sites = ts.get_num_sites()
        calculator = tskit.LdCalculator(ts)
        matrix = calculator.get_r2_matrix()
        self.assertEqual(matrix.shape, (num_sites, num_sites))
        # Compare with the get_r2_matrix helper defined outside this method.
        expected = get_r2_matrix(ts)
        self.assertTrue(np.allclose(matrix, expected))

        # Every row of the matrix must match what get_r2_array returns
        # for that site, in both directions.
        for site in range(num_sites):
            num_following = num_sites - site - 1
            forward = calculator.get_r2_array(site, direction=tskit.FORWARD)
            upper = matrix[site, site + 1:]
            self.assertEqual(forward.shape[0], num_following)
            self.assertEqual(upper.shape[0], num_following)
            self.assertTrue(np.allclose(forward, upper))

            backward = calculator.get_r2_array(site, direction=tskit.REVERSE)
            lower = matrix[site, :site]
            self.assertEqual(backward.shape[0], site)
            self.assertEqual(lower.shape[0], site)
            # The REVERSE array is flipped before comparing with the row.
            self.assertTrue(np.allclose(backward[::-1], lower))

        # Finally compare each individual cell with get_r2.
        for row in range(num_sites):
            for col in range(num_sites):
                self.assertAlmostEqual(
                    calculator.get_r2(row, col), matrix[row, col])
示例#2
0
 def verify_max_distance(self, ts):
     """
     Checks that get_r2_array honours the max_distance parameter in
     both directions.
     """
     muts = list(ts.mutations())
     num_muts = len(muts)
     calculator = tskit.LdCalculator(ts)
     matrix = calculator.get_r2_matrix()
     focal = num_muts // 2
     focal_position = muts[focal].position
     for count in range(focal):
         # Forward: limit the distance to that of the count'th mutation
         # after the focal one and expect exactly `count` values.
         span = muts[focal + count].position - focal_position
         forward = calculator.get_r2_array(focal, max_distance=span)
         self.assertEqual(forward.shape[0], count)
         self.assertTrue(np.allclose(
             matrix[focal, focal + 1:focal + 1 + count], forward))
         # Reverse: the same check using the mutations before the focal one.
         span = focal_position - muts[focal - count].position
         backward = calculator.get_r2_array(
             focal, max_distance=span, direction=tskit.REVERSE)
         self.assertEqual(backward.shape[0], count)
         self.assertTrue(np.allclose(
             matrix[focal, focal - count:focal], backward[::-1]))
     # With max_distance set to the sequence length, all other mutations
     # are expected in the result.
     seq_length = ts.get_sequence_length()
     from_first = calculator.get_r2_array(0, max_distance=seq_length)
     self.assertEqual(from_first.shape[0], num_muts - 1)
     self.assertTrue(np.allclose(matrix[0, 1:], from_first))
     from_last = calculator.get_r2_array(
         num_muts - 1, max_distance=seq_length, direction=tskit.REVERSE)
     self.assertEqual(from_last.shape[0], num_muts - 1)
     self.assertTrue(np.allclose(matrix[num_muts - 1, :-1], from_last[::-1]))
示例#3
0
 def test_deprecated_aliases(self):
     """
     The get_-prefixed LdCalculator methods and their unprefixed
     counterparts must return identical results.
     """
     sim = msprime.simulate(20, mutation_rate=10, random_seed=15)
     sim = tsutil.subsample_sites(sim, self.num_test_sites)
     calculator = tskit.LdCalculator(sim)
     # Full matrix.
     matrix_old = calculator.get_r2_matrix()
     matrix_new = calculator.r2_matrix()
     self.assertTrue(np.array_equal(matrix_old, matrix_new))
     # Single row.
     row_old = calculator.get_r2_array(0)
     row_new = calculator.r2_array(0)
     self.assertTrue(np.array_equal(row_old, row_new))
     # Single pair.
     self.assertEqual(calculator.get_r2(0, 1), calculator.r2(0, 1))
def ld_matrix_example(ts):
    """
    Draw the pairwise r2 matrix of ``ts`` as a heat map on a new square
    matplotlib figure, with ticks and spines hidden and a colour bar added.
    """
    r2 = tskit.LdCalculator(ts).r2_matrix()
    # Figure side: matrix size divided by the configured dpi, but never
    # smaller than the configured default figure width.
    side = max(
        r2.shape[0] / pyplot.rcParams["figure.dpi"],
        pyplot.rcParams["figure.figsize"][0],
    )
    fig, ax = pyplot.subplots(figsize=(side, side))
    fig.tight_layout(pad=0)
    image = ax.imshow(r2, interpolation="none", vmin=0, vmax=1, cmap="Blues")
    ax.set_xticks([])
    ax.set_yticks([])
    for edge in ("top", "bottom", "left", "right"):
        ax.spines[edge].set_visible(False)
    pyplot.gcf().colorbar(image, shrink=0.5, pad=0)
示例#5
0
 def verify_max_mutations(self, ts):
     """
     Checks that get_r2_array honours the max_mutations parameter in
     both directions.
     """
     calculator = tskit.LdCalculator(ts)
     matrix = calculator.get_r2_matrix()
     focal = len(list(ts.mutations())) // 2
     for limit in range(focal):
         # Forward: expect exactly `limit` values following the focal index.
         forward = calculator.get_r2_array(focal, max_mutations=limit)
         self.assertEqual(forward.shape[0], limit)
         self.assertTrue(np.allclose(
             matrix[focal, focal + 1:focal + 1 + limit], forward))
         # Reverse: expect exactly `limit` values preceding the focal index.
         backward = calculator.get_r2_array(
             focal, max_mutations=limit, direction=tskit.REVERSE)
         self.assertEqual(backward.shape[0], limit)
         self.assertTrue(np.allclose(
             matrix[focal, focal - limit:focal], backward[::-1]))
示例#6
0
    def test_get_r2_array_multiple_instances(self):
        # Nominal case: each thread builds its own LdCalculator, and the
        # row it computes must match the matrix computed up front.
        ts = self.get_tree_sequence()
        reference_calc = tskit.LdCalculator(ts)
        expected = reference_calc.get_r2_matrix()
        num_sites = expected.shape[0]
        del reference_calc

        def worker(thread_index, results):
            thread_calc = tskit.LdCalculator(ts)
            row = thread_calc.get_r2_array(thread_index)
            # Copy the returned values into a fresh array before storing.
            results[thread_index] = np.array(row)

        results = run_threads(worker, num_sites)
        for site in range(num_sites):
            assert np.allclose(results[site], expected[site, site + 1:])
示例#7
0
    def test_get_r2_single_instance(self):
        # Degenerate case: one LdCalculator instance is shared by all the
        # threads. Only one thread should actually be executing get_r2()
        # at any one time.
        ts = self.get_tree_sequence()
        shared_calc = tskit.LdCalculator(ts)
        expected = shared_calc.get_r2_matrix()
        num_sites = expected.shape[0]

        def worker(thread_index, results):
            # Publish the row first, then fill it in place.
            values = np.zeros(num_sites)
            results[thread_index] = values
            for col in range(num_sites):
                values[col] = shared_calc.get_r2(thread_index, col)

        results = run_threads(worker, num_sites)
        for site in range(num_sites):
            assert np.allclose(results[site], expected[site])
示例#8
0
    def test_get_r2_array_single_instance(self):
        # Degenerate case: one LdCalculator instance is shared by all the
        # threads, so only one thread should be executing get_r2_array()
        # at a time. The buffer is shared between callers, which means the
        # returned values cannot be checked --- they are essentially
        # gibberish. The purpose of this test is to make sure nothing
        # crashes, so only the shape of each result is verified.
        ts = self.get_tree_sequence()
        shared_calc = tskit.LdCalculator(ts)
        num_mutations = ts.get_num_mutations()

        def worker(thread_index, results):
            row = shared_calc.get_r2_array(thread_index)
            results[thread_index] = row.shape

        results = run_threads(worker, num_mutations)
        for site in range(num_mutations):
            assert results[site][0] == num_mutations - site - 1
示例#9
0
    def test_get_r2_multiple_instances(self):
        # Nominal case: each thread builds its own LdCalculator and fills
        # one row of pairwise r2 values, which must match the matrix
        # computed up front.
        ts = self.get_tree_sequence()
        reference_calc = tskit.LdCalculator(ts)
        expected = reference_calc.get_r2_matrix()
        del reference_calc
        num_sites = expected.shape[0]

        def worker(thread_index, results):
            thread_calc = tskit.LdCalculator(ts)
            # Publish the row first, then fill it in place.
            values = np.zeros(num_sites)
            results[thread_index] = values
            for col in range(num_sites):
                values[col] = thread_calc.get_r2(thread_index, col)

        results = run_threads(worker, num_sites)
        for site in range(num_sites):
            self.assertTrue(np.allclose(results[site], expected[site]))
示例#10
0
def main():
    """
    Print pairwise distance and r2 for variants near the start of a
    tree sequence loaded with pyslim.

    Steps:
      1. Parse the command line and load the tree sequence given by
         ``--trees``.
      2. Group the individuals alive at time 0 by population, ignoring
         population 999.
      3. Pick one population at random and 10 of its individuals at
         random, and simplify the tree sequence down to their nodes.
      4. Add mutations with msprime at a hard-coded rate, keeping any
         existing ones.
      5. For every pair of variants with position below 1e4, print the
         distance between them and their r2 value.

    Note: ``--output`` and ``--island`` are parsed but not read anywhere
    in this function.
    """
    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees",
                        required=True,
                        dest="trees",
                        type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        type=str,
        help=
        "What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES"
    )
    parser.add_argument(
        "--island",
        dest="island",
        action="store_true",
        help="Use this flag if the data comes from the Island model")
    args = parser.parse_args()

    ts = pyslim.load(args.trees)

    mut_rate = 1e-7  # HARD CODED, SO CHANGE IF NECESSARY

    # Group the individuals alive at time 0 by population id.
    pops = {}
    for ind_id in ts.individuals_alive_at(0):
        population = ts.individual(ind_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(ind_id)

    # Take a sample of 10 individuals from 1 randomly chosen population and
    # collect their nodes so a new tree sequence can be made from them.
    sampled_indivs = []
    for pop in np.random.choice(list(pops.keys()), 1, replace=False):
        for ind_id in np.random.choice(np.array(pops[pop]), 10, replace=False):
            sampled_indivs.extend(ts.individual(ind_id).nodes)

    r2_tree2 = ts.simplify(sampled_indivs)

    ## Sprinkle mutations onto the coalescent tree
    sprinkled = msprime.mutate(r2_tree2, rate=mut_rate, keep=True)

    # Only variants below position 1e4 are reported. The LD matrix is built
    # on the wider [0, 20000) window; `positions` indexes into its leading
    # rows/columns (assumes variants are yielded in the same order as the
    # matrix rows -- TODO confirm).
    positions = [v.position for v in sprinkled.variants() if v.position < 1e4]

    LD_interval = sprinkled.keep_intervals(np.array([[0, 20000]]))
    LD = tskit.LdCalculator(LD_interval)
    LD_mat = LD.r2_matrix()

    # Visit each unordered pair once (j < i).
    for i in range(len(positions)):
        for j in range(i):
            pw_distances_ij = abs(positions[i] - positions[j])
            if pw_distances_ij > 100000:
                continue
            print(pw_distances_ij, LD_mat[i, j])
示例#11
0
 def worker(thread_index, results):
     # Build a calculator for this call, publish an empty row under this
     # thread's index, then fill it in place with the pairwise r2 values.
     thread_calc = tskit.LdCalculator(ts)
     values = np.zeros(m)
     results[thread_index] = values
     for col in range(m):
         values[col] = thread_calc.get_r2(thread_index, col)
示例#12
0
 def worker(thread_index, results):
     # Build a calculator for this call and store a copy of the r2 array
     # for this thread's index.
     thread_calc = tskit.LdCalculator(ts)
     row = thread_calc.get_r2_array(thread_index)
     results[thread_index] = np.array(row)
示例#13
0
def getLD(tree_sequence, output_name):
    """
    Write within-population and between-population LD tables for a
    tree sequence.

    Two files are written, each with one ``distance,value`` line per
    reported pair of variants:

    * ``"within_" + output_name``: r2 from ``tskit.LdCalculator`` for
      10 individuals drawn from one randomly chosen population. Pairs are
      skipped when either variant has frequency below 0.1 or when the
      variants are more than 10000 apart.
    * ``"between_" + output_name``: squared Pearson correlation of the
      per-population allele frequencies, for 2 individuals drawn from
      each of 20 randomly chosen populations. Variants whose mean
      per-population frequency is below 0.1 are skipped. Every ordered
      pair of distinct variants is written, so each pair appears twice.

    Individuals in population 999 are ignored. Mutations are added with
    ``msprime.mutate`` at a rate of 1e-7. Relies on a ``chunks`` helper
    defined outside this function.

    :param tree_sequence: tree sequence providing ``individuals_alive_at``,
        ``individual`` and ``simplify``.
    :param output_name: suffix used to build the two output file names.
    :return: None
    """
    # Group the individuals alive at time 0 by population id.
    pops = {}
    for ind_id in tree_sequence.individuals_alive_at(0):
        population = tree_sequence.individual(ind_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(ind_id)

    # Sample 10 individuals from 1 population for the within-pop estimate.
    sampled_indivs_1 = []
    for pop in np.random.choice(list(pops.keys()), 1, replace=False):
        for ind_id in np.random.choice(np.array(pops[pop]), 10, replace=False):
            sampled_indivs_1.extend(tree_sequence.individual(ind_id).nodes)

    # Sample `diploids_per_pop` individuals from each of 20 populations for
    # the between-pop estimate. The same constant is used further down to
    # split the genotypes back into per-population chunks.
    diploids_per_pop = 2
    sampled_indivs_2 = []
    for pop in np.random.choice(list(pops.keys()), 20, replace=False):
        for ind_id in np.random.choice(
                np.array(pops[pop]), diploids_per_pop, replace=False):
            sampled_indivs_2.extend(tree_sequence.individual(ind_id).nodes)

    r2_tree_1 = tree_sequence.simplify(sampled_indivs_1)
    mut_tree_1 = msprime.mutate(r2_tree_1, 1e-7)

    r2_tree_2 = tree_sequence.simplify(sampled_indivs_2)
    mut_tree_2 = msprime.mutate(r2_tree_2, 1e-7)

    # Single hard-coded interval to analyse.
    LD_intervals = [[7260000, 7270000 - 1]]

    with open("within_" + output_name, "w") as LD_output_within:
        for interval in LD_intervals:
            LD_interval = mut_tree_1.keep_intervals(np.array([interval]))
            LD_mat = tskit.LdCalculator(LD_interval).r2_matrix()

            # Positions and frequencies are both read from the trimmed tree
            # sequence so that they line up with the rows of LD_mat.
            positions = []
            freq_dict = {}
            for variant in LD_interval.variants():
                ## Calculating the genotype freqs this way assumes that the
                ## alleles are coded as 0s and 1s and that consecutive
                ## genotype entries belong to the same diploid individual.
                allele_freq = ((variant.genotypes[::2] +
                                variant.genotypes[1::2]) / 2).mean()
                positions.append(variant.position)
                freq_dict[variant.position] = allele_freq

            print(len(freq_dict.keys()))
            print(len(positions))

            # Visit each unordered pair once (j < i).
            for i in range(len(positions)):
                if freq_dict[positions[i]] < 0.1:
                    continue
                for j in range(i):
                    if freq_dict[positions[j]] < 0.1:
                        continue
                    pw_distances_ij = abs(positions[i] - positions[j])
                    if pw_distances_ij > 10000:
                        continue
                    LD_output_within.write(",".join(
                        [str(int(pw_distances_ij)),
                         str(LD_mat[i, j])]) + "\n")

    print("Now getting 'LD' between pops")

    with open("between_" + output_name, "w") as LD_output_between:
        for interval in LD_intervals:
            LD_interval = mut_tree_2.keep_intervals(np.array([interval]))

            freq_dict = {}
            for variant in LD_interval.variants():
                ## Same 0/1 coding assumption as above.
                genotypes_as_freqs = (variant.genotypes[::2] +
                                      variant.genotypes[1::2]) / 2

                # One mean frequency per chunk of `diploids_per_pop` values.
                freqs_per_pop = [
                    chunk.mean()
                    for chunk in chunks(genotypes_as_freqs,
                                        int(diploids_per_pop))
                ]

                if sum(freqs_per_pop) / len(freqs_per_pop) < 0.1:
                    continue

                freq_dict[variant.position] = freqs_per_pop

            # Ordered pairs: (a, b) and (b, a) are both written.
            for pair in itertools.product(freq_dict.keys(), repeat=2):
                pw_distances_ij = abs(pair[0] - pair[1])
                if pw_distances_ij == 0:
                    continue
                correlation = scipy.stats.pearsonr(freq_dict[pair[0]],
                                                   freq_dict[pair[1]])[0]
                LD_output_between.write(",".join(
                    [str(int(pw_distances_ij)),
                     str(correlation**2)]) + "\n")
示例#14
0
                                         Ne=Ne,
                                         recombination_map=recombination_map,
                                         model="dtwf",
                                         mutation_rate=mutation_rate)

        first_chrom2 = 'not assigned'
        n1 = 0
        n2 = 0
        for variant in tree_sequence.variants():
            if variant.position > 1e8:
                first_chrom2 = variant.index
                n2 += 1
            if variant.position < (1e8 - 1):
                n1 += 1

        r2 = tskit.LdCalculator(tree_sequence).r2_matrix()
        allele_frequency_filter = Waples2011.get_allele_filter(
            tree_sequence, 0.2, S)
        chrom2_filter = np.logical_and(allele_frequency_filter,
                                       np.append(np.zeros(n1), np.ones(n2)))
        chrom1_filter = np.logical_and(allele_frequency_filter,
                                       np.append(np.ones(n1), np.zeros(n2)))
        filtered_matrix = r2[chrom1_filter][:, chrom2_filter]

        obs_r2_drift = np.mean(Waples2011.get_r2_drift(filtered_matrix, S))
        obs_r2_uncorrected = np.mean(filtered_matrix)

        N_prediction = 1 / (3 * obs_r2_drift)

        r2_corrected_list.append(obs_r2_drift)
        r2_observed_list.append(obs_r2_uncorrected)
示例#15
0
def get_LD_matrix(tree_sequence):
    """Return the r2 matrix that ``tskit.LdCalculator`` computes for ``tree_sequence``."""
    calculator = tskit.LdCalculator(tree_sequence)
    return calculator.r2_matrix()
def myLDcalc(tree_sequence):
    """
    Compute r2 between the sites with id 0 and id 1 in two ways.

    The genotypes of the two sites are read from
    ``tree_sequence.variants()``. Frequencies are computed with NumPy
    reductions rather than the builtin ``sum``, so the totals cannot wrap
    around when the genotype arrays use a narrow integer dtype.

    As a sanity check, the sum of the four haplotype frequencies is
    printed (it is 1 for genotypes coded as 0/1).

    :param tree_sequence: object whose ``variants()`` yields variants with
        ``site.id`` and a NumPy ``genotypes`` array.
    :return: tuple ``(r_unphased**2, r2_phased)`` where ``r_unphased`` is
        ``(pAB - pA*pB) / sqrt(pA(1-pA)pB(1-pB))`` and ``r2_phased`` is
        ``(pAB*pab - pAb*paB)**2 / (pA(1-pA)pB(1-pB))``.
    :raises ValueError: if site 0 or site 1 is not found.
    """
    gen1 = None
    gen2 = None
    for variant in tree_sequence.variants():
        site_id = variant.site.id
        if site_id == 0:
            gen1 = variant.genotypes
        elif site_id == 1:
            gen2 = variant.genotypes
            break
    if gen1 is None or gen2 is None:
        raise ValueError(
            "myLDcalc requires the tree sequence to contain sites 0 and 1")

    # Allele frequencies: mean genotype value at each site.
    pA = np.mean(gen1)
    pB = np.mean(gen2)
    # Haplotype frequencies: fraction of samples with each combination.
    pAB = np.mean(np.logical_and(gen1, gen2))
    pab = np.mean(np.logical_and(1 - gen1, 1 - gen2))
    pAb = np.mean(np.logical_and(gen1, 1 - gen2))
    paB = np.mean(np.logical_and(1 - gen1, gen2))

    print(sum([pAB, pAb, paB, pab]))

    variance_product = pA * (1 - pA) * pB * (1 - pB)
    r_unphased = (pAB - pA * pB) / np.sqrt(variance_product)
    r2_phased = (pAB * pab - pAb * paB)**2 / variance_product
    return r_unphased**2, r2_phased

# Print one tab-separated line per variant: site id, site position, alleles.
for variant in tree_sequence.variants():
    site = variant.site
    print(site.id, site.position, variant.alleles, sep='\t')

# Print the mutation count, then tskit's r2 values (full matrix and the
# 0/1 pair) followed by the hand-rolled calculation for comparison.
mutation_count = count_mutations(tree_sequence)
print(mutation_count)
full_r2_matrix = tskit.LdCalculator(tree_sequence).r2_matrix()
print(full_r2_matrix)
first_pair_r2 = tskit.LdCalculator(tree_sequence).r2(0, 1)
print(first_pair_r2)
manual_r2 = myLDcalc(tree_sequence)
print(manual_r2)
示例#17
0
def get_LD_estimate(Ne, S, n_subpops, m, rate=False, n_loci=100):
    """
    Simulate two chromosomes and estimate Ne from the mean r2 between
    variants on different chromosomes.

    The genome is simulated with msprime under the "dtwf" model as two
    segments joined by a region with recombination rate 0.5. Only the
    first of ``n_subpops`` populations is sampled. Mutations are added
    afterwards with ``msprime.mutate``.

    :param Ne: effective population size passed to ``msprime.simulate``.
    :param S: sample size taken from the first population; also used in
        the sample-size correction of the mean r2.
    :param n_subpops: number of populations.
    :param m: migration parameter passed to ``get_migration_matrix``
        (defined outside this function).
    :param rate: mutation rate for ``msprime.mutate``. When falsy, a rate
        is derived as ``n_loci`` divided by the span-weighted total branch
        length of the simulated trees.
    :param n_loci: numerator used when deriving ``rate``.
    :return: tuple ``(Ne_est, rate)`` with the Ne estimate and the mutation
        rate that was used.
    """
    ## migration matrix
    M = get_migration_matrix(m, n_subpops)
    ## sample: S samples from the first population, none from the others
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=S)
    ] + [
        msprime.PopulationConfiguration(sample_size=0)
        for _ in range(n_subpops - 1)
    ]
    ## mutation and recombination
    # Evaluates to 0: no mutations are placed during the simulation itself.
    mutation_rate = 0 * 5e-9 / 40
    recom_rate = 1e-8
    map_positions = [0, 1e8 - 1, 1e8, 2e8 - 1]
    rates = [recom_rate, 0.5, recom_rate, 0]
    num_loci = int(map_positions[-1])

    recombination_map = msprime.RecombinationMap(positions=map_positions,
                                                 rates=rates,
                                                 num_loci=num_loci)

    tree_sequence = msprime.simulate(
        Ne=Ne,
        recombination_map=recombination_map,
        mutation_rate=mutation_rate,
        population_configurations=population_configurations,
        migration_matrix=M,
        model="dtwf")

    if not rate:
        print("Calculating rate...")
        # Span-weighted total branch length over all trees.
        L = sum(tree.get_length() * tree.get_total_branch_length()
                for tree in tree_sequence.trees())
        rate = n_loci / L

    tree_sequence = msprime.mutate(tree_sequence,
                                   rate=rate,
                                   random_seed=None,
                                   model=None,
                                   keep=False,
                                   start_time=None,
                                   end_time=None)

    r2 = tskit.LdCalculator(tree_sequence).r2_matrix()

    # Collect position and mean genotype of every variant in one pass.
    site_positions = []
    site_freqs = []
    for variant in tree_sequence.variants():
        site_positions.append(variant.position)
        site_freqs.append(variant.genotypes.mean())
    site_positions = np.array(site_positions, dtype=float)
    site_freqs = np.array(site_freqs, dtype=float)

    # Keep only pairs with one common variant (frequency >= 0.05) on each
    # chromosome. Building the mask per site keeps it the same shape as r2
    # even if a variant falls between the two chromosomes.
    common = site_freqs >= 0.05
    on_chrom1 = (site_positions < (1e8 - 1)) & common
    on_chrom2 = (site_positions > 1e8) & common
    fil = np.outer(on_chrom1, on_chrom2).astype(int)

    r2 = np.where(fil, r2, np.zeros_like(r2))

    r2_mean = np.sum(r2) / np.sum(fil)
    # Subtract the sample-size term (1/S) / (1 - 1/S) from the mean r2.
    r2_drift = r2_mean - (1 / S / (1 - 1 / S))
    Ne_est = 1 / (3 * r2_drift)

    return Ne_est, rate
示例#18
0
def getLD(tree_sequence, output_name, selection=False):
    """
    Write pairwise r2 values for a random within-population sample.

    Ten individuals are drawn from one randomly chosen population
    (population 999 is ignored), the tree sequence is simplified to their
    nodes, and mutations are added with ``msprime.mutate`` at rate 1e-7.
    For each analysed interval, every pair of variants at most 10000 apart
    is written to ``output_name`` as ``interval_index,distance,r2``.

    :param tree_sequence: tree sequence providing ``variants``,
        ``individuals_alive_at``, ``individual`` and ``simplify``.
    :param output_name: path of the CSV file to write.
    :param selection: when true, intervals are derived from the variants of
        ``tree_sequence`` whose genotype sum is at least 4000; otherwise
        five fixed intervals are used.
    """
    if selection:
        # A rough way of removing low frequency alleles: only variants with
        # a genotype sum of at least 4000 contribute an interval.
        # NOTE(review): round() picks the *nearest* multiple of 10000, so
        # the resulting interval may not contain the variant itself --
        # confirm whether flooring was intended.
        sel_pos_raw = [
            round(v.position / 10000) * 10000
            for v in tree_sequence.variants()
            if v.genotypes.sum() >= 4000
        ]
        LD_intervals = [[s, s + 9999] for s in set(sel_pos_raw)]
    else:
        ## these genes always evolve neutrally
        LD_intervals = [[9000000, 9010000 - 1], [9200000, 9210000 - 1],
                        [9400000, 9410000 - 1], [9600000, 9610000 - 1],
                        [9800000, 9810000 - 1]]

    print(LD_intervals)

    # Group the individuals alive at time 0 by population id.
    pops = {}
    for ind_id in tree_sequence.individuals_alive_at(0):
        population = tree_sequence.individual(ind_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(ind_id)

    # Take a sample of 10 individuals from 1 population and make a new
    # tree sequence from their nodes.
    sampled_indivs = []
    for pop in np.random.choice(list(pops.keys()), 1, replace=False):
        for ind_id in np.random.choice(np.array(pops[pop]), 10, replace=False):
            sampled_indivs.extend(tree_sequence.individual(ind_id).nodes)

    r2_tree2 = tree_sequence.simplify(sampled_indivs)
    mut_tree = msprime.mutate(r2_tree2, 1e-7)

    with open(output_name, "w") as LD_output:
        for inter, interval in enumerate(LD_intervals):
            LD_interval = mut_tree.keep_intervals(np.array([interval]))
            LD_mat = tskit.LdCalculator(LD_interval).r2_matrix()
            # Read the positions from the trimmed tree sequence so that
            # they line up with the rows of LD_mat.
            positions = [v.position for v in LD_interval.variants()]

            # Visit each unordered pair once (j < i).
            for i in range(len(positions)):
                for j in range(i):
                    pw_distances_ij = abs(positions[i] - positions[j])
                    if pw_distances_ij > 10000:
                        continue
                    LD_output.write(",".join(
                        [str(inter),
                         str(pw_distances_ij),
                         str(LD_mat[i, j])]) + "\n")