Пример #1
0
 def verify_max_distance(self, ts):
     """
     Check that ``max_distance`` limits the r2 arrays as expected.

     The middle mutation is used as the focal one. For every offset below
     the focal index, the positional distance to the mutation that many
     steps away is passed as ``max_distance`` (forwards, then in reverse)
     and the result must have ``offset`` entries matching the corresponding
     slice of the full r2 matrix. Finally the sequence length is used as
     the distance from the first and last mutations, which must return the
     whole remaining row.
     """
     muts = list(ts.mutations())
     calculator = msprime.LdCalculator(ts)
     full = calculator.get_r2_matrix()
     focal = len(muts) // 2
     for offset in range(focal):
         # Forward: distance from the focal mutation to the one `offset` ahead.
         span = muts[focal + offset].position - muts[focal].position
         forward = calculator.get_r2_array(focal, max_distance=span)
         self.assertEqual(forward.shape[0], offset)
         self.assertTrue(np.allclose(
             full[focal, focal + 1:focal + 1 + offset], forward))
         # Reverse: distance back to the mutation `offset` behind. The
         # reverse array is compared after flipping it.
         span = muts[focal].position - muts[focal - offset].position
         backward = calculator.get_r2_array(
             focal, max_distance=span, direction=msprime.REVERSE)
         self.assertEqual(backward.shape[0], offset)
         self.assertTrue(np.allclose(
             full[focal, focal - offset:focal], backward[::-1]))
     seq_len = ts.get_sequence_length()
     count = len(muts)
     # A distance equal to the sequence length must cover every other mutation.
     from_first = calculator.get_r2_array(0, max_distance=seq_len)
     self.assertEqual(from_first.shape[0], count - 1)
     self.assertTrue(np.allclose(full[0, 1:], from_first))
     from_last = calculator.get_r2_array(
         count - 1, max_distance=seq_len, direction=msprime.REVERSE)
     self.assertEqual(from_last.shape[0], count - 1)
     self.assertTrue(np.allclose(full[count - 1, :-1], from_last[::-1]))
Пример #2
0
def get_msprime_ld(sim_data):
    """Show the pairwise r2 matrix of ``sim_data`` as a heat map.

    The matrix returned by ``msprime.LdCalculator(sim_data).get_r2_matrix()``
    is drawn with the "Blues" colour map on a fixed 0..1 scale, the axis
    ticks are removed, and the figure is displayed with ``plt.show()``.
    Nothing is returned.
    """
    r2_matrix = msprime.LdCalculator(sim_data).get_r2_matrix()
    heatmap_options = dict(interpolation="none", vmin=0, vmax=1, cmap="Blues")
    plt.imshow(r2_matrix, **heatmap_options)
    # Tick marks carry no information for a site-by-site matrix.
    for clear_ticks in (plt.xticks, plt.yticks):
        clear_ticks([])
    plt.show()
Пример #3
0
    def verify_matrix(self, ts):
        """
        Cross-check the three ways of obtaining r2 values from LdCalculator.

        The full matrix must be square in the number of sites and agree with
        the reference ``get_r2_matrix(ts)`` helper; every row must agree with
        the forward and reverse ``get_r2_array`` results; and every cell must
        agree with ``get_r2``.
        """
        num_sites = ts.get_num_sites()
        calculator = msprime.LdCalculator(ts)
        full = calculator.get_r2_matrix()
        self.assertEqual(full.shape, (num_sites, num_sites))
        reference = get_r2_matrix(ts)
        self.assertTrue(np.allclose(full, reference))

        # Row by row: the forward array is the part of the row right of the
        # diagonal, the reverse array is the part left of it (flipped).
        for row in range(num_sites):
            ahead = calculator.get_r2_array(row, direction=msprime.FORWARD)
            right_of_diag = full[row, row + 1:]
            self.assertEqual(ahead.shape[0], num_sites - row - 1)
            self.assertEqual(right_of_diag.shape[0], num_sites - row - 1)
            self.assertTrue(np.allclose(ahead, right_of_diag))
            behind = calculator.get_r2_array(row, direction=msprime.REVERSE)
            left_of_diag = full[row, :row]
            self.assertEqual(behind.shape[0], row)
            self.assertEqual(left_of_diag.shape[0], row)
            self.assertTrue(np.allclose(behind[::-1], left_of_diag))

        # Cell by cell: the pairwise accessor must match the matrix entry.
        for row in range(num_sites):
            for col in range(num_sites):
                pair_value = calculator.get_r2(row, col)
                self.assertAlmostEqual(pair_value, full[row, col])
Пример #4
0
def clump_variants(simulation, summary_stats, nhaps, r2_threshold,
                   window_size):
    """
    Greedily clump variants by significance and r2.

    Variants of ``simulation`` whose position appears in ``summary_stats``
    are candidates. Positions are visited in ascending order of the last
    element of their ``summary_stats`` entry (the p-value, according to the
    original comments). For each position still retained, every neighbouring
    mutation within a distance of 125e3 whose r2 with it exceeds
    ``r2_threshold`` is dropped from the candidate set. r2 is computed on the
    sample subset ``range(nhaps[0], nhaps[0] + nhaps[1])``.

    NOTE(review): ``window_size`` is accepted but never used; the LD window
    is hard-coded to 125e3. Confirm whether the parameter was meant to
    replace that constant before wiring it in.

    Parameters
    ----------
    simulation
        Object providing ``variants()``, ``get_num_mutations()`` and
        ``subset()`` (presumably an msprime tree sequence -- confirm).
    summary_stats : dict
        Maps position to a sequence whose last element is the sort key.
    nhaps
        Indexable; entries 0 and 1 define the sample range used for LD.
    r2_threshold : float
        Neighbours with r2 strictly greater than this are removed.
    window_size
        Unused (see note above).

    Returns
    -------
    tuple
        ``(clumped_snps, usable_positions)``: the set of retained positions
        and the dict mapping each retained position to its variant.
    """
    eprint('Subsetting variants to usable list' + current_time())
    usable_positions = {}  # position -> variant (simulation indices)
    for variant in tqdm(simulation.variants(),
                        total=simulation.get_num_mutations()):
        if variant.position in summary_stats:
            usable_positions[variant.position] = variant

    # Most significant first: sort by the last field of each entry.
    ordered_positions = sorted(summary_stats,
                               key=lambda pos: summary_stats[pos][-1])

    # LD is computed on the subset only, so indices must be translated
    # between position and the subset's own mutation numbering.
    eur_subset = simulation.subset(range(nhaps[0], (nhaps[0] + nhaps[1])))
    eur_index_pos = {}
    eur_pos_index = {}
    for mutation in tqdm(eur_subset.mutations(),
                         total=eur_subset.get_num_mutations()):
        eur_index_pos[mutation.index] = mutation.position
        eur_pos_index[mutation.position] = mutation.index
    ld_calc = msprime.LdCalculator(eur_subset)

    # Walk positions by significance; anything already removed by a more
    # significant neighbour is skipped and cannot remove others.
    for position in ordered_positions:
        if position not in usable_positions:
            continue
        focal = eur_pos_index[position]
        r2_forward = ld_calc.get_r2_array(focal,
                                          direction=msprime.FORWARD,
                                          max_distance=125e3)
        for i in np.where(r2_forward > r2_threshold)[0]:
            # Entry i of the forward array is the mutation i + 1 ahead.
            usable_positions.pop(eur_index_pos[focal + i + 1], None)
        r2_reverse = ld_calc.get_r2_array(focal,
                                          direction=msprime.REVERSE,
                                          max_distance=125e3)
        for i in np.where(r2_reverse > r2_threshold)[0]:
            # Entry i of the reverse array is the mutation i + 1 behind.
            usable_positions.pop(eur_index_pos[focal - i - 1], None)

    clumped_snps = set(usable_positions.keys())

    eprint('Starting SNPs: ' + str(len(ordered_positions)) +
           '; SNPs after clumping: ' + str(len(clumped_snps)) + current_time())

    return (clumped_snps, usable_positions)
Пример #5
0
    def pos_r2(ts):
        """Obtain vectors of position differences and r^2 per pair of sites.

        Each unordered pair of sites is reported once (row index greater
        than column index), in row-major order of the r^2 matrix.

        Arguments
        ---------
        ts : msprime.TreeSequence
            tree sequence object

        Returns
        -------
        pos_diff : np.array
            position difference for pairs of snps (float32)

        r2 : np.array
            r^2 as computed between the different sites, aligned with
            ``pos_diff``

        """
        ld_calc = msp.LdCalculator(ts)
        r2_est = ld_calc.r2_matrix()
        pos = np.array([s.position for s in ts.sites()], dtype=np.float32)
        # Pairwise |pos[i] - pos[j]| through broadcasting. Only the strict
        # lower triangle (i > j) is kept so each pair appears once; the
        # diagonal and upper triangle are zeroed.
        pos_diff_mat = np.tril(np.abs(pos[:, None] - pos[None, :]), k=-1)

        # Extract entries that matter (and are matched): a single mask is
        # applied to both matrices so the two outputs stay aligned.
        # NOTE(review): positions are cast to float32 above, so two sites
        # whose positions round to the same float32 value give a difference
        # of 0 and are silently excluded here -- confirm this is acceptable.
        keep = pos_diff_mat > 0
        r2 = r2_est[keep]
        pos_diff = pos_diff_mat[keep]
        return (pos_diff, r2)
Пример #6
0
    def _pos_r2(ts):
        """Obtain vectors of position differences and r^2 per pair of sites.

        Each unordered pair of sites is reported once (row index greater
        than column index), in row-major order of the r^2 matrix. NaN r^2
        values are replaced by 1.0.

        Arguments
        ---------
        ts : msprime.TreeSequence
            tree sequence object

        Returns
        -------
        pos_diff : np.array
            position difference for pairs of snps (float32)

        r2 : np.array
            r^2 as computed between the different sites, aligned with
            ``pos_diff``

        """
        ld_calc = msp.LdCalculator(ts)
        r2_est = ld_calc.r2_matrix()
        pos = np.array([s.position for s in ts.sites()], dtype=np.float32)
        # Pairwise |pos[i] - pos[j]| through broadcasting. Only the strict
        # lower triangle (i > j) is kept so each pair appears once; the
        # diagonal and upper triangle are zeroed.
        pos_diff_mat = np.tril(np.abs(pos[:, None] - pos[None, :]), k=-1)

        # Extract entries that matter (and are matched): a single mask is
        # applied to both matrices so the two outputs stay aligned.
        # NOTE(review): positions are cast to float32 above, so two sites
        # whose positions round to the same float32 value give a difference
        # of 0 and are silently excluded here -- confirm this is acceptable.
        keep = pos_diff_mat > 0
        r2 = r2_est[keep]
        pos_diff = pos_diff_mat[keep]

        # Set undefined values to be 1 (due to non-segregating issues...).
        # Boolean indexing above produced a copy, so r2_est is not modified.
        r2[np.isnan(r2)] = 1.0
        return (pos_diff, r2)
Пример #7
0
 def test_deprecated_aliases(self):
     """
     The ``get_``-prefixed LdCalculator methods and their unprefixed
     counterparts must return identical results.
     """
     simulated = msprime.simulate(20, mutation_rate=10, random_seed=15)
     ts = tsutil.subsample_sites(simulated, self.num_test_sites)
     calculator = msprime.LdCalculator(ts)
     # Full matrix.
     self.assertTrue(np.array_equal(
         calculator.get_r2_matrix(), calculator.r2_matrix()))
     # Single row, starting from the first site.
     self.assertTrue(np.array_equal(
         calculator.get_r2_array(0), calculator.r2_array(0)))
     # Single pair.
     self.assertEqual(calculator.get_r2(0, 1), calculator.r2(0, 1))
Пример #8
0
 def thread_worker(thread_index):
     """
     Mark sites to keep within this thread's chunk of ``mask``.

     Starting at the first site of the chunk, the current site is marked
     in ``mask`` and the walk jumps to the first following site whose r2
     with it is at most ``thresh``. The walk ends once the current site is
     past the end of the chunk, no sites follow it, or no following site
     is at or below the threshold.

     Reads ``ts``, ``mask``, ``num_threads`` and ``thresh`` from the
     enclosing scope and writes to ``mask``.
     """
     chunk_size = int(math.ceil(len(mask) / num_threads))
     next_site = thread_index * chunk_size
     # Because chunk_size is rounded up, the last thread(s) can be handed a
     # start index beyond the final site; there is nothing to do for them
     # and indexing mask would raise IndexError.
     if next_site >= len(mask):
         return
     ld = msp.LdCalculator(ts)
     stop = next_site + chunk_size
     while True:
         mask[next_site] = True
         # True where a following site is at or below the r2 threshold.
         r2 = (ld.r2_array(next_site) <= thresh)
         if next_site > stop or len(r2) == 0 or not np.any(r2):
             break
         # argmax on a boolean array gives the index of the first True.
         next_site += (1 + np.argmax(r2))
Пример #9
0
 def verify_max_mutations(self, ts):
     """
     Check that ``max_mutations`` caps the length of the r2 arrays.

     With the middle mutation as the focal one, each limit below the focal
     index must yield exactly that many values in both directions, equal
     to the adjacent slice of the full r2 matrix.
     """
     muts = list(ts.mutations())
     calculator = msprime.LdCalculator(ts)
     full = calculator.get_r2_matrix()
     focal = len(muts) // 2
     for limit in range(focal):
         forward = calculator.get_r2_array(focal, max_mutations=limit)
         self.assertEqual(forward.shape[0], limit)
         self.assertTrue(np.allclose(
             full[focal, focal + 1:focal + 1 + limit], forward))
         # The reverse array is compared after flipping it.
         backward = calculator.get_r2_array(
             focal, max_mutations=limit, direction=msprime.REVERSE)
         self.assertEqual(backward.shape[0], limit)
         self.assertTrue(np.allclose(
             full[focal, focal - limit:focal], backward[::-1]))
Пример #10
0
 def thread_worker(thread_index):
     """
     Find, for each focal mutation in this thread's chunk, the indexes of
     mutations whose r2 with it is at least ``r2_threshold``.

     Uses its own LdCalculator. Reads ``tree_sequence``,
     ``focal_mutations``, ``num_threads``, ``max_distance`` and
     ``r2_threshold`` from the enclosing scope, stores each index array in
     ``results`` keyed by the focal mutation, and advances
     ``progress_bar`` once per focal mutation.
     """
     calculator = msprime.LdCalculator(tree_sequence)
     per_thread = int(math.ceil(len(focal_mutations) / num_threads))
     first = thread_index * per_thread
     last = first + per_thread
     for site in focal_mutations[first:last]:
         # Entry i of the reverse array is the mutation i + 1 behind.
         behind = calculator.get_r2_array(
             site, max_distance=max_distance, direction=msprime.REVERSE)
         behind_hits = site - np.nonzero(behind >= r2_threshold)[0] - 1
         # Entry i of the forward array is the mutation i + 1 ahead.
         ahead = calculator.get_r2_array(
             site, max_distance=max_distance, direction=msprime.FORWARD)
         ahead_hits = site + np.nonzero(ahead >= r2_threshold)[0] + 1
         # Flip the reverse hits so the combined indexes are ascending.
         results[site] = np.concatenate((behind_hits[::-1], ahead_hits))
         progress_bar.update()
Пример #11
0
def ld_matrix_example():
    """Simulate a tree sequence and save its r2 matrix as an SVG heat map.

    The simulation uses 100 samples with a fixed random seed. The matrix is
    drawn on a square figure without ticks or spines, with a colour bar,
    and written to ``_static/ld.svg``.
    """
    ts = msprime.simulate(
        100, recombination_rate=10, mutation_rate=20, random_seed=1)
    r2_matrix = msprime.LdCalculator(ts).get_r2_matrix()
    # Now plot this matrix. The figure side is the matrix size divided by
    # the save DPI, but never smaller than the default figure width.
    side = max(
        r2_matrix.shape[0] / pyplot.rcParams['savefig.dpi'],
        pyplot.rcParams['figure.figsize'][0])
    fig, ax = pyplot.subplots(figsize=(side, side))
    fig.tight_layout(pad=0)
    image = ax.imshow(
        r2_matrix, interpolation="none", vmin=0, vmax=1, cmap="Blues")
    ax.set_xticks([])
    ax.set_yticks([])
    for edge in ('top', 'bottom', 'left', 'right'):
        ax.spines[edge].set_visible(False)
    pyplot.gcf().colorbar(image, shrink=.5, pad=0)
    pyplot.savefig("_static/ld.svg")
Пример #12
0
def two_bins(NA, N1, N2, Ts, M1, M2, replicates=500000):
    """Simulate a two-population model and summarise the replicates.

    Population 0 has initial size ``N1`` and no samples; population 1 has
    initial size ``N2`` and 50 samples. The migration matrix starts with
    rate ``M2`` at index (0, 1); at time ``Ts / 2`` that rate changes to
    ``M1``; at time ``Ts`` all lineages in population 1 are moved to
    population 0. Each replicate simulates a sequence of length 100000
    with mutation rate 1e-7 and recombination rate 1e-8.

    Only the variance of the per-replicate mutation count is returned.
    Earlier revisions also computed pairwise diversity and the variance of
    the r2 matrix for every replicate but never used them; that work has
    been removed.

    Parameters
    ----------
    NA
        Passed to ``msprime.simulate`` as ``Ne``.
    N1, N2
        Initial sizes of populations 0 and 1.
    Ts
        Time of the mass migration; the rate change happens at ``Ts / 2``.
    M1, M2
        Migration rates at index (0, 1) before/after ``Ts / 2`` as
        described above.
    replicates : int, optional
        Number of independent replicates to simulate (default 500000, the
        previously hard-coded value).

    Returns
    -------
    np.ndarray
        One-element array holding the variance of the number of mutations
        across replicates.
    """
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=0, initial_size=N1),
        msprime.PopulationConfiguration(sample_size=50, initial_size=N2)
    ]
    migration_matrix = [[0, M2], [0, 0]]
    demographic_events = [
        msprime.MigrationRateChange(time=Ts / 2, rate=M1, matrix_index=(0, 1)),
        msprime.MassMigration(time=Ts, source=1, destination=0, proportion=1.0)
    ]

    sim = msprime.simulate(Ne=NA,
                           population_configurations=population_configurations,
                           migration_matrix=migration_matrix,
                           demographic_events=demographic_events,
                           mutation_rate=1e-7,
                           recombination_rate=1e-8,
                           length=100000,
                           num_replicates=replicates)
    # Consume the replicate iterator lazily so only one tree sequence is
    # alive at a time; the counts are stored as floats as before.
    seg = np.fromiter((ts.get_num_mutations() for ts in sim),
                      dtype=float, count=replicates)

    return (np.array([np.var(seg)]))
Пример #13
0
    def test_get_r2_array_multiple_instances(self):
        """
        Nominal threaded use: every thread builds its own LdCalculator and
        must reproduce the matching row of the reference r2 matrix.
        """
        ts = self.get_tree_sequence()
        # The calculator used for the reference matrix is not kept around;
        # only the matrix itself is needed by the checks below.
        expected = msprime.LdCalculator(ts).get_r2_matrix()
        num_rows = expected.shape[0]

        def worker(thread_index, results):
            # Private calculator per thread; copy the row into a new array.
            own_calc = msprime.LdCalculator(ts)
            r2_row = own_calc.get_r2_array(thread_index)
            results[thread_index] = np.array(r2_row)

        results = run_threads(worker, num_rows)
        for row in range(num_rows):
            self.assertTrue(
                np.allclose(results[row], expected[row, row + 1:]))
Пример #14
0
    def test_get_r2_array_single_instance(self):
        """
        Degenerate threaded use: one LdCalculator shared by all threads.

        Per the original author's notes, only one thread should be inside
        get_r2_array() at a time, and because the buffer is shared the
        returned values are meaningless. The test therefore only checks
        that nothing crashes and that each result has the expected length.
        """
        ts = self.get_tree_sequence()
        shared_calc = msprime.LdCalculator(ts)
        num_mutations = ts.get_num_mutations()

        def worker(thread_index, results):
            # Only the shape is recorded; the contents are not reliable.
            r2_row = shared_calc.get_r2_array(thread_index)
            results[thread_index] = r2_row.shape

        results = run_threads(worker, num_mutations)
        for row in range(num_mutations):
            self.assertEqual(results[row][0], num_mutations - row - 1)
Пример #15
0
    def test_get_r2_single_instance(self):
        """
        Degenerate threaded use of get_r2(): one LdCalculator shared by all
        threads. Per the original author's notes, only one thread should be
        executing get_r2() at a time. Each thread rebuilds one row of the
        matrix pair by pair and it must match the reference matrix.
        """
        ts = self.get_tree_sequence()
        shared_calc = msprime.LdCalculator(ts)
        expected = shared_calc.get_r2_matrix()
        num_rows = expected.shape[0]

        def worker(thread_index, results):
            # Publish the row buffer first, then fill it in place.
            values = np.zeros(num_rows)
            results[thread_index] = values
            for col in range(num_rows):
                values[col] = shared_calc.get_r2(thread_index, col)

        results = run_threads(worker, num_rows)
        for row in range(num_rows):
            self.assertTrue(np.allclose(results[row], expected[row]))
Пример #16
0
 def worker(thread_index, results):
     """
     Compute the r2 array for site ``thread_index`` with a calculator
     private to this call and store a copy in ``results[thread_index]``.
     ``ts`` is read from the enclosing scope.
     """
     own_calc = msprime.LdCalculator(ts)
     r2_row = own_calc.get_r2_array(thread_index)
     results[thread_index] = np.array(r2_row)
Пример #17
0
 def worker(thread_index, results):
     """
     Fill row ``thread_index`` of the r2 matrix pair by pair, using a
     calculator private to this call. ``ts`` and ``m`` are read from the
     enclosing scope.
     """
     own_calc = msprime.LdCalculator(ts)
     # Publish the row buffer first, then fill it in place.
     values = np.zeros(m)
     results[thread_index] = values
     for col in range(m):
         values[col] = own_calc.get_r2(thread_index, col)