def test_one_generation_no_deep_history(self):
    """
    Simulate one generation without deep history, simplify the tables, and
    check that in every tree the roots split the samples into disjoint
    groups that together make up the full sample set.
    """
    N = 20
    tables = wf_sim(N=N, ngens=1, deep_history=False, seed=self.random_seed)
    self.assertEqual(tables.nodes.num_rows, 2 * N)
    self.assertGreater(tables.edges.num_rows, 0)
    # The simulation records ancestry only.
    for table_name in ("sites", "mutations", "migrations"):
        self.assertEqual(getattr(tables, table_name).num_rows, 0)

    node_table, edge_table = tables.nodes, tables.edges
    is_sample = node_table.flags == msprime.NODE_IS_SAMPLE
    sample_ids = np.where(is_sample)[0].astype(np.int32)
    msprime.sort_tables(nodes=node_table, edges=edge_table)
    msprime.simplify_tables(
        samples=sample_ids, nodes=node_table, edges=edge_table)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)

    ts = msprime.load_tables(nodes=node_table, edges=edge_table)
    for tree in ts.trees():
        seen = set()
        for root in tree.roots:
            under_root = set(tree.samples(root))
            # No sample may sit below two different roots.
            self.assertEqual(len(under_root & seen), 0)
            seen |= under_root
        self.assertEqual(seen, set(ts.samples()))
def test_with_recurrent_mutations(self):
    """
    Put Jukes-Cantor mutations on a simulated history and check that the
    single site and its mutations are still present after simplification.
    """
    # actually with only ONE site, at 0.0
    N = 10
    ngens = 100
    tables = wf_sim(
        N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    msprime.sort_tables(**tables.asdict())
    unmutated_ts = msprime.load_tables(**tables.asdict())
    ts = tsutil.jukes_cantor(unmutated_ts, 1, 10, seed=self.random_seed)
    tables = ts.tables
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    node_flags = tables.nodes.flags
    sample_ids = np.where(
        node_flags == msprime.NODE_IS_SAMPLE)[0].astype(np.int32)

    # before simplify
    for haplotype in ts.haplotypes():
        self.assertEqual(len(haplotype), 1)

    # after simplify
    msprime.simplify_tables(
        samples=sample_ids,
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    ts = msprime.load_tables(**tables.asdict())
    self.assertEqual(ts.sample_size, N)
    for haplotype in ts.haplotypes():
        self.assertEqual(len(haplotype), ts.num_sites)
def test_overlapping_generations(self):
    """
    Simulate with survival=0.85 and check that every tree of the simplified
    tree sequence has exactly one root.
    """
    tables = wf_sim(N=30, ngens=10, survival=0.85, seed=self.random_seed)
    for table_name in ("nodes", "edges"):
        self.assertGreater(getattr(tables, table_name).num_rows, 0)
    for table_name in ("sites", "mutations", "migrations"):
        self.assertEqual(getattr(tables, table_name).num_rows, 0)

    node_table, edge_table = tables.nodes, tables.edges
    msprime.sort_tables(nodes=node_table, edges=edge_table)
    is_sample = node_table.flags == msprime.NODE_IS_SAMPLE
    sample_ids = np.where(is_sample)[0].astype(np.int32)
    msprime.simplify_tables(
        samples=sample_ids, nodes=node_table, edges=edge_table)
    ts = msprime.load_tables(nodes=node_table, edges=edge_table)
    for tree in ts.trees():
        self.assertEqual(tree.num_roots, 1)
def simplify(self, generation, ancestry):
    """
    Fold the ancestry recorded since the last call into the internal
    node/edgeset tables, then sort and simplify them.

    Wall-clock time spent in each phase (prepping, appending, sorting,
    simplifying) is added to the matching ``__time_*`` counter.

    :param generation: current generation; its distance from
        ``self.last_gc_time`` is added to the times of existing nodes.
    :param ancestry: object providing ``prep_for_gc()``, ``nodes``,
        ``edges`` and ``samples``.
    :return: tuple ``(True, n)`` where ``n`` is the node table's row count
        after simplification.
    """
    # update node times:
    num_existing = self.__nodes.num_rows
    if num_existing > 0:
        elapsed = float(generation) - self.last_gc_time
        shifted_time = self.__nodes.time
        shifted_time += elapsed
        self.last_gc_time = generation
        self.__nodes.set_columns(
            flags=np.ones([num_existing], dtype=np.uint32),
            population=self.__nodes.population,
            time=shifted_time)

    # Phase 1: expose the tracker's buffers as numpy arrays.
    tick = time.time()
    ancestry.prep_for_gc()
    new_nodes = np.array(ancestry.nodes, copy=False)
    new_edges = np.array(ancestry.edges, copy=False)
    samples = np.array(ancestry.samples, copy=False)
    new_flags = np.ones([len(new_nodes)], dtype=np.uint32)
    tock = time.time()
    self.__time_prepping += (tock - tick)

    # Phase 2: append the new rows to the tables.
    tick = time.time()
    self.__nodes.append_columns(
        flags=new_flags,
        population=new_nodes['population'],
        time=new_nodes['generation'])
    self.__edges.append_columns(
        left=new_edges['left'],
        right=new_edges['right'],
        parent=new_edges['parent'],
        children=new_edges['child'],
        children_length=[1] * len(new_edges))
    tock = time.time()
    self.__time_appending += (tock - tick)

    # Phase 3: sort.
    tick = time.time()
    msprime.sort_tables(nodes=self.__nodes, edgesets=self.__edges)
    tock = time.time()
    self.__time_sorting += (tock - tick)

    # Phase 4: simplify with respect to the tracker's samples.
    tick = time.time()
    msprime.simplify_tables(
        samples=samples.tolist(),
        nodes=self.__nodes,
        edgesets=self.__edges)
    tock = time.time()
    self.__time_simplifying += (tock - tick)
    return (True, self.__nodes.num_rows)
def simplify(self, generation, tracker):
    """
    Append the tracker's new nodes and edges to the internal tables, then
    sort and simplify them.

    :param generation: current generation; its distance from
        ``self.last_gc_time`` is added to the times of existing nodes.
    :param tracker: object providing ``nodes``, ``edges``, ``samples`` and
        ``convert_time()``.
    :return: length of the simplified node table, which is the next
        offspring ID to use
    """
    # Shift the times of the nodes we already hold.
    elapsed = generation - self.last_gc_time
    self.nodes.set_columns(
        flags=self.nodes.flags,
        population=self.nodes.population,
        time=self.nodes.time + elapsed)

    # One flag with value 1 per new node, built as an array rather than a
    # Python list.
    new_flags = np.ones([len(tracker.nodes)], dtype=np.uint32)

    # Convert time from forwards to backwards
    tracker.convert_time()

    # Append the tracker's records to the internal tables.
    new_nodes = tracker.nodes
    self.nodes.append_columns(
        flags=new_flags,
        population=new_nodes['population'],
        time=new_nodes['generation'])
    new_edges = tracker.edges
    self.edges.append_columns(
        left=new_edges['left'],
        right=new_edges['right'],
        parent=new_edges['parent'],
        children=new_edges['child'],
        children_length=[1] * len(tracker.edges))

    # Sort and simplify
    msprime.sort_tables(nodes=self.nodes, edgesets=self.edges)
    msprime.simplify_tables(
        samples=tracker.samples.tolist(),
        nodes=self.nodes,
        edgesets=self.edges)

    return self.nodes.num_rows
def test_non_overlapping_generations(self):
    """
    Simulate with survival=0.0 and check the shape of every simplified tree:
    one root, leaves equal to the samples, no unary internal nodes.
    """
    tables = wf_sim(N=10, ngens=10, survival=0.0, seed=self.random_seed)
    for table_name in ("nodes", "edges"):
        self.assertGreater(getattr(tables, table_name).num_rows, 0)
    for table_name in ("sites", "mutations", "migrations"):
        self.assertEqual(getattr(tables, table_name).num_rows, 0)

    node_table, edge_table = tables.nodes, tables.edges
    msprime.sort_tables(nodes=node_table, edges=edge_table)
    is_sample = node_table.flags == msprime.NODE_IS_SAMPLE
    sample_ids = np.where(is_sample)[0].astype(np.int32)
    msprime.simplify_tables(
        samples=sample_ids, nodes=node_table, edges=edge_table)
    ts = msprime.load_tables(nodes=node_table, edges=edge_table)

    # All trees should have exactly one root and the leaves should be the
    # samples, and all internal nodes should have arity > 1
    for tree in ts.trees():
        self.assertEqual(tree.num_roots, 1)
        self.assertEqual(set(tree.leaves(tree.root)), set(ts.samples()))
        for node in tree.nodes():
            if not tree.is_internal(node):
                continue
            self.assertGreater(len(tree.children(node)), 1)
def wright_fisher(N, T, simplify_interval=1):
    """
    An implementation of algorithm W with periodic simplification, used to
    measure how the number of edges in the tree sequence grows over time.

    The genome length is fixed at L = 1 and the probability of death is
    taken to be delta = 1, so the whole population is replaced each
    generation.

    :param N: number of individuals per generation.
    :param T: number of generations to simulate.
    :param simplify_interval: the tables are sorted and simplified whenever
        the current time ``t`` is a multiple of this value.
    :return: tuple ``(ts, S)`` of the loaded tree sequence and an array
        holding the edge table length recorded after each generation.
    """
    L = 1
    edges = msprime.EdgeTable()
    nodes = msprime.NodeTable()
    for _ in range(N):
        nodes.add_row(time=T, flags=1)
    parents = list(range(N))
    edge_counts = np.zeros(T, dtype=int)

    # t runs T-1, T-2, ..., 0.
    for t in range(T - 1, -1, -1):
        offspring = []
        for _ in range(N):
            child = len(nodes)
            nodes.add_row(time=t, flags=1)
            offspring.append(child)
            # Two parents and a single breakpoint x per child.
            left_parent = random.randint(0, N - 1)
            right_parent = random.randint(0, N - 1)
            x = random.uniform(0, L)
            edges.add_row(0, x, parents[left_parent], child)
            edges.add_row(x, L, parents[right_parent], child)
        parents = offspring
        if t % simplify_interval == 0:
            msprime.sort_tables(nodes=nodes, edges=edges)
            msprime.simplify_tables(offspring, nodes, edges)
            # After simplification the population is addressed as 0..N-1.
            parents = list(range(N))
        edge_counts[T - t - 1] = len(edges)
    # We will always simplify at t = 0, so no need for special case at the end
    return msprime.load_tables(nodes=nodes, edges=edges), edge_counts
def test_many_generations_no_deep_history(self):
    """
    Simulate 100 generations without deep history and check that every
    simplified tree has a single root.
    """
    N = 10
    ngens = 100
    tables = wf_sim(
        N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    self.assertEqual(tables.nodes.num_rows, N * (ngens + 1))
    self.assertGreater(tables.edges.num_rows, 0)
    for table_name in ("sites", "mutations", "migrations"):
        self.assertEqual(getattr(tables, table_name).num_rows, 0)

    node_table, edge_table = tables.nodes, tables.edges
    is_sample = node_table.flags == msprime.NODE_IS_SAMPLE
    sample_ids = np.where(is_sample)[0].astype(np.int32)
    msprime.sort_tables(nodes=node_table, edges=edge_table)
    msprime.simplify_tables(
        samples=sample_ids, nodes=node_table, edges=edge_table)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)

    # We are assuming that everything has coalesced and we have
    # single-root trees
    ts = msprime.load_tables(nodes=node_table, edges=edge_table)
    for tree in ts.trees():
        self.assertEqual(tree.num_roots, 1)
def test4(self):
    """
    Build a four-node history with one site at position 0.4 carrying a
    mutation on node 3, simplify to samples 1 and 2, and check that the
    genotype matrix sums to zero.
    """
    self.n.set_columns(
        time=[1, 0, 0, 2], flags=[msprime.NODE_IS_SAMPLE] * 4)
    # Node 0 is parent of node 1 on [0, 0.4) and [0.6, 1.0), parent of
    # node 2 on [0, 1), and child of node 3 on [0, 0.4).
    self.e.add_row(parent=0, child=1, left=0, right=0.4)
    self.e.add_row(parent=0, child=1, left=0.6, right=1.0)
    self.e.add_row(parent=0, child=2, left=0, right=1)
    self.e.add_row(parent=3, child=0, left=0, right=0.4)
    self.s.add_row(position=0.4, ancestral_state='0')
    self.m.add_row(site=0, node=3, derived_state='1')

    table_kwargs = dict(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
    msprime.sort_tables(**table_kwargs)
    msprime.simplify_tables(samples=[1, 2], **table_kwargs)
    ts = msprime.load_tables(**table_kwargs)
    genotypes = ts.genotype_matrix()
    self.assertEqual(genotypes[0:].sum(), 0)
def test_simplify_tables(self):
    """
    For each simulated tree sequence, simplify copies of its tables down to
    random sample subsets of size 2, 5 and 10 (capped at the number of
    samples) and verify the result against the original.
    """
    seed = 71
    for ts in self.get_wf_sims(seed=seed):
        tables = ts.dump_tables()
        for nsamples in [2, 5, 10]:
            # Work on copies so each subset starts from the full tables.
            copies = dict(
                nodes=tables.nodes.copy(),
                edges=tables.edges.copy(),
                sites=tables.sites.copy(),
                mutations=tables.mutations.copy())
            subset_size = min(nsamples, ts.num_samples)
            sub_samples = random.sample(list(ts.samples()), subset_size)
            node_map = msprime.simplify_tables(
                samples=sub_samples, **copies)
            small_ts = msprime.load_tables(**copies)
            self.verify_simplify(ts, small_ts, sub_samples, node_map)
nt.append_columns(flags=flags, population=nodes['population'] + node_offset, time=nodes['generation']) es.append_columns(left=edges['left'], right=edges['right'], parent=edges['parent'] + node_offset, child=edges['child'] + node_offset) # Sort msprime.sort_tables(nodes=nt, edges=es) # Simplify: this is where the magic happens # PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence msprime.simplify_tables(samples=samples.tolist(), nodes=nt, edges=es) # Create a tree sequence x = msprime.load_tables(nodes=nt, edges=es) # Lets look at the MRCAS. # This is where things go badly: MRCAS = [t.get_time(t.get_root()) for t in x.trees()] print(MRCAS) # Throw down some mutations # onto a sample of size nsam # We'll copy tables here, # just to see what happens. # PLR: these .copy()s aren't doing anything: just overwritten before nt_s = nt.copy()
def writer():
    """Simplify the enclosing scope's ``tables`` to samples 0 and 1."""
    sample_ids = [0, 1]
    msprime.simplify_tables(
        sample_ids,
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations,
    )
def writer(thread_index, results):
    """
    Simplify the enclosing scope's ``tables`` to samples 0 and 1.

    :param thread_index: not used by this function.
    :param results: not used by this function.
    """
    sample_ids = [0, 1]
    msprime.simplify_tables(
        sample_ids,
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations,
    )
derived_state_length=np.ones(len(mutas['node_id']), np.uint32)) # Sort msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt) print("num total mutations: ", st.num_rows) # Simplify: this is where the magic happens ## PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence nt_c = nt.copy() es_c = es.copy() st_c = st.copy() mt_c = mt.copy() msprime.simplify_tables(samples=samples.tolist(), nodes=nt_c, edges=es_c, sites=st_c, mutations=mt_c) print("num simplified mutations: ", st_c.num_rows) # Create a tree sequence x = msprime.load_tables(nodes=nt_c, edges=es_c, sites=st_c, mutations=mt_c) print(max(mt_c.node)) print(nt_c.num_rows) nt_s = nt_c.copy() es_s = es_c.copy() st_s = st_c.copy() mt_s = mt_c.copy() nsam_samples = np.random.choice(2 * popsize, nsam, replace=False)
c.append(ci[0]) l.append(li[0]) r.append(ri[0]) edges.set_columns(parent=p,child=c,left=l,right=r) N=int(sys.argv[3]) #samples=[i for i in range(len(times)-2*N,len(times))] samples=[i for i in range(0,len(times),132)] ts=None A=time.time() msprime.sort_tables(nodes=nodes,edges=edges) B=time.time() ts=msprime.simplify_tables(nodes=nodes,edges=edges,samples=samples) C=time.time() print("Sorting: ",B-A,"seconds") print("Simplifying: ",C-B,"seconds") with open(sys.argv[4],'w') as f: for i in edges: f.write("{} {} {:.6f} {:.6f}\n".format(i.parent,i.child,i.left,i.right,nodes[i.parent].time)) with open(sys.argv[5],'w') as f: for i in nodes: f.write("{}\n".format(i.time))
tracker = MockAncestryTracker() recrate = args.rho / float(4 * args.popsize) samples = wf(args.popsize, simplifier, tracker, recrate, SIMLEN * args.popsize) if len(tracker.nodes) > 0: # Then there's stuff that didn't get GC'd simplifier.simplify(SIMLEN * args.popsize, tracker) # Local names for convenience. # I copy the tables here, too, # because I think that will be # done in practice: you will # often want to simplify and # ARG down to a smaller sample # but still have the complete # history of the pop'n. nodes = simplifier.nodes.copy() edges = simplifier.edges.copy() nsam_samples = np.random.choice(2 * args.popsize, args.nsam, replace=False) msprime.simplify_tables(samples=nsam_samples.tolist(), nodes=nodes, edges=edges) msp_rng = msprime.RandomGenerator(args.seed) mutations = msprime.MutationTable() sites = msprime.SiteTable() mutgen = msprime.MutationGenerator(msp_rng, args.theta / float(4 * args.popsize)) mutgen.generate(nodes, edges, sites, mutations) print(sites.num_rows)
# Use fwdpy11 wf.evolve(rng, pop, params) # Get a sample s = fwdpy11.sampling.sample_separate(rng, pop, args.nsam) else: # Use this module simplifier, atracker, tsim = evolve_track( rng, pop, params, args.gc, True, args.seed, args.async, args.queue, args.qsize, args.wthreads) # Take times from simplifier before they change. times = simplifier.times times['fwd_sim_runtime'] = [tsim] times['N'] = [args.popsize] times['theta'] = [args.theta] times['rho'] = [args.rho] times['simplify_interval'] = [args.gc] d = pd.DataFrame(times) d.to_csv(args.outfile1, sep='\t', index=False, compression='gzip') # Simplify the genealogy down to a sample, # And throw mutations onto that sample msprime.simplify_tables(np.random.choice(2 * args.popsize, args.nsam, replace=False).tolist(), nodes=simplifier.nodes, edges=simplifier.edges) msp_rng = msprime.RandomGenerator(args.seed) sites = msprime.SiteTable() mutations = msprime.MutationTable() mutgen = msprime.MutationGenerator( msp_rng, args.theta / float(4 * args.popsize)) mutgen.generate(simplifier.nodes, simplifier.edges, sites, mutations)
def simplify(self, generation, ancestry):
    """
    Merge the ancestry recorded since the last call into the internal
    tables and simplify them.

    The new edges are reversed, appended one parent-time slice at a time
    with each slice sorted on its own, and the previously stored edges are
    appended after them before simplifying. CPU time spent in each phase
    is added to the matching ``__time_*`` counter.

    The ancestry object is held via ``acquire()`` for the whole merge and
    is released even when a step raises.

    :param generation: current generation; its distance from
        ``self.last_gc_time`` is added to the times of existing nodes.
    :param ancestry: object providing ``nodes``, ``edges``, ``samples``,
        ``acquire()`` and ``release()``.
    :return: tuple ``(True, n)`` where ``n`` is the node table's row count
        after simplification.
    """
    # update node times:
    if self.__nodes.num_rows > 0:
        tc = self.__nodes.time
        dt = float(generation) - self.last_gc_time
        tc += dt
        self.last_gc_time = generation
        flags = np.ones(self.__nodes.num_rows, dtype=np.uint32)
        self.__nodes.set_columns(
            flags=flags, population=self.__nodes.population, time=tc)

    before = time.process_time()
    # Acquire mutex
    ancestry.acquire()
    try:
        self.reverse_time(ancestry.nodes)
        na = np.array(ancestry.nodes, copy=False)
        ea = np.array(ancestry.edges, copy=False)
        new_min_id = na['id'][0]
        new_max_id = na['id'][-1]
        # Difference between the first new node id and the current length
        # of the node table; when non-zero, edges and samples are passed
        # to update_indexes along with that difference.
        delta = new_min_id - len(self.__nodes)
        if delta != 0:
            self.update_indexes(ancestry.edges, ancestry.samples, delta,
                                new_min_id, new_max_id)
        samples = np.array(ancestry.samples, copy=False)
        flags = np.ones(len(na), dtype=np.uint32)
        self.__time_prepping += time.process_time() - before

        before = time.process_time()
        self.__nodes.append_columns(flags=flags,
                                    population=na['population'],
                                    time=na['generation'])
        # Copy the already sorted edges to local arrays
        left = self.__edges.left[:]
        right = self.__edges.right[:]
        parent = self.__edges.parent[:]
        child = self.__edges.child[:]
        # Get the new edges and reverse them. After this, we know that all
        # edges are correctly sorted with respect to time. We then sort
        # each time slice individually, reducing the overall cost of the
        # sort.
        new_left = ea['left'][::-1]
        new_right = ea['right'][::-1]
        new_parent = ea['parent'][::-1]
        new_child = ea['child'][::-1]
        parent_time = self.__nodes.time[new_parent]
        # Indexes at which the parent time changes between adjacent edges.
        breakpoints = np.where(parent_time[1:] != parent_time[:-1])[0] + 1
        self.__edges.reset()
        self.__time_appending += time.process_time() - before

        before = time.process_time()
        start = 0
        # The final slice must run to the end of the new edges. Using -1
        # as the last end point would exclude the final edge, because a
        # slice [start:-1] stops one element short.
        for end in itertools.chain(breakpoints, [len(new_parent)]):
            assert np.all(parent_time[start:end] == parent_time[start])
            self.__edges.append_columns(left=new_left[start:end],
                                        right=new_right[start:end],
                                        parent=new_parent[start:end],
                                        child=new_child[start:end])
            msprime.sort_tables(nodes=self.__nodes,
                                edges=self.__edges,
                                edge_start=start)
            start = end
        self.__time_sorting += time.process_time() - before

        # Append the old sorted edges to the table.
        self.__edges.append_columns(left=left,
                                    right=right,
                                    parent=parent,
                                    child=child)
        before = time.process_time()
        msprime.simplify_tables(samples=samples.tolist(),
                                nodes=self.__nodes,
                                edges=self.__edges)
    finally:
        # Release any locks on the ancestry object
        ancestry.release()
    self.__last_edge_start = len(self.__edges)
    self.__time_simplifying += time.process_time() - before
    self.__process = True
    return (True, self.__nodes.num_rows)
"""
Run a tracked forward simulation, simplify its tables to a random sample of
10 nodes, generate mutations on them, and print the number of sites.

Command line arguments, in order: N, rho, theta, gc_interval, seed.
"""
import fwdpy11_arg_example.evolve_arg as ea
import msprime
import numpy as np
import sys

N = int(sys.argv[1])
rho = float(sys.argv[2])
theta = float(sys.argv[3])
gc_interval = int(sys.argv[4])
seed = int(sys.argv[5])

simplifier, atracker, tsim = ea.evolve_track_wrapper(
    popsize=N,
    rho=rho,
    seed=seed,
    gc_interval=gc_interval,
    mu=0.0,
)
print(tsim, simplifier.times)

np.random.seed(seed)
# Get a sample of size n = 10
sample_ids = np.random.choice(2 * N, 10, replace=False).tolist()
msprime.simplify_tables(
    sample_ids,
    nodes=simplifier.nodes,
    edgesets=simplifier.edgesets,
)

msp_rng = msprime.RandomGenerator(seed)
sites = msprime.SiteTable()
mutations = msprime.MutationTable()
mutation_rate = theta / float(4 * N)
mutgen = msprime.MutationGenerator(msp_rng, mutation_rate)  # rho = theta
mutgen.generate(simplifier.nodes, simplifier.edgesets, sites, mutations)
print(sites.num_rows)
def run_simplify_num_edges_benchmark(args):
    """
    Time ``msprime.simplify_tables`` on growing suffixes of the edge table
    of the tree sequence stored in ``args.file``.

    For each sample size in 10, 100 and 1000, the edge table is cut at
    successively earlier start offsets, the suffix is simplified, and the
    CPU time is recorded. Results are written to
    ``data/simplify_num_edges.dat`` and plotted to
    ``simplify_num_edges.png``.

    :param args: object whose ``file`` attribute is the path to load.
    """
    ts = msprime.load(args.file)
    np.random.seed(1)
    print("num_nodes = ", ts.num_nodes)
    print("num_edges = ", ts.num_edges)
    num_slices = 10

    tables = ts.dump_tables()
    node_time = tables.nodes.time
    edge_table = tables.edges
    left = edge_table.left
    right = edge_table.right
    parent = edge_table.parent
    child = edge_table.child
    size = sum(column.nbytes for column in (left, right, parent, child))
    print("Total edge size = ", size / 1024**3, "GiB")

    sample_sizes = [10, 100, 1000]
    num_results = num_slices * len(sample_sizes)
    num_edges = np.zeros(num_results)
    simplify_time = np.zeros(num_results)
    sample_size = np.zeros(num_results)
    slice_size = ts.num_edges // num_slices

    j = 0
    for N in sample_sizes:
        for start in range(ts.num_edges - slice_size, 0, -slice_size):
            # Largest child id in the suffix; the N ids just below it are
            # used as the samples.
            max_node = np.max(child[start:])
            samples = np.arange(max_node - N, max_node, dtype=np.int32)
            subset_nodes = msprime.NodeTable()
            subset_nodes.set_columns(
                time=node_time[:max_node + 1],
                flags=np.ones(max_node + 1, dtype=np.uint32))
            subset_edges = msprime.EdgeTable()
            subset_edges.set_columns(
                left=left[start:],
                right=right[start:],
                parent=parent[start:],
                child=child[start:])

            before = time.process_time()
            msprime.simplify_tables(
                samples=samples, nodes=subset_nodes, edges=subset_edges)
            duration = time.process_time() - before

            num_edges[j] = ts.num_edges - start
            simplify_time[j] = duration
            sample_size[j] = N
            print(
                N, num_edges[j], duration, num_edges[j] / duration,
                "per second")
            j += 1

    df = pd.DataFrame({
        "sample_size": sample_size,
        "num_edges": num_edges,
        "time": simplify_time
    })
    df.to_csv("data/simplify_num_edges.dat")

    # One curve per sample size.
    for N in sample_sizes:
        selected = sample_size == N
        plt.plot(num_edges[selected], simplify_time[selected], marker="o")
    plt.xlabel("num edges")
    plt.ylabel("Time to simplify (s)")
    plt.savefig("simplify_num_edges.png")