def finalise(self, simplify=True, stabilise_node_ordering=False):
    """
    Build the final tree sequence, optionally simplifying it.

    :param bool simplify: If True, simplify the tree sequence with respect
        to ``self.sample_ids`` before returning it. If False the tree
        sequence from ``get_tree_sequence(all_sites=True)`` is returned
        untouched.
    :param bool stabilise_node_ordering: Only consulted when ``simplify`` is
        True. If True, node times are perturbed before simplifying so that
        nodes sharing an integer time get distinct times.
    :return: The resulting tree sequence.
    """
    logger.info("Finalising tree sequence")
    ts = self.get_tree_sequence(all_sites=True)
    if not simplify:
        return ts

    logger.info("Running simplify on {} nodes and {} edges".format(
        ts.num_nodes, ts.num_edges))
    if stabilise_node_ordering:
        # Make all node times distinct so that nodes keep stable IDs after
        # simplifying (used when comparing tree sequences produced by
        # perfect inference). Reversing the IDs within a time slice might
        # be an alternative.
        tables = ts.tables
        node_time = tables.nodes.time
        oldest = int(node_time[0])
        for epoch in range(1, oldest):
            members = np.where(node_time == epoch)[0]
            count = members.shape[0]
            # Offsets (count-1)/count, ..., 1/count, 0 keep times in [t, t+1).
            offsets = np.arange(count)[::-1] / count
            node_time[members] += offsets
        tables.nodes.set_columns(flags=tables.nodes.flags, time=node_time)
        msprime.sort_tables(**tables.asdict())
        ts = msprime.load_tables(**tables.asdict())

    ts = ts.simplify(
        samples=self.sample_ids, filter_zero_mutation_sites=False)
    logger.info(
        "Finished simplify; now have {} nodes and {} edges".format(
            ts.num_nodes, ts.num_edges))
    return ts
def wright_fisher(N, delta, L, T):
    """
    Direct implementation of Algorithm W.

    :param int N: Number of individuals in the population.
    :param float delta: Per-generation probability that an individual is
        replaced by a newly born one.
    :param float L: Genome length; breakpoints are drawn uniformly on [0, L].
    :param T: Time assigned to the initial population; the simulation steps
        the time down by one until it is no longer positive.
    :return: The tree sequence loaded from the sorted node and edge tables.
    """
    edges = msprime.EdgeTable()
    # birth_time[u] is the time at which node u was born.
    birth_time = [T] * N
    population = list(range(N))
    next_node = N
    t = T
    while t > 0:
        t -= 1
        replacement = list(population)
        for slot in range(N):
            if random.random() < delta:
                replacement[slot] = next_node
                birth_time.append(t)
                # Two parents and one breakpoint per birth.
                left_parent = random.randint(0, N - 1)
                right_parent = random.randint(0, N - 1)
                breakpoint = random.uniform(0, L)
                edges.add_row(0, breakpoint, population[left_parent], next_node)
                edges.add_row(breakpoint, L, population[right_parent], next_node)
                next_node += 1
        population = replacement

    nodes = msprime.NodeTable()
    alive = set(population)
    for u in range(next_node):
        # Only individuals alive at the end are flagged (flags=1).
        nodes.add_row(time=birth_time[u], flags=int(u in alive))
    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
def test_one_generation_no_deep_history(self):
    """
    One generation without deep history: check the raw table sizes, then
    check that after simplifying the roots of every tree partition the
    samples.
    """
    N = 20
    tables = wf_sim(N=N, ngens=1, deep_history=False, seed=self.random_seed)
    self.assertEqual(tables.nodes.num_rows, 2 * N)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 0)
    self.assertEqual(tables.mutations.num_rows, 0)
    self.assertEqual(tables.migrations.num_rows, 0)

    nodes, edges = tables.nodes, tables.edges
    is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(is_sample)[0].astype(np.int32)
    msprime.sort_tables(nodes=nodes, edges=edges)
    msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)

    ts = msprime.load_tables(nodes=nodes, edges=edges)
    for tree in ts.trees():
        covered = set()
        for root in tree.roots:
            below_root = set(tree.samples(root))
            # No sample may sit below two different roots.
            self.assertEqual(len(below_root & covered), 0)
            covered |= below_root
        # Together the roots must account for every sample.
        self.assertEqual(covered, set(ts.samples()))
def single_childify(ts):
    """
    Builds a new equivalent tree sequence which contains an extra node in the
    middle of all existing branches.

    Every edge ``(left, right, parent, child)`` of ``ts`` is replaced by two
    edges over the same interval, ``child -> u`` and ``u -> parent``, where
    ``u`` is a newly added node whose time is half way between the times of
    the child and the parent.

    :param ts: The tree sequence to transform.
    :return: A new tree sequence built from the modified tables, with a
        provenance record for this operation appended.
    """
    tables = ts.dump_tables()
    edges = tables.edges
    nodes = tables.nodes
    sites = tables.sites
    mutations = tables.mutations

    # Snapshot the times before new nodes are appended to the table.
    time = nodes.time[:]
    edges.reset()
    for edge in ts.edges():
        # Insert a new node in between the parent and child.
        u = len(nodes)
        t = time[edge.child] + (time[edge.parent] - time[edge.child]) / 2
        nodes.add_row(time=t)
        edges.add_row(
            left=edge.left, right=edge.right, parent=u, child=edge.child)
        edges.add_row(
            left=edge.left, right=edge.right, parent=edge.parent, child=u)
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    # Record this function's own name; the previous label
    # ("insert_redundant_breakpoints") named a different operation.
    add_provenance(tables.provenances, "single_childify")
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
def test_with_recurrent_mutations(self):
    """
    Throw Jukes-Cantor mutations onto a WF simulation and check that the
    single site and its mutations survive simplification.
    """
    # actually with only ONE site, at 0.0
    N = 10
    ngens = 100
    raw = wf_sim(N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    msprime.sort_tables(**raw.asdict())
    ts = msprime.load_tables(**raw.asdict())
    ts = tsutil.jukes_cantor(ts, 1, 10, seed=self.random_seed)
    tables = ts.tables
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    sample_mask = tables.nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(sample_mask)[0].astype(np.int32)

    # before simplify
    for haplotype in ts.haplotypes():
        self.assertEqual(len(haplotype), 1)

    # after simplify
    msprime.simplify_tables(
        samples=samples,
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 1)
    self.assertGreater(tables.mutations.num_rows, 0)
    ts = msprime.load_tables(**tables.asdict())
    self.assertEqual(ts.sample_size, N)
    num_sites = ts.num_sites
    for haplotype in ts.haplotypes():
        self.assertEqual(len(haplotype), num_sites)
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Build a tree sequence from the contents of a version 3 legacy HDF5 file.

    Each legacy coalescence record (left, right, node, children...) is
    expanded into one edge per child.

    :param root: The opened HDF5 root group; must contain a ``trees`` group
        and may contain ``mutations`` and ``provenance``.
    :param remove_duplicate_positions: Passed through unchanged to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence loaded from the sorted tables.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    # Nodes with IDs below the smallest record parent are marked as samples.
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    # Repeat each record's left/right/parent once per child. This replaces a
    # Python-level double loop; np.repeat needs a signed index-sized count.
    repeats = children_length.astype(np.intp)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(
        np.float64, copy=False)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(
        np.float64, copy=False)
    parent = np.repeat(record_node, repeats)

    nodes = msprime.NodeTable()
    nodes.set_columns(
        flags=flags,
        time=nodes_group["time"],
        population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(
        left=left, right=right, parent=parent, child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(
            root["mutations"], sites, mutations, remove_duplicate_positions)
    # Legacy provenance entries carry no timestamp, so use the minimum one.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
def test_overlapping_generations(self):
    """
    With survival 0.85 over 10 generations, every tree of the simplified
    tree sequence has exactly one root.
    """
    tables = wf_sim(N=30, ngens=10, survival=0.85, seed=self.random_seed)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 0)
    self.assertEqual(tables.mutations.num_rows, 0)
    self.assertEqual(tables.migrations.num_rows, 0)

    nodes, edges = tables.nodes, tables.edges
    msprime.sort_tables(nodes=nodes, edges=edges)
    sample_mask = nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(sample_mask)[0].astype(np.int32)
    msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
    simplified = msprime.load_tables(nodes=nodes, edges=edges)
    for tree in simplified.trees():
        self.assertEqual(tree.num_roots, 1)
def test_ts_with_root_mutations(self):
    """
    Round-trip a tree sequence that has an extra mutation over the root at
    the left end of each tree (where no site already exists).
    """
    ts = self.get_example_ts(5, 3)
    tables = ts.dump_tables()
    positions = set(site.position for site in ts.sites())
    for tree in ts.trees():
        left = tree.interval[0]
        if left in positions:
            continue
        new_site = tables.sites.add_row(position=left, ancestral_state="0")
        tables.mutations.add_row(
            site=new_site, node=tree.root, derived_state="1")
        positions.add(left)
    # At least one new site must have been added.
    self.assertGreater(len(positions), ts.num_sites)
    msprime.sort_tables(**tables.asdict())
    ts = msprime.load_tables(**tables.asdict())

    input_file = formats.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length)
    self.verify_data_round_trip(ts, input_file)
def test_ts_with_invariant_sites(self):
    """
    Round-trip a tree sequence that has extra sites without mutations at
    positions 1, 1/2, ..., 1/10 (where no site already exists).
    """
    ts = self.get_example_ts(5, 3)
    tables = ts.dump_tables()
    positions = set(site.position for site in ts.sites())
    for denominator in range(1, 11):
        candidate = 1 / denominator
        if candidate in positions:
            continue
        tables.sites.add_row(position=candidate, ancestral_state="0")
        positions.add(candidate)
    # At least one new site must have been added.
    self.assertGreater(len(positions), ts.num_sites)
    msprime.sort_tables(**tables.asdict())
    ts = msprime.load_tables(**tables.asdict())

    input_file = formats.SampleData.initialise(
        num_samples=ts.num_samples, sequence_length=ts.sequence_length)
    self.verify_data_round_trip(ts, input_file)
    self.assertGreater(len(str(input_file)), 0)
def get_wf_sims(self, seed):
    """
    Yield example tree sequences produced by the WF simulator.

    One tree sequence is generated, verified with ``verify_simulation`` and
    yielded for every combination of population size, survival probability,
    mutation rate and number of sites listed below.

    :param seed: Seed passed to both ``wf_sim`` and ``tsutil.jukes_cantor``.
    """
    population_sizes = (5, 10, 20)
    survival_probs = (0.0, 0.5, 0.9)
    mutation_rates = (0.01, 1.0)
    site_counts = (1, 2, 3)
    for N in population_sizes:
        for survival in survival_probs:
            for mu in mutation_rates:
                for num_sites in site_counts:
                    tables = wf_sim(
                        N=N, ngens=N, survival=survival, seed=seed)
                    msprime.sort_tables(**tables.asdict())
                    bare_ts = msprime.load_tables(**tables.asdict())
                    mutated_ts = tsutil.jukes_cantor(
                        bare_ts, num_sites=num_sites, mu=mu, seed=seed)
                    self.verify_simulation(mutated_ts, ngens=N)
                    yield mutated_ts
def simplify(self, generation, ancestry):
    """
    Append the data held by ``ancestry`` to the node and edgeset tables,
    then sort and simplify them, accumulating the wall-clock time spent in
    each phase.

    :param generation: Current generation; existing node times are shifted
        by the difference to ``self.last_gc_time``.
    :param ancestry: Object providing ``prep_for_gc()``, ``nodes``,
        ``edges`` and ``samples``.
    :return: The tuple ``(True, number of rows in the node table)``.
    """
    # update node times:
    if self.__nodes.num_rows > 0:
        node_times = self.__nodes.time
        node_times += float(generation) - self.last_gc_time
        self.last_gc_time = generation
        self.__nodes.set_columns(
            flags=np.ones([self.__nodes.num_rows], dtype=np.uint32),
            population=self.__nodes.population,
            time=node_times)

    # Phase 1: let the ancestry object prepare, then view its buffers.
    began = time.time()
    ancestry.prep_for_gc()
    new_nodes = np.array(ancestry.nodes, copy=False)
    new_edges = np.array(ancestry.edges, copy=False)
    samples = np.array(ancestry.samples, copy=False)
    new_flags = np.ones([len(new_nodes)], dtype=np.uint32)
    finished = time.time()
    self.__time_prepping += (finished - began)

    # Phase 2: append the new rows to the tables.
    began = time.time()
    self.__nodes.append_columns(
        flags=new_flags,
        population=new_nodes['population'],
        time=new_nodes['generation'])
    self.__edges.append_columns(
        left=new_edges['left'],
        right=new_edges['right'],
        parent=new_edges['parent'],
        children=new_edges['child'],
        children_length=[1] * len(new_edges))
    finished = time.time()
    self.__time_appending += (finished - began)

    # Phase 3: sort.
    began = time.time()
    msprime.sort_tables(nodes=self.__nodes, edgesets=self.__edges)
    finished = time.time()
    self.__time_sorting += (finished - began)

    # Phase 4: simplify.
    began = time.time()
    msprime.simplify_tables(
        samples=samples.tolist(), nodes=self.__nodes, edgesets=self.__edges)
    finished = time.time()
    self.__time_simplifying += (finished - began)
    return (True, self.__nodes.num_rows)
def test4(self):
    """
    A mutation placed on node 3 must not show up in the genotypes of
    samples 1 and 2 after simplifying: the genotype matrix sums to zero.
    """
    self.n.set_columns(
        time=[1, 0, 0, 2], flags=[msprime.NODE_IS_SAMPLE] * 4)
    edge_rows = (
        dict(parent=0, child=1, left=0, right=0.4),
        dict(parent=0, child=1, left=0.6, right=1.0),
        dict(parent=0, child=2, left=0, right=1),
        dict(parent=3, child=0, left=0, right=0.4),
    )
    for row in edge_rows:
        self.e.add_row(**row)
    self.s.add_row(position=0.4, ancestral_state='0')
    self.m.add_row(site=0, node=3, derived_state='1')

    msprime.sort_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
    msprime.simplify_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m,
        samples=[1, 2])
    ts = msprime.load_tables(
        nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
    genotypes = ts.genotype_matrix()
    self.assertEqual(genotypes[0:].sum(), 0)
def permute_nodes(ts, node_map):
    """
    Return a copy of ``ts`` in which node ``u`` has been relabelled as
    ``node_map[u]``.

    Sites are copied with their position and ancestral state, and mutations
    with their site, derived state and (relabelled) node. A "permute_nodes"
    provenance record is appended.

    :param ts: The tree sequence to copy.
    :param node_map: Indexable mapping from node IDs in ``ts`` to node IDs
        in the returned tree sequence.
    :return: The relabelled tree sequence.
    """
    # inverse[new_id] is the ID that node had in the original tree sequence.
    inverse = [0] * len(node_map)
    for old_id in range(ts.num_nodes):
        inverse[node_map[old_id]] = old_id

    originals = list(ts.nodes())
    new_nodes = msprime.NodeTable()
    for new_id in range(ts.num_nodes):
        source = originals[inverse[new_id]]
        new_nodes.add_row(
            flags=source.flags, metadata=source.metadata,
            population=source.population, time=source.time)

    new_edges = msprime.EdgeTable()
    for e in ts.edges():
        new_edges.add_row(
            left=e.left, right=e.right,
            parent=node_map[e.parent], child=node_map[e.child])

    new_sites = msprime.SiteTable()
    new_mutations = msprime.MutationTable()
    for s in ts.sites():
        new_sites.add_row(
            position=s.position, ancestral_state=s.ancestral_state)
        for m in s.mutations:
            new_mutations.add_row(
                site=s.id, derived_state=m.derived_state,
                node=node_map[m.node])

    msprime.sort_tables(
        nodes=new_nodes, edges=new_edges, sites=new_sites,
        mutations=new_mutations)
    provenances = ts.dump_tables().provenances
    add_provenance(provenances, "permute_nodes")
    return msprime.load_tables(
        nodes=new_nodes, edges=new_edges, sites=new_sites,
        mutations=new_mutations, provenances=provenances)
def simplify(self, generation, tracker):
    """
    Take the new data held by ``tracker``, append it to the node and
    edgeset tables, then sort and simplify.

    :param generation: Current generation; existing node times are shifted
        by ``generation - self.last_gc_time``.
    :param tracker: Object providing ``nodes``, ``edges``, ``samples`` and
        ``convert_time()``.
    :return: Length of the simplified node table, which is the next ID to
        use for offspring.
    """
    # Shift the times of the nodes already in the table.
    elapsed = generation - self.last_gc_time
    shifted_time = self.nodes.time + elapsed
    self.nodes.set_columns(
        flags=self.nodes.flags,
        population=self.nodes.population,
        time=shifted_time)

    # All new nodes get flags == 1; built as an array rather than a list.
    new_flags = np.ones([len(tracker.nodes)], dtype=np.uint32)

    # Convert time from forwards to backwards.
    tracker.convert_time()

    # Append the tracker's rows to the internal tables.
    new_nodes = tracker.nodes
    self.nodes.append_columns(
        flags=new_flags,
        population=new_nodes['population'],
        time=new_nodes['generation'])
    new_edges = tracker.edges
    self.edges.append_columns(
        left=new_edges['left'],
        right=new_edges['right'],
        parent=new_edges['parent'],
        children=new_edges['child'],
        children_length=[1] * len(new_edges))

    # Sort and simplify.
    msprime.sort_tables(nodes=self.nodes, edgesets=self.edges)
    msprime.simplify_tables(
        samples=tracker.samples.tolist(),
        nodes=self.nodes, edgesets=self.edges)

    return self.nodes.num_rows
def test_non_overlapping_generations(self):
    """
    With survival 0.0, every tree of the simplified tree sequence has one
    root, its leaves are exactly the samples, and no internal node has a
    single child.
    """
    tables = wf_sim(N=10, ngens=10, survival=0.0, seed=self.random_seed)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 0)
    self.assertEqual(tables.mutations.num_rows, 0)
    self.assertEqual(tables.migrations.num_rows, 0)

    nodes, edges = tables.nodes, tables.edges
    msprime.sort_tables(nodes=nodes, edges=edges)
    sample_mask = nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(sample_mask)[0].astype(np.int32)
    msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
    ts = msprime.load_tables(nodes=nodes, edges=edges)

    for tree in ts.trees():
        self.assertEqual(tree.num_roots, 1)
        tips = set(tree.leaves(tree.root))
        self.assertEqual(tips, set(ts.samples()))
        for node in tree.nodes():
            if not tree.is_internal(node):
                continue
            # Internal nodes must have arity > 1.
            self.assertGreater(len(tree.children(node)), 1)
def wright_fisher(N, T, simplify_interval=1):
    """
    An implementation of algorithm W in which the tables are simplified as
    the simulation runs, in order to measure the number of edges in the tree
    sequence representing the history as a function of time.

    For simplicity the genome length is L = 1 and the probability of death
    is 1, so the whole population is replaced each generation.

    :param int N: Population size.
    :param int T: Number of generations.
    :param int simplify_interval: Simplify whenever the current time is a
        multiple of this value.
    :return: Tuple of the tree sequence loaded from the final tables and an
        array of length ``T`` holding the recorded edge counts.
    """
    L = 1
    edges = msprime.EdgeTable()
    nodes = msprime.NodeTable()
    parents = list(range(N))
    for _ in range(N):
        nodes.add_row(time=T, flags=1)
    S = np.zeros(T, dtype=int)
    t = T
    while t > 0:
        t -= 1
        offspring = list(parents)
        for slot in range(N):
            child = len(nodes)
            nodes.add_row(time=t, flags=1)
            offspring[slot] = child
            first = random.randint(0, N - 1)
            second = random.randint(0, N - 1)
            breakpoint = random.uniform(0, L)
            edges.add_row(0, breakpoint, parents[first], child)
            edges.add_row(breakpoint, L, parents[second], child)
        parents = offspring
        if t % simplify_interval == 0:
            msprime.sort_tables(nodes=nodes, edges=edges)
            msprime.simplify_tables(offspring, nodes, edges)
            # After simplifying, the current population is relabelled 0..N-1.
            parents = list(range(N))
        # NOTE(review): the edge count is recorded every generation here;
        # confirm it is not meant to be recorded only when simplifying.
        S[T - t - 1] = len(edges)
    # We will always simplify at t = 0, so no need for special case at the end
    return msprime.load_tables(nodes=nodes, edges=edges), S
def test_many_generations_no_deep_history(self):
    """
    100 generations without deep history: check the raw table sizes, then
    check that every tree has a single root after simplifying.
    """
    N = 10
    ngens = 100
    tables = wf_sim(N=N, ngens=ngens, deep_history=False, seed=self.random_seed)
    self.assertEqual(tables.nodes.num_rows, N * (ngens + 1))
    self.assertGreater(tables.edges.num_rows, 0)
    self.assertEqual(tables.sites.num_rows, 0)
    self.assertEqual(tables.mutations.num_rows, 0)
    self.assertEqual(tables.migrations.num_rows, 0)

    nodes, edges = tables.nodes, tables.edges
    sample_mask = nodes.flags == msprime.NODE_IS_SAMPLE
    samples = np.where(sample_mask)[0].astype(np.int32)
    msprime.sort_tables(nodes=nodes, edges=edges)
    msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
    self.assertGreater(tables.nodes.num_rows, 0)
    self.assertGreater(tables.edges.num_rows, 0)

    # This assumes that everything has coalesced, giving single-root trees.
    simplified = msprime.load_tables(nodes=nodes, edges=edges)
    for tree in simplified.trees():
        self.assertEqual(tree.num_roots, 1)
def insert_perfect_mutations(ts, delta=None):
    """
    Returns a copy of the specified tree sequence where the left and right
    coordinates of all edgesets are marked by mutations. This *should* be
    sufficient information to recover the tree sequence exactly.

    This has to be fudged slightly because we cannot have two sites with
    precisely the same coordinates. We work around this by having sites at
    some very small delta from the correct location.

    :param ts: The tree sequence to copy. Its existing sites and mutations
        are discarded.
    :param delta: Spacing between consecutive marker sites. If None, the
        spacing is recomputed for every tree as the tree's span divided by
        an upper bound on the number of sites placed in it.
    :return: The tree sequence loaded from the sorted, modified tables.
    :raises ValueError: If an edge diff has more than 4 outgoing edges, or
        has exactly 2 outgoing edges without the root being a new parent.
    """
    tables = ts.dump_tables()
    tables.sites.clear()
    tables.mutations.clear()

    # Per-node state of the current tree, updated from the edge diffs.
    num_children = np.zeros(ts.num_nodes, dtype=int)
    parent = np.zeros(ts.num_nodes, dtype=int) - 1

    current_delta = 0
    if delta is not None:
        current_delta = delta

    for (left, right), edges_out, edges_in in ts.edge_diffs():
        # Child counts as they were in the previous tree.
        last_num_children = list(num_children)
        children_in = set()
        children_out = set()
        parents_in = set()
        parents_out = set()
        for e in edges_out:
            # print("out:", e)
            parent[e.child] = -1
            num_children[e.parent] -= 1
            children_out.add(e.child)
            parents_out.add(e.parent)
        for e in edges_in:
            # print("in:", e)
            parent[e.child] = e.parent
            num_children[e.parent] += 1
            children_in.add(e.child)
            parents_in.add(e.parent)
        # Walk up from node 0 to find the root of the current tree.
        root = 0
        while parent[root] != -1:
            root = parent[root]
        # If we have more than 4 edges in the diff, or we have a 2 edge diff
        # that is not a root change this must be a multiple recombination.
        if len(edges_out) > 4 or (len(edges_out) == 2 and root not in parents_in):
            raise ValueError("Multiple recombination detected")
        # Mark the right-hand end of the outgoing edges, stepping leftwards
        # from the breakpoint.
        # We use the value of delta from the previous iteration
        x = left - current_delta
        for u in list(children_out - children_in) + list(children_in & children_out):
            # Only nodes that were internal in the previous tree are marked.
            if last_num_children[u] > 0:
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id, node=u, derived_state="1")
                x -= current_delta

        # Now update delta for this interval.
        if delta is None:
            max_nodes = 2 * (len(children_out) + len(children_in)) + len(parents_in) + 1
            current_delta = (right - left) / max_nodes
        # Mark the left-hand end of the incoming edges, stepping rightwards
        # from the breakpoint.
        x = left
        for c in list(children_in - children_out) + list(children_in & children_out):
            # Only nodes that are internal in the current tree are marked.
            if num_children[c] > 0:
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id, node=c, derived_state="1")
                x += current_delta

        # It seems wrong that we have to mark every parent, since a few of these
        # will already have been marked out by the children.
        for u in parents_in:
            # The root is not marked.
            if parent[u] != -1:
                # print("marking in parent", u, "at", x)
                site_id = tables.sites.add_row(position=x, ancestral_state="0")
                tables.mutations.add_row(site=site_id, node=u, derived_state="1")
                x += current_delta
    msprime.sort_tables(**tables.asdict())
    return msprime.load_tables(**tables.asdict())
flags=nt.flags, #[2 * popsize:], population=nt.population, #[2 * popsize:], time=nt.time + ngens + 1) node_offset = nt.num_rows nt.append_columns(flags=flags, population=nodes['population'] + node_offset, time=nodes['generation']) es.append_columns(left=edges['left'], right=edges['right'], parent=edges['parent'] + node_offset, child=edges['child'] + node_offset) # Sort msprime.sort_tables(nodes=nt, edges=es) # Simplify: this is where the magic happens # PLR: since these tables aren't valid, you gotta use simplify_tables, not load them into a tree sequence msprime.simplify_tables(samples=samples.tolist(), nodes=nt, edges=es) # Create a tree sequence x = msprime.load_tables(nodes=nt, edges=es) # Lets look at the MRCAS. # This is where things go badly: MRCAS = [t.get_time(t.get_root()) for t in x.trees()] print(MRCAS) # Throw down some mutations # onto a sample of size nsam
def wfrec(nsam, rho, nsites, theta):
    """
    Simulate a history of coalescence and recombination events for ``nsam``
    lineages and return it as a tree sequence.

    Each lineage is an ``IntervalTree`` of the segments it carries. Events
    are drawn until a single lineage remains: a coalescence merges two
    lineages under a new node, a recombination splits one lineage in two.

    :param int nsam: Number of initial lineages (sample nodes at time 0).
    :param float rho: Recombination parameter; the per-link rate is
        ``rho / (nsites - 1)``.
    :param int nsites: Number of sites; every lineage starts with the
        interval ``[0, nsites)``.
    :param theta: Not used by this function.
    :return: The tree sequence loaded from the sorted node and edge tables.
    """
    samples = []
    for i in range(nsam):
        samples.append(it.IntervalTree([it.Interval(0, nsites)]))
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases, so use ``int`` directly.
    links = np.array([sumIntervalTree(i) for i in samples], dtype=int)
    nlinks = links.sum()
    n = nsam
    rbp = rho / float(nsites - 1)
    t = 0.0

    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    nodes.set_columns(time=np.zeros(nsam),
                      flags=np.ones(nsam, dtype=np.uint32))
    # sample_indexes[k] is the node ID of the lineage stored in samples[k].
    sample_indexes = list(range(len(samples)))
    next_index = len(sample_indexes)

    while n > 1:
        rcoal = float(n * (n - 1))
        rrec = rbp * float(nlinks)
        iscoal = bool(np.random.random_sample(1)[0] < rcoal / (rcoal + rrec))
        t += np.random.exponential(4. / (rcoal + rrec), 1)[0]
        assert len(samples) == len(links), "sample/link error"
        if iscoal:
            chroms = np.sort(np.random.choice(n, 2, replace=False))
            c1 = chroms[0]
            c2 = chroms[1]
            nodes.add_row(time=t, flags=msprime.NODE_IS_SAMPLE)
            # NOTE(review): the edges for child c2 are recorded over the
            # intervals of c1, not those of c2 -- confirm this is intended.
            for i in samples[c1]:
                edges.add_row(left=i[0], right=i[1], parent=next_index,
                              child=sample_indexes[c1])
                edges.add_row(left=i[0], right=i[1], parent=next_index,
                              child=sample_indexes[c2])
            newchrom = it.IntervalTree()
            # Merge intervals of the two chromosomes
            # and remove overlaps
            for i in samples[c1]:
                newchrom.append(i)
            for i in samples[c2]:
                newchrom.append(i)
            newchrom.merge_overlaps()
            # Pop the larger index first so the smaller one stays valid.
            samples.pop(c2)
            samples.pop(c1)
            samples.append(newchrom)
            sample_indexes.pop(c2)
            sample_indexes.pop(c1)
            sample_indexes.append(next_index)
            next_index += 1
            n -= 1
        else:
            # Pick a chrom proportional to
            # its total size:
            chrom = np.random.choice(len(sample_indexes), 1,
                                     p=links / links.sum())[0]
            mnpos = min(
                [i for j in samples[chrom] for i in j if i is not None])
            mxpos = max(
                [i for j in samples[chrom] for i in j if i is not None])
            pos = np.random.randint(mnpos, mxpos)
            samples[chrom].chop(pos, pos)
            # Everything at or to the right of pos moves to a new lineage.
            tc = it.IntervalTree([i for i in samples[chrom] if i[0] >= pos])
            samples[chrom].remove_overlap(pos, nsites)
            samples.append(tc)
            # NOTE(review): a new lineage index is consumed here without a
            # matching row being added to ``nodes`` -- confirm.
            sample_indexes.append(next_index)
            next_index += 1
            n += 1
        assert all([len(i) > 0 for i in samples]), "empty IntervalTree"
        assert len(samples) == len(sample_indexes), \
            "sample/sample_index error"
        links = np.array([sumIntervalTree(i) for i in samples], dtype=int)
        nlinks = links.sum()
        assert len(samples) == len(links), "sample/link error 2"

    # Every edge must refer to nodes that exist in the node table.
    for i in range(len(edges)):
        assert edges[i].parent < len(nodes), "parent error"
        assert edges[i].child < len(nodes), "child error"

    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
def writer():
    """Sort the tables of the enclosing scope's ``tables`` object in place."""
    table_kwargs = tables.asdict()
    msprime.sort_tables(**table_kwargs)
# Site table: one site per entry of mutas['position'], each with a
# one-byte ancestral state of value 0.
st = msprime.SiteTable()
st.set_columns(position=mutas['position'],
               ancestral_state=np.zeros(len(mutas['position']), np.int8),
               ancestral_state_length=np.ones(len(mutas['position']), np.uint32))
# Mutation table: mutation i sits at site i, on node mutas['node_id'][i],
# with a one-byte derived state of value 1.
mt = msprime.MutationTable()
mt.set_columns(site=np.arange(len(mutas['node_id']), dtype=np.int32),
               node=mutas['node_id'],
               derived_state=np.ones(len(mutas['node_id']), np.int8),
               derived_state_length=np.ones(len(mutas['node_id']), np.uint32))
# Sort
msprime.sort_tables(nodes=nt, edges=es, sites=st, mutations=mt)
print("num total mutations: ", st.num_rows)
# Simplify: this is where the magic happens
# PLR: since these tables aren't valid, you have to use simplify_tables
# rather than loading them into a tree sequence.
# Work on copies so that the unsimplified tables stay available.
nt_c = nt.copy()
es_c = es.copy()
st_c = st.copy()
mt_c = mt.copy()
msprime.simplify_tables(samples=samples.tolist(), nodes=nt_c, edges=es_c, sites=st_c, mutations=mt_c)
print("num simplified mutations: ", st_c.num_rows)
# Create a tree sequence
ri=struct.unpack('d',f.read(8)) p.append(pi[0]) c.append(ci[0]) l.append(li[0]) r.append(ri[0]) edges.set_columns(parent=p,child=c,left=l,right=r) N=int(sys.argv[3]) #samples=[i for i in range(len(times)-2*N,len(times))] samples=[i for i in range(0,len(times),132)] ts=None A=time.time() msprime.sort_tables(nodes=nodes,edges=edges) B=time.time() ts=msprime.simplify_tables(nodes=nodes,edges=edges,samples=samples) C=time.time() print("Sorting: ",B-A,"seconds") print("Simplifying: ",C-B,"seconds") with open(sys.argv[4],'w') as f: for i in edges: f.write("{} {} {:.6f} {:.6f}\n".format(i.parent,i.child,i.left,i.right,nodes[i.parent].time)) with open(sys.argv[5],'w') as f: for i in nodes: f.write("{}\n".format(i.time))
# Practice with the simplify API import msprime n = msprime.NodeTable() sv = [True, True, True, True, True, True, True] tv = [0.0, 0.0, 0.0, 0.4, 0.5, 0.7, 1.0] pv = [0, 0, 0, 0, 0, 0, 0] n = msprime.NodeTable() n.set_columns(flags=sv, population=pv, time=tv) print(n) left = [0.2, 0.2, 0.0, 0.0, 0.2, 0.2, 0.8, 0.8, 0.8, 0.8, 0.0, 0.0] right = [0.8, 0.8, 0.2, 0.2, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] parent = [3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6] # children = [(0,2),(1,2),(1,3),(0,4),(0,4)] children = [0, 2, 1, 2, 1, 3, 1, 2, 0, 4, 0, 4] e = msprime.EdgesetTable() for l, r, p, c in zip(left, right, parent, children): e.add_row(left=l, right=r, parent=p, children=(c, )) print(e) msprime.sort_tables(nodes=n, edgesets=e) x = msprime.load_tables(nodes=n, edgesets=e) x = x.simplify(samples=[0, 1, 2]) x.dump_tables(nodes=n, edgesets=e) print(n) print(e) # make some fake nodes
def _load_legacy_hdf5_v2(root, remove_duplicate_positions):
    """
    Build a tree sequence from the contents of a version 2 legacy HDF5 file.

    Each coalescence record is turned into two edges, one per child, sharing
    the record's left, right and parent values.

    :param root: The opened HDF5 root group; must contain a ``trees`` group
        and may contain ``samples`` and ``mutations``.
    :param remove_duplicate_positions: Passed through unchanged to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence loaded from the sorted tables.
    """
    # Get the coalescence records
    trees_group = root["trees"]
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    provenances.add_row(
        timestamp=old_timestamp,
        record=_get_v2_provenance("generate_trees", trees_group.attrs))
    num_rows = trees_group["node"].shape[0]

    def doubled(column, dtype):
        # Each record value appears twice in a row: once for each child.
        out = np.zeros(2 * num_rows, dtype=dtype)
        out[0::2] = trees_group[column]
        out[1::2] = trees_group[column]
        return out

    parent = doubled("node", np.int32)
    left = doubled("left", np.float64)
    right = doubled("right", np.float64)
    child = np.array(trees_group["children"], dtype=np.int32).flatten()

    edges = msprime.EdgeTable()
    edges.set_columns(left=left, right=right, parent=parent, child=child)

    cr_node = np.array(trees_group["node"], dtype=np.int32)
    num_nodes = max(np.max(child), np.max(cr_node)) + 1
    # Nodes with IDs below the smallest record parent are marked as samples.
    sample_size = np.min(cr_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE
    population = np.zeros(num_nodes, dtype=np.int32)
    time = np.zeros(num_nodes, dtype=np.float64)
    time[cr_node] = np.array(trees_group["time"])
    population[cr_node] = np.array(trees_group["population"], dtype=np.int32)
    if "samples" in root:
        samples_group = root["samples"]
        population[:sample_size] = samples_group["population"]
        if "time" in samples_group:
            time[:sample_size] = samples_group["time"]
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, population=population, time=time)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        mutations_group = root["mutations"]
        _convert_hdf5_mutations(
            mutations_group, sites, mutations, remove_duplicate_positions)
        provenances.add_row(
            timestamp=old_timestamp,
            record=_get_v2_provenance(
                "generate_mutations", mutations_group.attrs))
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=provenances)
def get_tree_sequence(self, rescale_positions=True, all_sites=False):
    """
    Returns the current state of the build tree sequence. All samples and
    ancestors will have the sample node flag set.

    :param bool rescale_positions: If True, edge coordinates are mapped from
        site indexes to the positions stored in ``self.sample_data``;
        otherwise sites sit at positions ``0, 1, ..., num_sites - 1``.
    :param bool all_sites: If True, the singleton sites (with a mutation on
        the corresponding sample) and the invariant sites recorded in
        ``self.sample_data`` are appended as well.
    :return: The tree sequence loaded from the sorted tables.
    """
    # TODO Change the API here to ask whether we want a final tree sequence
    # or not. In the latter case we also need to translate the ancestral
    # and derived states to the input values.
    tsb = self.tree_sequence_builder
    flags, time = tsb.dump_nodes()
    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags, time=time)

    left, right, parent, child = tsb.dump_edges()
    if rescale_positions:
        position = self.sample_data.position[:]
        sequence_length = self.sample_data.sequence_length
        if sequence_length is None or sequence_length < position[-1]:
            sequence_length = position[-1] + 1
        # Subset down to the variants.
        position = position[self.sample_data.variant_site[:]]
        # x maps a site index to a coordinate: index 0 is pinned to 0 and
        # the extra last entry is the sequence length.
        x = np.hstack([position, [sequence_length]])
        x[0] = 0
        left = x[left]
        right = x[right]
    else:
        position = np.arange(tsb.num_sites)
        sequence_length = max(1, tsb.num_sites)
    edges = msprime.EdgeTable()
    edges.set_columns(left=left, right=right, parent=parent, child=child)

    # Every state is a single character, so the offsets are 0, 1, ..., n.
    sites = msprime.SiteTable()
    sites.set_columns(
        position=position,
        ancestral_state=np.zeros(tsb.num_sites, dtype=np.int8) + ord('0'),
        ancestral_state_offset=np.arange(tsb.num_sites + 1, dtype=np.uint32))
    mutations = msprime.MutationTable()
    # The columns come straight from the builder; no pre-allocation needed.
    site, node, derived_state, mutation_parent = tsb.dump_mutations()
    # Shift the numeric states so they become the characters '0', '1', ...
    derived_state += ord('0')
    mutations.set_columns(
        site=site, node=node, derived_state=derived_state,
        derived_state_offset=np.arange(
            tsb.num_mutations + 1, dtype=np.uint32),
        parent=mutation_parent)
    if all_sites:
        # Append the sites and mutations for each singleton.
        num_singletons = self.sample_data.num_singleton_sites
        singleton_site = self.sample_data.singleton_site[:]
        singleton_sample = self.sample_data.singleton_sample[:]
        pos = self.sample_data.position[:]
        new_sites = np.arange(
            len(sites), len(sites) + num_singletons, dtype=np.int32)
        sites.append_columns(
            position=pos[singleton_site],
            ancestral_state=np.zeros(num_singletons, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(
                num_singletons + 1, dtype=np.uint32))
        mutations.append_columns(
            site=new_sites, node=self.sample_ids[singleton_sample],
            derived_state=np.zeros(num_singletons, dtype=np.int8) + ord('1'),
            derived_state_offset=np.arange(
                num_singletons + 1, dtype=np.uint32))
        # Get the invariant sites
        num_invariants = self.sample_data.num_invariant_sites
        invariant_site = self.sample_data.invariant_site[:]
        sites.append_columns(
            position=pos[invariant_site],
            ancestral_state=np.zeros(num_invariants, dtype=np.int8) + ord('0'),
            ancestral_state_offset=np.arange(
                num_invariants + 1, dtype=np.uint32))

    msprime.sort_tables(nodes, edges, sites=sites, mutations=mutations)
    return msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        sequence_length=sequence_length)
def simplify(self, generation, ancestry):
    """
    Append the data held by ``ancestry`` to the node and edge tables and
    simplify them, accumulating the CPU time spent in each phase.

    The new edges are reversed and then sorted one parent-time slice at a
    time; the edges already in the table (sorted by the previous call) are
    appended after them.

    :param generation: Current generation; existing node times are shifted
        by the difference to ``self.last_gc_time``.
    :param ancestry: Object providing ``acquire()``/``release()``,
        ``nodes``, ``edges`` and ``samples``.
    :return: The tuple ``(True, number of rows in the node table)``.
    """
    # update node times:
    if self.__nodes.num_rows > 0:
        tc = self.__nodes.time
        dt = float(generation) - self.last_gc_time
        tc += dt
        self.last_gc_time = generation
        flags = np.ones(self.__nodes.num_rows, dtype=np.uint32)
        self.__nodes.set_columns(
            flags=flags, population=self.__nodes.population, time=tc)

    before = time.process_time()
    # Acquire mutex
    ancestry.acquire()
    try:
        self.reverse_time(ancestry.nodes)
        na = np.array(ancestry.nodes, copy=False)
        ea = np.array(ancestry.edges, copy=False)
        new_min_id = na['id'][0]
        new_max_id = na['id'][-1]
        # Re-index if the incoming IDs do not start at the next table row.
        delta = new_min_id - len(self.__nodes)
        if delta != 0:
            self.update_indexes(ancestry.edges, ancestry.samples, delta,
                                new_min_id, new_max_id)
        samples = np.array(ancestry.samples, copy=False)
        flags = np.ones(len(na), dtype=np.uint32)
        self.__time_prepping += time.process_time() - before

        before = time.process_time()
        self.__nodes.append_columns(
            flags=flags, population=na['population'], time=na['generation'])
        # Copy the already sorted edges to local arrays
        left = self.__edges.left[:]
        right = self.__edges.right[:]
        parent = self.__edges.parent[:]
        child = self.__edges.child[:]
        # Get the new edges and reverse them. After this, we know that all
        # edges are correctly sorted with respect to time. We then sort each
        # time slice individually, reducing the overall cost of the sort.
        new_left = ea['left'][::-1]
        new_right = ea['right'][::-1]
        new_parent = ea['parent'][::-1]
        new_child = ea['child'][::-1]
        parent_time = self.__nodes.time[new_parent]
        breakpoints = np.where(parent_time[1:] != parent_time[:-1])[0] + 1
        self.__edges.reset()
        self.__time_appending += time.process_time() - before

        before = time.process_time()
        start = 0
        # The last slice must end at the array length: an end of -1 would
        # silently drop the final new edge.
        for end in itertools.chain(breakpoints, [len(new_parent)]):
            assert np.all(parent_time[start:end] == parent_time[start])
            self.__edges.append_columns(left=new_left[start:end],
                                        right=new_right[start:end],
                                        parent=new_parent[start:end],
                                        child=new_child[start:end])
            msprime.sort_tables(nodes=self.__nodes, edges=self.__edges,
                                edge_start=start)
            start = end
        self.__time_sorting += time.process_time() - before

        # Append the old sorted edges to the table.
        self.__edges.append_columns(
            left=left, right=right, parent=parent, child=child)
        before = time.process_time()
        msprime.simplify_tables(
            samples=samples.tolist(), nodes=self.__nodes, edges=self.__edges)
    finally:
        # Release any locks on the ancestry object, also on failure.
        ancestry.release()
    self.__last_edge_start = len(self.__edges)
    self.__time_simplifying += time.process_time() - before
    self.__process = True
    return (True, self.__nodes.num_rows)
def writer(thread_index, results):
    """
    Sort the tables of the enclosing scope's ``tables`` object in place.

    Neither ``thread_index`` nor ``results`` is used.
    """
    table_kwargs = tables.asdict()
    msprime.sort_tables(**table_kwargs)