def _get_upgrade_provenance(root):
    """
    Builds the provenance record describing an upgrade of the specified
    HDF5 file and returns it as encoded JSON bytes.

    The record's parameters hold the command name ``"upgrade"`` and the
    file's ``format_version`` attribute converted to a list of ints.
    """
    # TODO add more parameters here like filename, etc.
    source_version = [int(part) for part in root.attrs["format_version"]]
    record = provenance.get_provenance_dict(
        {"command": "upgrade", "source_version": source_version}
    )
    return json.dumps(record).encode()
def _get_v2_provenance(command, attrs):
    """
    Returns the V2 tree provenance attributes reformatted as a provenance
    record, serialised as encoded JSON bytes.

    :param command: Value stored under the ``"command"`` key of the
        parameters passed to ``provenance.get_provenance_dict``.
    :param attrs: Mapping providing the ``"environment"`` and ``"parameters"``
        entries; each is converted with ``str`` and parsed as JSON.
    :return: The JSON-encoded provenance record as bytes.
    """
    environment = {}
    parameters = {}
    # Try to get the provenance strings. Malformed JSON should not prevent us
    # from finishing the conversion, so a parse failure only logs a warning
    # and leaves the corresponding dict empty.
    try:
        environment = json.loads(str(attrs["environment"]))
    except ValueError:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling.
        logging.warning("Failed to convert environment provenance")
    try:
        parameters = json.loads(str(attrs["parameters"]))
    except ValueError:
        logging.warning("Failed to convert parameters provenance")
    parameters["command"] = command
    provenance_dict = provenance.get_provenance_dict(parameters)
    # Overwrite the freshly generated version/environment with the values
    # recovered from the original file.
    provenance_dict["version"] = environment.get("msprime_version", "Unknown_version")
    provenance_dict["environment"] = environment
    return json.dumps(provenance_dict).encode()
def add_provenance(provenance_table, method_name):
    """
    Adds a row to ``provenance_table`` holding the JSON-serialised provenance
    record for the command ``tsutil.<method_name>``.
    """
    command = "tsutil.{}".format(method_name)
    record = provenance.get_provenance_dict({"command": command})
    provenance_table.add_row(json.dumps(record))
def add_provenance(provenance_table, method_name):
    """
    Records the command ``tsutil.<method_name>`` in ``provenance_table`` by
    appending the JSON-serialised provenance record as a new row.
    """
    parameters = {"command": "tsutil.{}".format(method_name)}
    serialised = json.dumps(provenance.get_provenance_dict(parameters))
    provenance_table.add_row(serialised)
def add_provenance(provenance_table, method_name):
    """
    Appends to ``provenance_table`` a JSON-serialised provenance record whose
    command is ``tsutil.<method_name>``.
    """
    parameters = {"command": f"tsutil.{method_name}"}
    record_json = json.dumps(provenance.get_provenance_dict(parameters))
    provenance_table.add_row(record_json)
def randomly_split_polytomies(
    self,
    *,
    epsilon=None,
    squash_edges=True,
    record_provenance=True,
    random_seed=None,
):
    """
    Modifies the table collection in place, adding extra nodes and edges
    so that any node with greater than 2 children (i.e. a multifurcation
    or "polytomy") is resolved into successive bifurcations. This is identical
    to :meth:`TreeSequence.randomly_split_polytomies` but acts *in place* to
    alter the data in this :class:`TableCollection`. Please see
    :meth:`TreeSequence.randomly_split_polytomies` for a fuller description,
    and details of parameters.

    :param epsilon: Upper bound on the time gap between successive new
        nodes inserted below a polytomy's parent. Defaults to ``1e-10`` when
        ``None``. The gap actually used is the smaller of ``epsilon`` and
        ``(parent_time - max_time) / (2 * number_of_children)``.
    :param squash_edges: If True, ``self.edges.squash()`` is called after
        the rebuilt tables have been sorted, followed by another sort.
    :param record_provenance: If True, a provenance row with command
        ``"randomly_split_polytomies"`` is appended to ``self.provenances``.
    :param random_seed: Passed as ``seed`` to ``np.random.default_rng``.
    """
    if epsilon is None:
        epsilon = 1e-10
    rng = np.random.default_rng(seed=random_seed)

    def resolve_polytomy(parent_node_id, child_ids, new_nodes_by_time_desc):
        """
        For a polytomy and list of child node ids, return a list of (child, parent)
        tuples, describing a bifurcating tree, rooted at parent_node_id, where the
        new_nodes_by_time_desc have been used to break polytomies. All possible
        topologies should be equiprobable.
        """
        # NOTE(review): rng is only read in this function, never rebound, so
        # this nonlocal declaration has no effect.
        nonlocal rng
        assert len(child_ids) == len(new_nodes_by_time_desc) + 2
        # Polytomies broken by sequentially splicing onto edges, so an initial edge
        # is required. This will always remain above the top node & is removed later
        edges = [
            [child_ids[0], None],
        ]
        # We know beforehand how many random ints are needed: generate them all now.
        # The upper bounds are 1, 3, 5, ...: `edges` starts with one entry and
        # gains two per iteration below, so each draw indexes an existing edge.
        edge_choice = rng.integers(0, np.arange(1, len(child_ids) * 2 - 1, 2))
        # Temporary parent labels; they are re-mapped to real node ids below
        tmp_new_node_lab = [parent_node_id] + new_nodes_by_time_desc
        assert len(edge_choice) == len(child_ids) - 1
        for node_lab, child_id, target_edge_id in zip(
            tmp_new_node_lab, child_ids[1:], edge_choice
        ):
            target_edge = edges[target_edge_id]
            # Insert in the right place, to keep edges in parent time order
            edges.insert(target_edge_id, [child_id, node_lab])
            edges.insert(target_edge_id, [target_edge[0], node_lab])
            # The chosen edge now hangs above the newly spliced-in node
            target_edge[0] = node_lab
        top_edge = edges.pop()  # remove the edge above the top node
        assert top_edge[1] is None
        # Re-map the internal nodes IDs so they are used in time order
        real_node = iter(new_nodes_by_time_desc)
        node_map = {c: c for c in child_ids}  # children keep their own ids
        node_map[edges[-1][1]] = parent_node_id  # last edge == oldest parent
        for e in reversed(edges):
            # Reversing along the edges, parents are in inverse time order
            for idx in (1, 0):  # look at parent (1) then child (0)
                if e[idx] not in node_map:
                    node_map[e[idx]] = next(real_node)
                e[idx] = node_map[e[idx]]
        assert len(node_map) == len(new_nodes_by_time_desc) + len(child_ids) + 1
        return edges

    edge_table = self.edges
    node_table = self.nodes
    # Store existing left, so we can change it if the edge is split
    existing_edges_left = edge_table.left
    # Keep other edge arrays etc. for fast read access
    # NOTE(review): these arrays are still read after edge_table.clear() below,
    # so presumably the column accessors return copies - confirm.
    existing_edges_right = edge_table.right
    existing_edges_parent = edge_table.parent
    existing_edges_child = edge_table.child
    existing_node_time = node_table.time

    # We can save a lot of effort if we don't need to check the time of mutations
    # We definitely don't need to check on the first iteration, as there is no
    # previous tree then (prev_tree is None when pos == 0)
    check_mutations = np.any(
        np.logical_not(tskit.is_unknown_time(self.mutations.time))
    )
    ts = self.tree_sequence()  # Only needed to check mutations
    tree_iter = ts.trees()  # ditto
    # The edge table is rebuilt from scratch as we walk along the edge diffs
    edge_table.clear()

    edges_from_node = collections.defaultdict(set)  # Active descendant edge ids
    nodes_changed = set()

    for interval, e_out, e_in in ts.edge_diffs(include_terminal=True):
        pos = interval[0]
        # The tree to the left of `pos`; there is none at the first position
        prev_tree = None if pos == 0 else next(tree_iter)

        # Any parent gaining or losing a child edge here needs to be examined
        for edge in itertools.chain(e_out, e_in):
            if edge.parent != tskit.NULL:
                nodes_changed.add(edge.parent)

        # Maps node id -> largest known mutation time above that node in prev_tree
        oldest_mutation_for_node = {}
        if check_mutations and prev_tree is not None:
            # It would also help if mutations were sorted such that all mutations
            # above the same node appeared consecutively, with oldest first.
            for site in prev_tree.sites():
                for mutation in site.mutations:
                    if not tskit.is_unknown_time(mutation.time):
                        if mutation.node in oldest_mutation_for_node:
                            oldest_mutation_for_node[mutation.node] = max(
                                oldest_mutation_for_node[mutation.node], mutation.time
                            )
                        else:
                            oldest_mutation_for_node[mutation.node] = mutation.time

        # At this point edges_from_node still describes the tree to the left of
        # `pos`: the e_out / e_in updates are only applied further down.
        for parent_node in nodes_changed:
            child_edge_ids = edges_from_node[parent_node]
            if len(child_edge_ids) >= 3:
                # We have a previous polytomy to break
                parent_time = existing_node_time[parent_node]
                new_nodes = []
                child_ids = existing_edges_child[list(child_edge_ids)]
                left = None
                # Oldest time among the children and (if checked) the mutations
                # above them; the new nodes must all be older than this
                max_time = 0
                # Split existing edges
                for edge_id, child_id in zip(child_edge_ids, child_ids):
                    max_time = max(max_time, existing_node_time[child_id])
                    if check_mutations and child_id in oldest_mutation_for_node:
                        max_time = max(max_time, oldest_mutation_for_node[child_id])
                    # All child edges of a polytomy are expected to share the
                    # same left coordinate
                    if left is None:
                        left = existing_edges_left[edge_id]
                    else:
                        assert left == existing_edges_left[edge_id]
                    if existing_edges_right[edge_id] > interval[0]:
                        # make sure we carry on the edge after this polytomy
                        existing_edges_left[edge_id] = pos
                # Arbitrarily, if epsilon is not small enough, use half the min dist
                dt = min((parent_time - max_time) / (len(child_ids) * 2), epsilon)
                # Break this N-degree polytomy. This requires N-2 extra nodes to be
                # introduced: create them here in order of decreasing time
                new_nodes = [
                    node_table.add_row(time=parent_time - (i * dt))
                    for i in range(1, len(child_ids) - 1)
                ]
                # print("New nodes:", new_nodes, node_table.time[new_nodes])
                for new_edge in resolve_polytomy(parent_node, child_ids, new_nodes):
                    edge_table.add_row(
                        left=left,
                        right=pos,
                        child=new_edge[0],
                        parent=new_edge[1],
                    )
                    # print("new_edge: left={}, right={}, child={}, parent={}"
                    #     .format(left, pos, new_edge[0], new_edge[1]))
            else:
                # Previous node was not a polytomy - just add the edges_out
                for edge_id in child_edge_ids:
                    if existing_edges_right[edge_id] == pos:  # is an out edge
                        edge_table.add_row(
                            left=existing_edges_left[edge_id],
                            right=pos,
                            parent=parent_node,
                            child=existing_edges_child[edge_id],
                        )

        # Now bring edges_from_node up to date with the tree starting at `pos`
        for edge in e_out:
            if edge.parent != tskit.NULL:
                # print("REMOVE", edge.id)
                edges_from_node[edge.parent].remove(edge.id)
        for edge in e_in:
            if edge.parent != tskit.NULL:
                # print("ADD", edge.id)
                edges_from_node[edge.parent].add(edge.id)

        # Chop if we have created a polytomy: the polytomy itself will be resolved
        # at a future iteration, when any edges move into or out of the polytomy
        while nodes_changed:
            node = nodes_changed.pop()
            edge_ids = edges_from_node[node]
            # print("Looking at", node)
            if len(edge_ids) == 0:
                # Drop childless nodes so the final emptiness assertion can hold
                del edges_from_node[node]
            # if this node has changed *to* a polytomy, we need to cut all of the
            # child edges that were previously present by adding the previous
            # segment and left-truncating
            elif len(edge_ids) >= 3:
                for edge_id in edge_ids:
                    if existing_edges_left[edge_id] < interval[0]:
                        self.edges.add_row(
                            left=existing_edges_left[edge_id],
                            right=interval[0],
                            parent=existing_edges_parent[edge_id],
                            child=existing_edges_child[edge_id],
                        )
                        existing_edges_left[edge_id] = interval[0]

    # After the terminal edge diff every edge should have been removed
    assert len(edges_from_node) == 0
    self.sort()

    if squash_edges:
        self.edges.squash()
        self.sort()  # Bug: https://github.com/tskit-dev/tskit/issues/808

    if record_provenance:
        parameters = {"command": "randomly_split_polytomies"}
        self.provenances.add_row(
            record=json.dumps(provenance.get_provenance_dict(parameters))
        )