def _upgrade_old_tables(tables):
    '''
    Upgrade, in place, tables written with an older SLiM file version.

    A warning naming the old file version and ``slim_file_version`` is always
    issued. Mutation metadata of file versions 0.1 and 0.2 is passed through
    ``_decode_mutation_pre_nucleotides`` and re-packed; node and migration
    times of file version 0.1 are shifted by the SLiM generation found in the
    provenance. Finally a validated pyslim provenance record describing the
    upgrade is appended to the provenance table.

    :param TableCollection tables: The tables to modify.
    '''
    # Warnings emitted while the provenance is being read are suppressed;
    # the explicit warning below is issued outside of this filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        old_provenance = get_provenance(tables)
    file_version = old_provenance.file_version
    slim_generation = old_provenance.slim_generation
    warnings.warn(
        "This is an version {} SLiM tree sequence.".format(file_version)
        + " When you write this out, "
        + "it will be converted to version {}.".format(slim_file_version))

    if file_version in ("0.1", "0.2"):
        # add empty nucleotide slots to metadata
        mutations = tables.mutations
        upgraded = [
            _decode_mutation_pre_nucleotides(raw)
            for raw in tskit.unpack_bytes(mutations.metadata,
                                          mutations.metadata_offset)
        ]
        packed, packed_offset = tskit.pack_bytes(upgraded)
        mutations.set_columns(
            site=mutations.site,
            node=mutations.node,
            parent=mutations.parent,
            derived_state=mutations.derived_state,
            derived_state_offset=mutations.derived_state_offset,
            metadata=packed,
            metadata_offset=packed_offset)

    if file_version == "0.1":
        # shift times
        nodes = tables.nodes
        shifted_node_times = nodes.time + slim_generation
        nodes.set_columns(
            flags=nodes.flags,
            time=shifted_node_times,
            population=nodes.population,
            individual=nodes.individual,
            metadata=nodes.metadata,
            metadata_offset=nodes.metadata_offset)
        migrations = tables.migrations
        shifted_migration_times = migrations.time + slim_generation
        migrations.set_columns(
            left=migrations.left,
            right=migrations.right,
            node=migrations.node,
            source=migrations.source,
            dest=migrations.dest,
            time=shifted_migration_times)

    # Key order is kept as-is because the record is serialised with json.dumps.
    software = {
        "name": "pyslim",
        "version": pyslim_version,
    }
    parameters = {
        "command": ["_upgrade_old_tables"],
        "old_file_version": file_version,
        "new_file_version": slim_file_version,
    }
    upgrade_record = {
        "schema_version": "1.0.0",
        "software": software,
        "parameters": parameters,
        "environment": get_environment(),
    }
    tskit.validate_provenance(upgrade_record)
    tables.provenances.add_row(json.dumps(upgrade_record))
def extract_population_metadata(tables):
    '''
    Lazily decodes the metadata of every row of the population table.

    Each yielded item is the result of calling ``decode_population`` on the
    raw metadata bytes of one population, in table order.

    :param TableCollection tables: The tables, as produced by SLiM.
    '''
    populations = tables.populations
    raw_entries = tskit.unpack_bytes(populations.metadata,
                                     populations.metadata_offset)
    yield from map(decode_population, raw_entries)
def test_annotate_nodes(self):
    # Node metadata must survive a decode/encode round trip, and writing the
    # decoded metadata back must leave the tables unchanged.
    for ts in self.get_slim_examples():
        original = ts.tables
        annotated = ts.tables
        decoded = []
        raw_entries = tskit.unpack_bytes(original.nodes.metadata,
                                         original.nodes.metadata_offset)
        for raw in raw_entries:
            node_md = pyslim.decode_node(raw)
            self.assertEqual(raw, pyslim.encode_node(node_md))
            decoded.append(node_md)
        pyslim.annotate_node_metadata(annotated, decoded)
        self.assertEqual(original, annotated)
def nodes_time(tree_sequence, unconstrained=True):
    """
    Return an array holding one time per node of ``tree_sequence``.

    With ``unconstrained=False`` this is simply a copy of the ``time`` column
    of the node table. With ``unconstrained=True`` the time of every
    non-sample node is replaced by the ``"mn"`` entry of that node's JSON
    metadata; sample nodes keep the time stored in the node table.

    :param tree_sequence: The tree sequence whose node times are wanted.
    :param bool unconstrained: Whether to read non-sample node times from
        the ``"mn"`` metadata field.
    :return: A numpy array of node times, independent of the tree sequence.
    :raises ValueError: If ``unconstrained`` is true and the metadata of a
        non-sample node is not valid JSON or has no ``"mn"`` key.
    """
    node_table = tree_sequence.tables.nodes
    # Work on an explicit copy: ``time[:]`` would only be a view.
    nodes_age = node_table.time.copy()
    if unconstrained:
        # Build the sample lookup once instead of re-fetching and scanning
        # the sample array for every node.
        sample_ids = {int(u) for u in tree_sequence.samples()}
        packed = tskit.unpack_bytes(node_table.metadata,
                                    node_table.metadata_offset)
        for index, met in enumerate(packed):
            if index in sample_ids:
                continue
            try:
                nodes_age[index] = json.loads(met.decode())["mn"]
            except (KeyError, json.decoder.JSONDecodeError) as err:
                raise ValueError("Tree Sequence must be tsdated with the "
                                 "Inside-Outside Method. Use unconstrained=False "
                                 "if not.") from err
    return nodes_age
def test_annotate_populations(self):
    # Population metadata must survive a decode/encode round trip; each of
    # the three pyslim calls involved is expected to emit a
    # DeprecationWarning of its own.
    for ts in self.get_slim_examples():
        original = ts.tables
        annotated = ts.tables
        decoded = []
        raw_entries = tskit.unpack_bytes(original.populations.metadata,
                                         original.populations.metadata_offset)
        for raw in raw_entries:
            with self.assertWarns(DeprecationWarning):
                pop_md = pyslim.decode_population(raw)
            with self.assertWarns(DeprecationWarning):
                reencoded = pyslim.encode_population(pop_md)
            self.assertEqual(raw, reencoded)
            decoded.append(pop_md)
        with self.assertWarns(DeprecationWarning):
            pyslim.annotate_population_metadata(annotated, decoded)
        self.assertTableCollectionsEqual(original, annotated)
def extract_mutation_metadata(tables):
    '''
    Returns an iterator over lists of :class:`MutationMetadata` objects
    containing information about the mutations in the tables. One list is
    yielded per row of the mutation table, built from the ``'mutation_list'``
    entry of that row's metadata.

    .. warning::

        This method is deprecated, since metadata handling has been taken over
        by tskit. It will disappear at some point in the future.

    :param TableCollection tables: The tables, as produced by SLiM.
    '''
    _deprecation_warning("extract_mutation_metadata")
    # The raw metadata column is deliberately not unpacked here: each row's
    # ``metadata`` attribute is read directly, so unpacking the bytes would
    # only be a discarded extra pass over the table.
    for mut in tables.mutations:
        yield [MutationMetadata.fromdict(mm)
               for mm in mut.metadata['mutation_list']]
def nodes_time_unconstrained(tree_sequence):
    """
    Return the unconstrained node times for every node in a tree sequence that has
    been dated using ``tsdate`` with the inside-outside algorithm (these times are
    stored in the node metadata). Will produce an error if the tree sequence does
    not contain this information.

    Sample nodes keep the time stored in the node table; every other node gets
    the ``"mn"`` entry of its JSON metadata.

    :param tree_sequence: The dated tree sequence.
    :return: A numpy array with one time per node (a copy, not a view).
    :raises ValueError: If the metadata of a non-sample node is not valid JSON
        or has no ``"mn"`` key.
    """
    node_table = tree_sequence.tables.nodes
    nodes_time = node_table.time.copy()
    # Build the sample lookup once; asking the tree sequence for its sample
    # array and scanning it for every node is needlessly quadratic.
    sample_ids = {int(u) for u in tree_sequence.samples()}
    packed = tskit.unpack_bytes(node_table.metadata, node_table.metadata_offset)
    for index, met in enumerate(packed):
        if index in sample_ids:
            continue
        try:
            nodes_time[index] = json.loads(met.decode())["mn"]
        except (KeyError, json.decoder.JSONDecodeError) as err:
            raise ValueError(
                "Tree Sequence must be tsdated with the Inside-Outside Method."
            ) from err
    return nodes_time
def posterior_mean_var(ts, timepoints, posterior, Ne, *, fixed_node_set=None):
    """
    Mean and variance of node age. Fixed nodes will be given a mean of their
    exact time in the tree sequence, and zero variance (as long as they are
    identified by the ``fixed_node_set``). If ``fixed_node_set`` is None, the
    samples of ``ts`` are treated as the fixed nodes.

    Also assigns the estimated mean and variance of the age of each non-fixed
    node as JSON metadata (keys ``"mn"`` and ``"vr"``) in the returned tree
    sequence. ``timepoints`` is multiplied by ``2 * Ne`` before the moments are
    computed.

    :param ts: The tree sequence being dated.
    :param timepoints: Array of time grid values, one per posterior column.
    :param posterior: Object providing ``grid_data`` (one row of weights per
        non-fixed node) and ``nonfixed_nodes`` (the matching node ids).
    :param Ne: Effective population size used to rescale ``timepoints``.
    :param fixed_node_set: Iterable of node ids whose times are taken as known.
    :return: Tuple ``(ts, mn_post, vr_post)``: a new tree sequence with the
        updated node metadata, and per-node arrays of mean and variance. Nodes
        that are neither fixed nor covered by ``posterior`` are left as NaN.
    """
    mn_post = np.full(ts.num_nodes, np.nan)  # Fill with NaNs so we detect when there's
    vr_post = np.full(ts.num_nodes, np.nan)  # been an error

    tables = ts.dump_tables()
    node_table = tables.nodes
    if fixed_node_set is None:
        fixed_node_set = ts.samples()
    # An explicit integer dtype keeps an empty fixed set usable as an index
    # (np.array([]) alone would be float64 and be rejected by numpy).
    fixed_nodes = np.array(list(fixed_node_set), dtype=np.int64)
    mn_post[fixed_nodes] = node_table.time[fixed_nodes]
    vr_post[fixed_nodes] = 0

    # ``tables`` is already a private copy of the tree sequence's tables, so
    # read the metadata from it rather than requesting ``ts.tables`` again.
    metadata_array = tskit.unpack_bytes(node_table.metadata,
                                        node_table.metadata_offset)
    timepoints = timepoints * 2 * Ne
    for row, node_id in zip(posterior.grid_data, posterior.nonfixed_nodes):
        row_total = np.sum(row)
        mn_post[node_id] = np.sum(row * timepoints) / row_total
        vr_post[node_id] = np.sum(
            ((mn_post[node_id] - (timepoints))**2) * (row / row_total))
        metadata_array[node_id] = json.dumps({
            "mn": mn_post[node_id],
            "vr": vr_post[node_id]
        }).encode()
    md, md_offset = tskit.pack_bytes(metadata_array)
    node_table.set_columns(
        flags=node_table.flags,
        time=node_table.time,
        population=node_table.population,
        individual=node_table.individual,
        metadata=md,
        metadata_offset=md_offset,
    )
    ts = tables.tree_sequence()
    return ts, mn_post, vr_post
def get_mut_ages(ts, unconstrained=True, ignore_sample_muts=False, geometric=True):
    """
    Estimate an age for every site from the oldest mutation at that site.

    The age of a mutation is taken between the time of the node it sits above
    and the time of that node's parent in the local tree: the geometric mean
    if ``geometric`` is true, otherwise the arithmetic mean. For each site the
    largest such age is kept, together with the parent time and the id of the
    mutation that produced it.

    :param ts: The tree sequence.
    :param bool unconstrained: If true, the time of every non-sample node is
        read from the ``"mn"`` entry of its JSON metadata instead of the node
        table.
    :param bool ignore_sample_muts: See the review note in the body; the mask
        derived from this flag is currently not applied.
    :param bool geometric: Use the geometric rather than the arithmetic mean.
    :return: Tuple ``(mut_ages, mut_upper_bounds, oldest_mut_ids)`` of arrays
        indexed by site id; sites without mutations keep the value 0.
    """
    mut_ages = np.zeros(ts.num_sites)
    mut_upper_bounds = np.zeros(ts.num_sites)
    oldest_mut_ids = np.zeros(ts.num_sites)

    node_table = ts.tables.nodes
    # Copy explicitly, since the array is overwritten below.
    node_ages = node_table.time.copy()
    if unconstrained:
        # Build the sample lookup once rather than re-fetching and scanning
        # the sample array for every node.
        sample_ids = {int(u) for u in ts.samples()}
        packed = tskit.unpack_bytes(node_table.metadata,
                                    node_table.metadata_offset)
        for index, met in enumerate(packed):
            if index not in sample_ids:
                node_ages[index] = json.loads(met.decode())["mn"]

    if ignore_sample_muts:
        # NOTE(review): this mask (False for mutations that sit above a sample
        # node at a site carrying more than one mutation) is computed but never
        # consulted in the loop below, so ``ignore_sample_muts`` currently has
        # no effect on the result. Left untouched pending confirmation of the
        # intended filtering.
        mutations_table = ts.tables.mutations
        unique_sites = np.unique(ts.tables.mutations.site, return_counts=True)
        unique_sites = unique_sites[0][unique_sites[1] > 1]
        no_samp_muts = ~np.logical_and(  # noqa: F841
            np.isin(mutations_table.site, unique_sites),
            np.isin(mutations_table.node, ts.samples()),
        )

    for tree in tqdm(ts.trees(), total=ts.num_trees, desc="Finding mutation ages"):
        for site in tree.sites():
            for mut in site.mutations:
                parent_age = node_ages[tree.parent(mut.node)]
                if geometric:
                    age = np.sqrt(node_ages[mut.node] * parent_age)
                else:
                    age = (node_ages[mut.node] + parent_age) / 2
                if mut_ages[site.id] < age:
                    mut_upper_bounds[site.id] = parent_age
                    mut_ages[site.id] = age
                    oldest_mut_ids[site.id] = mut.id
    return mut_ages, mut_upper_bounds, oldest_mut_ids.astype(int)
def combine_chromosome_arms(args):
    """
    Splices two chromosome arms together to form a full chromosome.

    Loads the tree sequences at ``args.p_arm`` (short arm) and ``args.q_arm``
    (long arm), appends the long arm's non-sample nodes, edges, sites and
    mutations to the short arm's tables with node and site ids renumbered,
    and writes the combined tree sequence to ``args.output``.

    The two arms must have the same number of samples and identical
    individual metadata; these and several size invariants of the result are
    checked with ``assert``.

    :param args: Object with attributes ``p_arm``, ``q_arm`` and ``output``
        (presumably an argparse namespace of file paths; verify against the
        caller).
    """
    short_arm = tskit.load(args.p_arm)
    long_arm = tskit.load(args.q_arm)
    assert short_arm.num_samples == long_arm.num_samples
    # Remove material before first position and after last position
    short_arm = short_arm.keep_intervals(
        [[
            short_arm.tables.sites.position[0] - 1,
            short_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    long_arm = long_arm.keep_intervals(
        [[
            long_arm.tables.sites.position[0] - 1,
            long_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    short_tables = short_arm.dump_tables()
    long_tables = long_arm.dump_tables()
    assert np.array_equal(short_tables.individuals.metadata, long_tables.individuals.metadata)
    # The combined tables are built inside short_tables and take the long
    # arm's sequence length.
    short_tables.sequence_length = long_arm.get_sequence_length()

    # Node metadata: all of the short arm's entries followed by the long
    # arm's entries from index num_samples onwards.
    # NOTE(review): this slice assumes the long arm's samples are its first
    # num_samples nodes - confirm.
    short_metadata = short_tables.nodes.metadata
    short_metadata_offset = short_tables.nodes.metadata_offset
    short_metadata = tskit.unpack_bytes(short_metadata, short_metadata_offset)
    long_metadata = long_tables.nodes.metadata
    long_metadata_offset = long_tables.nodes.metadata_offset
    long_metadata = tskit.unpack_bytes(long_metadata, long_metadata_offset)
    long_metadata = long_metadata[long_arm.num_samples:]
    combined_metadata = np.concatenate([short_metadata, long_metadata])
    metadata, metadata_offset = tskit.pack_bytes(combined_metadata)

    # Append the long arm's non-sample nodes, then rewrite the node table so
    # that it carries the combined metadata.
    all_nodes_except_samples = ~np.isin(np.arange(long_arm.num_nodes), long_arm.samples())
    short_tables.nodes.append_columns(
        long_tables.nodes.flags[all_nodes_except_samples],
        long_tables.nodes.time[all_nodes_except_samples],
        long_tables.nodes.population[all_nodes_except_samples],
    )
    short_tables.nodes.set_columns(
        flags=short_tables.nodes.flags,
        time=short_tables.nodes.time,
        population=short_tables.nodes.population,
        metadata=metadata,
        individual=short_tables.nodes.individual,
        metadata_offset=metadata_offset,
    )

    # Renumber the node ids referenced by the long arm's edges, in steps:
    #   1. ids that are not long-arm samples are shifted up by
    #      short_arm.num_nodes;
    #   2. ids that were greater than the last long-arm sample id (tested on
    #      the original, unshifted column) are shifted down by num_samples;
    #   3. (child column only) long-arm sample ids are replaced by the
    #      corresponding short-arm sample ids via long_arm_sample_map.
    long_edges_parent = long_tables.edges.parent
    long_edges_child = long_tables.edges.child
    long_arm_sample_map = np.zeros(long_arm.num_nodes).astype(int)
    long_arm_sample_map[long_arm.samples()] = short_arm.samples()
    long_edges_parent[~np.isin(long_edges_parent, long_arm.samples(
    ))] = long_edges_parent[~np.isin(long_edges_parent, long_arm.samples()
                                     )] + (short_arm.num_nodes)
    long_edges_parent[
        long_arm.tables.edges.parent > long_arm.samples()[-1]] = (
            long_edges_parent[
                long_arm.tables.edges.parent > long_arm.samples()[-1]] -
            long_arm.num_samples)
    long_edges_child[~np.isin(long_edges_child, long_arm.samples(
    ))] = long_edges_child[~np.isin(long_edges_child, long_arm.samples())] + (
        short_arm.num_nodes)
    long_edges_child[long_tables.edges.child > long_arm.samples()[-1]] = (
        long_edges_child[long_tables.edges.child > long_arm.samples()[-1]] -
        long_arm.num_samples)
    long_edges_child[np.isin(
        long_tables.edges.child, long_arm.samples())] = long_arm_sample_map[
            long_tables.edges.child[np.isin(long_tables.edges.child,
                                            long_arm.samples())]]
    short_tables.edges.append_columns(
        long_tables.edges.left,
        long_tables.edges.right,
        long_edges_parent,
        long_edges_child,
    )
    short_tables.sites.append_columns(
        long_tables.sites.position,
        long_tables.sites.ancestral_state,
        long_tables.sites.ancestral_state_offset,
    )

    # Mutation nodes get the same three-step renumbering as edge children;
    # mutation site ids are shifted past the short arm's sites.
    long_mutations_node = long_tables.mutations.node
    long_mutations_node[~np.isin(long_mutations_node, long_arm.samples(
    ))] = long_mutations_node[~np.isin(long_mutations_node, long_arm.samples()
                                       )] + (short_arm.num_nodes)
    long_mutations_node[
        long_tables.mutations.node > long_arm.samples()[-1]] = (
            long_mutations_node[
                long_tables.mutations.node > long_arm.samples()[-1]] -
            long_arm.num_samples)
    long_mutations_node[np.isin(long_tables.mutations.node,
                                long_arm.samples())] = long_arm_sample_map[
                                    long_tables.mutations.node[np.isin(
                                        long_tables.mutations.node,
                                        long_arm.samples())]]
    short_tables.mutations.append_columns(
        long_tables.mutations.site + short_arm.num_sites,
        long_mutations_node,
        long_tables.mutations.derived_state,
        long_tables.mutations.derived_state_offset,
    )
    short_tables.sort()
    combined = short_tables.tree_sequence()

    # Sanity checks: the result must contain exactly the union of both arms,
    # with the samples counted once.
    assert combined.num_nodes == (short_arm.num_nodes + long_arm.num_nodes -
                                  short_arm.num_samples)
    assert combined.num_sites == (short_arm.num_sites + long_arm.num_sites)
    assert combined.num_edges == (short_arm.num_edges + long_arm.num_edges)
    assert combined.num_mutations == (short_arm.num_mutations +
                                      long_arm.num_mutations)
    assert (combined.num_individuals == short_arm.num_individuals ==
            long_arm.num_individuals)
    assert np.array_equal(
        np.sort(combined.tables.sites.position),
        np.concatenate(
            [short_arm.tables.sites.position, long_arm.tables.sites.position]),
    )
    assert np.array_equal(
        np.sort(combined.tables.nodes.time[combined.tables.mutations.node]),
        np.sort(
            np.concatenate([
                short_arm.tables.nodes.time[short_arm.tables.mutations.node],
                long_arm.tables.nodes.time[long_arm.tables.mutations.node],
            ])),
    )
    assert np.array_equal(combined.tables.individuals.metadata,
                          long_tables.individuals.metadata)
    combined.dump(args.output)
def test_dump_to_tskit(self):
    # Dump the population's tables to tskit and check that table sizes,
    # edge column sums, total branch length and the individual and mutation
    # metadata all agree with the fwdpy11 originals.
    import tskit
    individual_fields = ('g', 'w', 'e', 'label', 'parents', 'sex', 'deme',
                         'geography')

    def check_individual(expected, decoded):
        for field in individual_fields:
            self.assertEqual(getattr(expected, field), decoded[field])

    dumped_ts = self.pop.dump_tables_to_tskit()
    self.assertEqual(len(dumped_ts.tables.nodes),
                     len(self.pop.tables.nodes))
    self.assertEqual(len(dumped_ts.tables.edges),
                     len(self.pop.tables.edges))
    self.assertEqual(len(dumped_ts.tables.mutations),
                     len(self.pop.tables.mutations))

    eview = np.array(self.pop.tables.edges, copy=False)
    for column in ('parent', 'child', 'left', 'right'):
        self.assertEqual(eview[column].sum(),
                         getattr(dumped_ts.tables.edges, column).sum())

    tree_iterator = fwdpy11.TreeIterator(self.pop.tables,
                                         list(range(2 * self.pop.N)))
    tt_fwd = sum(tree.total_time(self.pop.tables.nodes)
                 for tree in tree_iterator)
    tt_tskit = sum(tree.get_total_branch_length()
                   for tree in dumped_ts.trees())
    self.assertEqual(tt_fwd, tt_tskit)

    # Now, we make sure that the metadata can
    # be decoded
    md = tskit.unpack_bytes(dumped_ts.tables.individuals.metadata,
                            dumped_ts.tables.individuals.metadata_offset)
    for diploid_md, raw in zip(self.pop.diploid_metadata, md):
        check_individual(diploid_md, eval(raw))

    # Test that we can go backwards from node table to individuals
    samples = np.where(
        dumped_ts.tables.nodes.flags == tskit.NODE_IS_SAMPLE)[0]
    self.assertEqual(len(samples), 2 * self.pop.N)
    for node in samples[::2]:
        ind = node // 2
        decoded = eval(md[ind])
        check_individual(self.pop.diploid_metadata[ind], decoded)

    md = tskit.unpack_bytes(dumped_ts.tables.mutations.metadata,
                            dumped_ts.tables.mutations.metadata_offset)
    mutation_rows = zip(self.pop.tables.mutations,
                        dumped_ts.tables.mutations.site, md)
    for record, site_id, raw in mutation_rows:
        decoded = eval(raw)
        self.assertEqual(record.key, decoded['key'])
        site = dumped_ts.tables.sites[site_id]
        mutation = self.pop.mutations[decoded['key']]
        self.assertEqual(site.position, mutation.pos)
        for field in ('s', 'h'):
            self.assertEqual(decoded[field], getattr(mutation, field))
        for field in ('esizes', 'heffects'):
            self.assertTrue(np.array_equal(np.array(decoded[field]),
                                           getattr(mutation, field)))
        for field in ('label', 'neutral'):
            self.assertEqual(decoded[field], getattr(mutation, field))
    self.assertEqual(mcounts_comparison(self.pop, dumped_ts), True)
def get_pairwise_tmrca_pops(
    ts_name,
    max_pop_nodes,
    hist_nbins=30,
    hist_min_gens=1000,
    num_processes=1,
    restrict_populations=None,
    return_raw_data=False,
):
    """
    Get the mean tMRCA and a histogram of tMRCA times for pairs of populations
    from a tree sequence.

    :param ts_name: The tree sequence file, passed to ``tskit.load`` here and
        handed on unchanged to the worker function.
    :param int max_pop_nodes: The maximum number of sample nodes per pop to use.
        This number of samples (or lower, for small populations) will be taken
        at random from each population as a set of representative samples for
        which to construct pairwise statistics
    :param int hist_nbins: The number of bins used to save the histogram data.
        Bins will be spaced out evenly on a log scale.
    :param float hist_min_gens: A lower cutoff for the histogram bins, as there is
        usually very little in the lowest (logged) bins
    :param int num_processes: The number of CPUs to run in parallel on the
        calculation.
    :param list restrict_populations: A list of population IDs or names giving the
        populations among which to calculate pairwise distances. IDs may be given
        as integers or as digit strings. If ``None`` (default) then use all the
        populations defined in the tree sequence.
    :param bool return_raw_data: If ``True``, also return the full dataset of
        weights (which may be huge, as it is
        ~ num_unique_times * n_pops * n_pops / 2)
    :return: a TmrcaData object containing a dataframe of the mean values for
        each pair, a HistData object with the histogram data, and (if
        ``return_raw_data`` is ``True``) a potentially huge numpy array of
        weights of pairs X unique_times
    :rtype: TmrcaData
    :raises ValueError: If the metadata of a non-sample node is not valid JSON.
    """
    ts = tskit.load(ts_name)
    deleted_trees = [tree.index for tree in ts.trees() if tree.parent(0) == -1]
    node_table = ts.tables.nodes
    node_ages = np.zeros_like(node_table.time)
    # Build the sample lookup once instead of re-fetching and scanning the
    # sample array for every node.
    sample_ids = {int(u) for u in ts.samples()}
    try:
        packed = tskit.unpack_bytes(node_table.metadata,
                                    node_table.metadata_offset)
        for index, met in enumerate(packed):
            if index not in sample_ids:
                try:
                    # Get unconstrained node age if available
                    node_ages[index] = json.loads(met.decode())["mn"]
                except json.decoder.JSONDecodeError as err:
                    raise ValueError(
                        "Tree Sequence must be dated to use unconstrained=True"
                    ) from err
        logging.info("Using tsdate unconstrained node times")
    except KeyError:
        # Valid JSON metadata without an "mn" entry: fall back to the times
        # stored in the node table for every node.
        logging.info("Using standard ts node times")
        node_ages[:] = node_table.time[:]
    unique_times, time_index = np.unique(node_ages, return_inverse=True)
    with np.errstate(divide='ignore'):
        log_unique_times = np.log(unique_times)

    # Make a random selection of up to max_pop_nodes samples from each population
    np.random.seed(123)
    pop_nodes = ts.tables.nodes.population[ts.samples()]
    nodes_for_pop = {}
    if restrict_populations is None:
        pops = [pop.id for pop in ts.populations()]
    else:
        # Convert any named populations to population ids
        name2id = {
            json.loads(pop.metadata)["name"]: pop.id
            for pop in ts.populations()
        }
        # str() lets integer ids through as well as digit strings; calling
        # .isdigit() directly on an int would raise AttributeError.
        pops = [
            int(p) if str(p).isdigit() else name2id[p]
            for p in restrict_populations
        ]
    for pop_id in pops:
        metadata = json.loads(ts.population(pop_id).metadata)
        key = metadata["name"]
        # Hack to distinguish SGDP from HGDP (all uppercase) pop names
        if 'region' in metadata and not metadata['region'].isupper():
            key += " (SGDP)"
        assert key not in nodes_for_pop  # Check for duplicate names
        nodes = np.where(pop_nodes == pop_id)[0]
        if len(nodes) > max_pop_nodes:
            nodes_for_pop[key] = np.random.choice(nodes,
                                                  max_pop_nodes,
                                                  replace=False)
        else:
            nodes_for_pop[key] = nodes

    # Make all combinations of populations
    pop_names = list(nodes_for_pop.keys())
    tmrca_df = pd.DataFrame(columns=pop_names, index=pop_names)
    combos = itertools.combinations_with_replacement(
        np.arange(0, len(pop_names)), 2)
    combo_map = {c: i for i, c in enumerate(combos)}
    func_params = zip(
        combo_map.keys(),
        itertools.repeat(time_index),
        itertools.repeat(list(nodes_for_pop.values())),
        itertools.repeat(ts_name),
        itertools.repeat(deleted_trees),
    )
    # The ``np.float`` alias no longer exists in current NumPy; the builtin
    # ``float`` selects the same float64 dtype.
    data = np.zeros((len(combo_map), len(unique_times)), dtype=float)
    with multiprocessing.Pool(processes=num_processes) as pool:
        for tmrca_weight, combo in tqdm(pool.imap_unordered(
                get_tmrca_weights, func_params),
                                        total=len(combo_map)):
            popA = pop_names[combo[0]]
            popB = pop_names[combo[1]]
            keep = (tmrca_weight != 0)  # Deal with log_unique_times[0] == -inf
            mean_log_age = np.sum(log_unique_times[keep] * tmrca_weight[keep])
            mean_log_age /= np.sum(tmrca_weight)  # Normalise
            tmrca_df.loc[popA, popB] = np.exp(mean_log_age)
            data[combo_map[combo], :] = tmrca_weight
    bins, hist_data = make_histogram_data(log_unique_times, data, hist_nbins,
                                          hist_min_gens)
    named_combos = [None] * len(combo_map)
    for combo, i in combo_map.items():
        named_combos[i] = (pop_names[combo[0]], pop_names[combo[1]])
    hist = HistData(bins, hist_data, np.array(named_combos))
    if return_raw_data is False:
        data = None
    return TmrcaData(means=tmrca_df,
                     histogram=hist,
                     raw_data=(log_unique_times, data))
def _set_populations(
        tables, pop_id=None, selfing_fraction=0.0, female_cloning_fraction=0.0,
        male_cloning_fraction=0.0, sex_ratio=0.5, bounds_x0=0.0, bounds_x1=0.0,
        bounds_y0=0.0, bounds_y1=0.0, bounds_z0=0.0, bounds_z1=0.0,
        migration_records=None):
    '''
    Adds to a TableCollection the information about populations required
    for SLiM to load a tree sequence. This will replace anything already in
    the Population table.

    The number of populations is one more than the largest population id in
    the node table. Each numeric argument may be a single number, which is
    used for every population, or a sequence with one entry per population.

    :param TableCollection tables: The tables to annotate.
    :param pop_id: One id per population; defaults to ``0 .. num_pops - 1``.
    :param migration_records: One list of
        :class:`PopulationMigrationMetadata` per population; defaults to
        empty lists.
    :raises ValueError: If the metadata of an individual cannot be decoded.
    '''
    num_pops = max(tables.nodes.population) + 1
    for md in tskit.unpack_bytes(tables.individuals.metadata,
                                 tables.individuals.metadata_offset):
        try:
            ind_md = decode_individual(md)
        except Exception as err:
            # Narrower than a bare ``except``, so that KeyboardInterrupt and
            # SystemExit are not converted into a ValueError.
            raise ValueError("Individuals do not have metadata: "
                             "need to run set_nodes_individuals() first?") from err
        assert(ind_md.population < num_pops)
    if pop_id is None:
        pop_id = list(range(num_pops))
    assert(len(pop_id) == num_pops)

    def per_population(value):
        # Repeat a scalar for every population; sequences pass through. The
        # isinstance test also accepts ints and numpy floats, which the
        # previous ``type(x) is float`` test let fall through to len().
        if isinstance(value, (int, float)):
            value = [float(value) for _ in range(num_pops)]
        assert(len(value) == num_pops)
        return value

    selfing_fraction = per_population(selfing_fraction)
    female_cloning_fraction = per_population(female_cloning_fraction)
    male_cloning_fraction = per_population(male_cloning_fraction)
    sex_ratio = per_population(sex_ratio)
    bounds_x0 = per_population(bounds_x0)
    bounds_x1 = per_population(bounds_x1)
    bounds_y0 = per_population(bounds_y0)
    bounds_y1 = per_population(bounds_y1)
    bounds_z0 = per_population(bounds_z0)
    bounds_z1 = per_population(bounds_z1)

    if migration_records is None:
        migration_records = [[] for _ in range(num_pops)]
    assert(len(migration_records) == num_pops)
    for mrl in migration_records:
        for mr in mrl:
            assert(type(mr) is PopulationMigrationMetadata)
    population_metadata = [
        PopulationMetadata(*x)
        for x in zip(pop_id, selfing_fraction, female_cloning_fraction,
                     male_cloning_fraction, sex_ratio, bounds_x0, bounds_x1,
                     bounds_y0, bounds_y1, bounds_z0, bounds_z1,
                     migration_records)
    ]
    annotate_population_metadata(tables, population_metadata)
def __init__(self, ts, reference_sequence=None):
    '''
    Wrap ``ts``, first upgrading it to file version 0.4 if its provenance
    reports an older version, and pre-compute per-individual locations,
    ages, times and populations.

    :param ts: The tree sequence to wrap.
    :param reference_sequence: Stored unchanged on the instance.
    :raises ValueError: If the nodes of any individual disagree on
        population or on time.
    '''
    provenance = get_provenance(ts)
    slim_generation = provenance.slim_generation
    old_version = provenance.file_version
    if old_version != "0.4":
        warnings.warn(
            "This is an version {} SLiM tree sequence.".format(old_version)
            + " When you write this out, "
            + "it will be converted to version 0.4.")
        tables = ts.dump_tables()
        if old_version in ("0.1", "0.2"):
            # add empty nucleotide slots to metadata
            raw_mutations = tskit.unpack_bytes(tables.mutations.metadata,
                                               tables.mutations.metadata_offset)
            annotate_mutation_metadata(
                tables,
                [_decode_mutation_pre_nucleotides(raw) for raw in raw_mutations])
        if old_version == "0.1":
            # shift times
            shifted_node_times = tables.nodes.time + slim_generation
            tables.nodes.set_columns(
                flags=tables.nodes.flags,
                time=shifted_node_times,
                population=tables.nodes.population,
                individual=tables.nodes.individual,
                metadata=tables.nodes.metadata,
                metadata_offset=tables.nodes.metadata_offset)
            shifted_migration_times = tables.migrations.time + slim_generation
            tables.migrations.set_columns(
                left=tables.migrations.left,
                right=tables.migrations.right,
                node=tables.migrations.node,
                source=tables.migrations.source,
                dest=tables.migrations.dest,
                time=shifted_migration_times)
        upgrade_slim_provenance(tables)
        ts = tables.tree_sequence()
        provenance = get_provenance(ts)
        assert(provenance.file_version == "0.4")

    super().__init__(ts._ll_tree_sequence)
    self.slim_generation = slim_generation
    self.reference_sequence = reference_sequence

    # pre-extract individual metadata
    num_individuals = ts.num_individuals
    self.individual_locations = ts.tables.individuals.location
    # the flat location column is viewed as one (x, y, z) row per individual
    self.individual_locations.shape = (int(len(self.individual_locations)/3), 3)
    self.individual_ages = np.zeros(num_individuals, dtype='int')
    if self.slim_provenance.model_type != "WF":
        self.individual_ages = np.fromiter(
            (decode_individual(ind.metadata).age for ind in ts.individuals()),
            dtype='int64')
    self.individual_times = np.zeros(num_individuals)
    self.individual_populations = np.repeat(np.int32(-1), num_individuals)

    one_population_each = unique_labels_by_group(ts.tables.nodes.individual,
                                                 ts.tables.nodes.population)
    if not np.all(one_population_each):
        raise ValueError("Individual has nodes from more than one population.")
    one_time_each = unique_labels_by_group(ts.tables.nodes.individual,
                                           ts.tables.nodes.time)
    if not np.all(one_time_each):
        raise ValueError("Individual has nodes from more than one time.")

    node_table = ts.tables.nodes
    has_indiv = (node_table.individual >= 0)
    which_indiv = node_table.individual[has_indiv]
    # if we did not do the sanity check above then an individual with nodes in more than one pop
    # would get the pop of their last node in the list
    self.individual_populations[which_indiv] = node_table.population[has_indiv]
    self.individual_times[which_indiv] = node_table.time[has_indiv]