示例#1
0
def _initializeIndividualTable(self, tc):
    """
    Populate the individual table of ``tc`` and return a dict mapping
    node ID -> individual ID.

    Individuals ``0 .. self.N - 1`` are the alive ones; individual ``i`` is
    given nodes ``2 * i`` and ``2 * i + 1``.  Every record in
    ``self.ancient_sample_metadata`` is then given the next free individual
    ID, keyed by the two entries of its ``nodes`` attribute.
    """
    # Alive individuals first.
    node_to_individual = {}
    for ind in range(self.N):
        node_to_individual.update({2 * ind: ind, 2 * ind + 1: ind})
    metadata_strings = _generate_individual_metadata(self.diploid_metadata, tc)

    # Preserved (ancient) samples follow, numbered from self.N upwards.
    next_individual = self.N
    for record in self.ancient_sample_metadata:
        # NOTE(review): this checks the record object itself, not its node
        # IDs, against the node-keyed dict -- confirm that is intended.
        assert record not in node_to_individual, "indivudal record error"
        for node in (record.nodes[0], record.nodes[1]):
            node_to_individual[node] = next_individual
        next_individual += 1

    metadata_strings.extend(
        _generate_individual_metadata(self.ancient_sample_metadata, tc))

    md, mdo = tskit.pack_bytes(metadata_strings)
    total_individuals = self.N + len(self.ancient_sample_metadata)
    flags = [0 for _ in range(total_individuals)]
    tc.individuals.set_columns(flags=flags, metadata=md, metadata_offset=mdo)
    return node_to_individual
示例#2
0
 def test_pickle(self):
     """Pickled per-node metadata survives the node table, a tree sequence
     and a dump/load cycle."""
     ts = msprime.simulate(10, random_seed=1)
     tables = ts.dump_tables()
     # One picklable Python object per node.
     metadata = []
     for j in range(ts.num_nodes):
         metadata.append({"one": j, "two": 2 * j, "three": list(range(j))})
     pickled = [pickle.dumps(entry) for entry in metadata]
     encoded, offset = tskit.pack_bytes(pickled)
     tables.nodes.set_columns(
         flags=tables.nodes.flags,
         time=tables.nodes.time,
         population=tables.nodes.population,
         metadata_offset=offset,
         metadata=encoded,
     )
     # The table must hold exactly the packed columns we handed over.
     self.assertTrue(np.array_equal(tables.nodes.metadata_offset, offset))
     self.assertTrue(np.array_equal(tables.nodes.metadata, encoded))
     ts1 = tables.tree_sequence()
     for index, node in enumerate(ts1.nodes()):
         self.assertEqual(pickle.loads(node.metadata), metadata[index])
     # A round trip through the temporary file must keep the node table.
     ts1.dump(self.temp_file)
     ts2 = tskit.load(self.temp_file)
     self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
示例#3
0
def _initializePopulationTable(node_view, tc):
    """
    Write one population row per distinct value of ``node_view['population']``.

    Each row's metadata is the UTF-8 encoding of ``"deme"`` followed by the
    population value; rows are written in ascending order of that value.
    """
    deme_ids = sorted(np.unique(node_view['population']))
    labels = [("deme" + str(deme)).encode("utf-8") for deme in deme_ids]
    pmd, pmdo = tskit.pack_bytes(labels)
    tc.populations.set_columns(metadata=pmd, metadata_offset=pmdo)
示例#4
0
def _upgrade_old_tables(tables):
    """
    Upgrade ``tables`` in place from an older SLiM file version.

    Steps, depending on the file version found in the provenance:
    - "0.1" or "0.2": re-encode mutation metadata through
      ``_decode_mutation_pre_nucleotides``;
    - "0.1" only: add the SLiM generation to node and migration times.
    A warning is always issued, and a provenance record describing the
    upgrade is always appended.
    """
    # Reading the provenance may itself warn; silence that here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        provenance = get_provenance(tables)
    file_version = provenance.file_version
    slim_generation = provenance.slim_generation

    message = "".join([
        f"This is an version {file_version} SLiM tree sequence.",
        " When you write this out, ",
        f"it will be converted to version {slim_file_version}.",
    ])
    warnings.warn(message)

    if file_version in ("0.1", "0.2"):
        # These versions predate nucleotides: add the empty slots.
        mutations = tables.mutations
        old_rows = tskit.unpack_bytes(mutations.metadata,
                                      mutations.metadata_offset)
        new_rows = [_decode_mutation_pre_nucleotides(row) for row in old_rows]
        metadata, metadata_offset = tskit.pack_bytes(new_rows)
        mutations.set_columns(
            site=mutations.site,
            node=mutations.node,
            parent=mutations.parent,
            derived_state=mutations.derived_state,
            derived_state_offset=mutations.derived_state_offset,
            metadata=metadata,
            metadata_offset=metadata_offset)

    if file_version == "0.1":
        # Shift node and migration times by the SLiM generation.
        nodes = tables.nodes
        shifted_node_times = nodes.time + slim_generation
        nodes.set_columns(flags=nodes.flags,
                          time=shifted_node_times,
                          population=nodes.population,
                          individual=nodes.individual,
                          metadata=nodes.metadata,
                          metadata_offset=nodes.metadata_offset)
        migrations = tables.migrations
        shifted_migration_times = migrations.time + slim_generation
        migrations.set_columns(left=migrations.left,
                               right=migrations.right,
                               node=migrations.node,
                               source=migrations.source,
                               dest=migrations.dest,
                               time=shifted_migration_times)

    software = {"name": "pyslim", "version": pyslim_version}
    parameters = {
        "command": ["_upgrade_old_tables"],
        "old_file_version": file_version,
        "new_file_version": slim_file_version,
    }
    new_record = {
        "schema_version": "1.0.0",
        "software": software,
        "parameters": parameters,
        "environment": get_environment(),
    }
    tskit.validate_provenance(new_record)
    tables.provenances.add_row(json.dumps(new_record))
示例#5
0
def _set_sites_mutations(tables):
    '''
    Adds to a TableCollection the information relevant to mutations required
    for SLiM to load in a tree sequence.  Concretely, this:

    - sets the derived state of mutation ``j`` to the text ``str(j)``, which
      serves as its SLiM ID;
    - replaces each mutation's metadata with a one-entry ``mutation_list``
      built from ``default_slim_metadata("mutation")``, whose ``slim_time``
      is ``tables.metadata["SLiM"]["generation"]`` minus the mutation's time;
    - replaces every site's ancestral state with "".

    Site positions, and the site, node, time and parent columns of the
    Mutation table, are written back unchanged: no rounding of positions or
    stacking of mutations happens here.

    This will replace any information already in the metadata or derived state
    columns of the Mutation table.
    '''
    num_mutations = tables.mutations.num_rows
    default_mut = default_slim_metadata("mutation")
    # pack_bytes needs bytes-like rows, so encode each ID before packing.
    dsb, dso = tskit.pack_bytes(
        [str(j).encode("ascii") for j in range(num_mutations)])
    slim_time = tables.metadata["SLiM"]["generation"] - tables.mutations.time
    mms = tables.mutations.metadata_schema
    mutation_metadata = [
        mms.encode_row({
            "mutation_list": [{
                "mutation_type": default_mut["mutation_type"],
                "selection_coeff": default_mut["selection_coeff"],
                "subpopulation": default_mut["subpopulation"],
                "slim_time": st,
                "nucleotide": default_mut["nucleotide"]
            }]
        }) for st in slim_time
    ]
    mdb, mdo = tskit.pack_bytes(mutation_metadata)
    tables.mutations.set_columns(site=tables.mutations.site,
                                 node=tables.mutations.node,
                                 time=tables.mutations.time,
                                 derived_state=dsb,
                                 derived_state_offset=dso,
                                 parent=tables.mutations.parent,
                                 metadata=mdb,
                                 metadata_offset=mdo)
    # Empty ancestral state for every site: no data, all offsets zero.
    tables.sites.set_columns(position=tables.sites.position,
                             ancestral_state=np.array([], dtype='int8'),
                             ancestral_state_offset=np.zeros(
                                 tables.sites.num_rows + 1, dtype='uint32'))
示例#6
0
def _generate_mutation_metadata(pop):
    """
    Build packed metadata for every row of ``pop.tables.mutations``.

    Each row's metadata is the UTF-8 encoded ``str()`` of a dict describing
    the mutation ``pop.mutations[row.key]``.  Returns whatever
    ``tskit.pack_bytes`` returns for that list.
    """
    def describe(row):
        mutation = pop.mutations[row.key]
        # Key order is part of the output, since the dict is stringified.
        fields = {
            's': mutation.s,
            'h': mutation.h,
            'label': mutation.label,
            'esizes': list(mutation.esizes),
            'heffects': list(mutation.heffects),
            'neutral': mutation.neutral
        }
        return str(fields).encode('utf-8')

    return tskit.pack_bytes([describe(row) for row in pop.tables.mutations])
示例#7
0
def _generate_mutation_metadata(self):
    """
    Build packed metadata for every row of ``self.tables.mutations``.

    Each row's metadata is the UTF-8 encoded ``str()`` of a dict describing
    the mutation ``self.mutations[row.key]``; ``age`` is
    ``self.generation - m.g + 1`` and ``key`` is the row's own key.
    Returns whatever ``tskit.pack_bytes`` returns for that list.
    """
    def describe(row):
        mutation = self.mutations[row.key]
        # Key order is part of the output, since the dict is stringified.
        fields = {
            's': mutation.s,
            'h': mutation.h,
            'age': self.generation - mutation.g + 1,
            'label': mutation.label,
            'esizes': list(mutation.esizes),
            'heffects': list(mutation.heffects),
            'neutral': mutation.neutral,
            'key': row.key
        }
        return str(fields).encode('utf-8')

    return tskit.pack_bytes([describe(row) for row in self.tables.mutations])
示例#8
0
def posterior_mean_var(ts, timepoints, posterior, Ne, *, fixed_node_set=None):
    """
    Mean and variance of the age of each node.

    ``timepoints`` is multiplied by ``2 * Ne`` before use, so the returned
    means and variances are on that rescaled time axis.

    Fixed nodes (those in ``fixed_node_set``) are given a mean equal to their
    exact time in the tree sequence and zero variance.  If ``fixed_node_set``
    is None, the sample nodes of ``ts`` are treated as fixed.  An empty
    ``fixed_node_set`` is allowed and fixes no nodes.

    For every node in ``posterior.nonfixed_nodes`` the mean and variance are
    computed from the matching row of ``posterior.grid_data`` and are also
    stored as JSON (keys "mn" and "vr") in that node's metadata.

    :return: A tuple ``(ts, mn_post, vr_post)``: a new tree sequence carrying
        the updated node metadata, and the per-node mean and variance arrays.
        Nodes that are neither fixed nor in the posterior stay NaN.
    """
    # Start from NaN so any node that is never assigned stands out.
    mn_post = np.full(ts.num_nodes, np.nan)
    vr_post = np.full(ts.num_nodes, np.nan)
    tables = ts.dump_tables()

    if fixed_node_set is None:
        fixed_node_set = ts.samples()
    # Force an integer dtype: an empty collection would otherwise become a
    # float array, which cannot be used as an index.
    fixed_nodes = np.array(list(fixed_node_set), dtype=np.int64)
    mn_post[fixed_nodes] = tables.nodes.time[fixed_nodes]
    vr_post[fixed_nodes] = 0

    metadata_array = tskit.unpack_bytes(tables.nodes.metadata,
                                        tables.nodes.metadata_offset)
    timepoints = timepoints * 2 * Ne
    for row, node_id in zip(posterior.grid_data, posterior.nonfixed_nodes):
        row_total = np.sum(row)
        mn_post[node_id] = np.sum(row * timepoints) / row_total
        vr_post[node_id] = np.sum(
            ((mn_post[node_id] - (timepoints))**2) * (row / row_total))
        metadata_array[node_id] = json.dumps({
            "mn": mn_post[node_id],
            "vr": vr_post[node_id]
        }).encode()
    md, md_offset = tskit.pack_bytes(metadata_array)
    tables.nodes.set_columns(
        flags=tables.nodes.flags,
        time=tables.nodes.time,
        population=tables.nodes.population,
        individual=tables.nodes.individual,
        metadata=md,
        metadata_offset=md_offset,
    )
    ts = tables.tree_sequence()
    return ts, mn_post, vr_post
示例#9
0
def combine_chromosome_arms(args):
    """
    Splices two chromosome arms together to form a full chromosome.

    Loads the tree sequences at ``args.p_arm`` (short arm) and ``args.q_arm``
    (long arm), appends the long arm's non-sample nodes, edges, sites and
    mutations to the short arm's tables with node IDs renumbered, and writes
    the combined tree sequence to ``args.output``.

    Both arms must have the same number of samples and identical individual
    metadata; these and several post-conditions are checked with asserts.
    """
    short_arm = tskit.load(args.p_arm)
    long_arm = tskit.load(args.q_arm)
    assert short_arm.num_samples == long_arm.num_samples
    # Remove material before first position and after last position
    short_arm = short_arm.keep_intervals(
        [[
            short_arm.tables.sites.position[0] - 1,
            short_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    long_arm = long_arm.keep_intervals(
        [[
            long_arm.tables.sites.position[0] - 1,
            long_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    short_tables = short_arm.dump_tables()
    long_tables = long_arm.dump_tables()
    assert np.array_equal(short_tables.individuals.metadata,
                          long_tables.individuals.metadata)
    # The combined tables take the long arm's sequence length.
    short_tables.sequence_length = long_arm.get_sequence_length()

    # Node metadata: all of the short arm's rows, then the long arm's rows
    # after the first num_samples entries.
    # NOTE(review): this slice assumes the long arm's samples occupy the
    # first num_samples node rows -- confirm.
    short_metadata = short_tables.nodes.metadata
    short_metadata_offset = short_tables.nodes.metadata_offset
    short_metadata = tskit.unpack_bytes(short_metadata, short_metadata_offset)

    long_metadata = long_tables.nodes.metadata
    long_metadata_offset = long_tables.nodes.metadata_offset
    long_metadata = tskit.unpack_bytes(long_metadata, long_metadata_offset)
    long_metadata = long_metadata[long_arm.num_samples:]
    combined_metadata = np.concatenate([short_metadata, long_metadata])
    metadata, metadata_offset = tskit.pack_bytes(combined_metadata)

    # Append the long arm's non-sample nodes (flags, time, population) ...
    all_nodes_except_samples = ~np.isin(np.arange(long_arm.num_nodes),
                                        long_arm.samples())
    short_tables.nodes.append_columns(
        long_tables.nodes.flags[all_nodes_except_samples],
        long_tables.nodes.time[all_nodes_except_samples],
        long_tables.nodes.population[all_nodes_except_samples],
    )
    # ... then rewrite the node table so the combined metadata is attached.
    short_tables.nodes.set_columns(
        flags=short_tables.nodes.flags,
        time=short_tables.nodes.time,
        population=short_tables.nodes.population,
        metadata=metadata,
        individual=short_tables.nodes.individual,
        metadata_offset=metadata_offset,
    )

    # Renumber the long arm's edge endpoints into the combined node table:
    #   * non-sample IDs are shifted up by short_arm.num_nodes;
    #   * IDs above the last sample ID are then shifted down by num_samples,
    #     since the long arm's sample rows were not appended;
    #   * sample children are mapped to the short arm's sample IDs.
    # NOTE(review): the masks below re-read columns from long_tables /
    # long_arm.tables while the local arrays are being modified; this relies
    # on each column access returning the original, unmodified IDs -- confirm.
    # NOTE(review): sample IDs appearing as *parents* are not passed through
    # long_arm_sample_map, unlike children -- confirm that cannot occur.
    long_edges_parent = long_tables.edges.parent
    long_edges_child = long_tables.edges.child
    # Lookup from long-arm sample ID to the short-arm sample ID in the same
    # position of samples(); entries for non-sample IDs stay 0 and are unused.
    long_arm_sample_map = np.zeros(long_arm.num_nodes).astype(int)
    long_arm_sample_map[long_arm.samples()] = short_arm.samples()
    long_edges_parent[~np.isin(long_edges_parent, long_arm.samples(
    ))] = long_edges_parent[~np.isin(long_edges_parent, long_arm.samples()
                                     )] + (short_arm.num_nodes)
    long_edges_parent[
        long_arm.tables.edges.parent > long_arm.samples()[-1]] = (
            long_edges_parent[
                long_arm.tables.edges.parent > long_arm.samples()[-1]] -
            long_arm.num_samples)
    long_edges_child[~np.isin(long_edges_child, long_arm.samples(
    ))] = long_edges_child[~np.isin(long_edges_child, long_arm.samples())] + (
        short_arm.num_nodes)
    long_edges_child[long_tables.edges.child > long_arm.samples()[-1]] = (
        long_edges_child[long_tables.edges.child > long_arm.samples()[-1]] -
        long_arm.num_samples)
    long_edges_child[np.isin(
        long_tables.edges.child, long_arm.samples())] = long_arm_sample_map[
            long_tables.edges.child[np.isin(long_tables.edges.child,
                                            long_arm.samples())]]
    short_tables.edges.append_columns(
        long_tables.edges.left,
        long_tables.edges.right,
        long_edges_parent,
        long_edges_child,
    )
    # Sites are appended as they are.
    short_tables.sites.append_columns(
        long_tables.sites.position,
        long_tables.sites.ancestral_state,
        long_tables.sites.ancestral_state_offset,
    )
    # Mutation nodes get the same three-step renumbering as edge children.
    long_mutations_node = long_tables.mutations.node
    long_mutations_node[~np.isin(long_mutations_node, long_arm.samples(
    ))] = long_mutations_node[~np.isin(long_mutations_node, long_arm.samples()
                                       )] + (short_arm.num_nodes)
    long_mutations_node[
        long_tables.mutations.node > long_arm.samples()[-1]] = (
            long_mutations_node[
                long_tables.mutations.node > long_arm.samples()[-1]] -
            long_arm.num_samples)
    long_mutations_node[np.isin(long_tables.mutations.node,
                                long_arm.samples())] = long_arm_sample_map[
                                    long_tables.mutations.node[np.isin(
                                        long_tables.mutations.node,
                                        long_arm.samples())]]
    # Site indexes are offset because the long arm's sites were appended
    # after the short arm's.
    short_tables.mutations.append_columns(
        long_tables.mutations.site + short_arm.num_sites,
        long_mutations_node,
        long_tables.mutations.derived_state,
        long_tables.mutations.derived_state_offset,
    )

    short_tables.sort()
    combined = short_tables.tree_sequence()
    # Sanity checks: row counts add up (samples are shared, so counted once).
    assert combined.num_nodes == (short_arm.num_nodes + long_arm.num_nodes -
                                  short_arm.num_samples)
    assert combined.num_sites == (short_arm.num_sites + long_arm.num_sites)
    assert combined.num_edges == (short_arm.num_edges + long_arm.num_edges)
    assert combined.num_mutations == (short_arm.num_mutations +
                                      long_arm.num_mutations)
    assert (combined.num_individuals == short_arm.num_individuals ==
            long_arm.num_individuals)
    # Sorted combined site positions equal the short arm's followed by the
    # long arm's.
    assert np.array_equal(
        np.sort(combined.tables.sites.position),
        np.concatenate(
            [short_arm.tables.sites.position, long_arm.tables.sites.position]),
    )
    # The multiset of mutation-node times is unchanged by the renumbering.
    assert np.array_equal(
        np.sort(combined.tables.nodes.time[combined.tables.mutations.node]),
        np.sort(
            np.concatenate([
                short_arm.tables.nodes.time[short_arm.tables.mutations.node],
                long_arm.tables.nodes.time[long_arm.tables.mutations.node],
            ])),
    )
    assert np.array_equal(combined.tables.individuals.metadata,
                          long_tables.individuals.metadata)
    combined.dump(args.output)
示例#10
0
def _set_nodes_individuals(tables, age):
    '''
    Adds to a TableCollection the Node and Individual table information that
    SLiM needs in order to load a tree sequence.  Any existing Individual
    table is replaced, as are the individual and metadata columns of the
    Node table.

    Everything except ``age`` is filled in with defaults:
    - the 2*j-th and (2*j+1)-st `sample` nodes belong to individual j
    - individual locations are (0, 0, 0)
    - individual age is the ``age`` argument
    - SLiM pedigree IDs are sequential integers starting from 0
    - an individual's subpopulation is the population of its nodes
    - SLiM genome IDs of sample nodes are sequential integers starting from 0
    - sex, SLiM flags, is_null and genome_type come from
      ``default_slim_metadata``
    - individual flags are INDIVIDUAL_ALIVE

    Raises ValueError if the number of sample nodes is odd, or if the two
    nodes of an individual disagree in population or time.

    If you have other situations, like non-alive "remembered" individuals, you
    will need to edit the tables by hand, afterwards.
    '''
    samples = np.flatnonzero(tables.nodes.flags & tskit.NODE_IS_SAMPLE)
    num_samples = len(samples)
    if num_samples % 2 != 0:
        raise ValueError("There must be an even number of sampled nodes,"
                         "since organisms are diploid.")

    num_nodes = tables.nodes.num_rows
    num_individuals = num_samples // 2
    sample_rank = np.arange(num_samples)

    # Consecutive pairs of sample nodes share an individual.
    node_ind = np.full(num_nodes, tskit.NULL, dtype="int32")
    node_ind[samples] = sample_rank // 2
    ind_id = np.arange(num_individuals)
    slim_node_id = np.full(num_nodes, tskit.NULL)
    slim_node_id[samples] = sample_rank

    ind_population = np.full(num_individuals, tskit.NULL)
    ind_population[node_ind[samples]] = tables.nodes.population[samples]

    populations_agree = unique_labels_by_group(node_ind,
                                               tables.nodes.population)
    if not np.all(populations_agree):
        raise ValueError("Individual has nodes from more than one population.")
    times_agree = unique_labels_by_group(node_ind, tables.nodes.time)
    if not np.all(times_agree):
        raise ValueError("Individual has nodes from more than one time.")

    # Three zero coordinates per individual.
    loc_vec = np.zeros(3 * num_individuals, dtype="float64")
    loc_off = 3 * np.arange(num_individuals + 1).astype("uint32")
    ind_flags = np.full(num_individuals, INDIVIDUAL_ALIVE, dtype="uint32")

    default_ind = default_slim_metadata("individual")
    sex = default_ind['sex']
    slim_flag = default_ind['flags']

    encode_individual = tables.individuals.metadata_schema.encode_row
    individual_metadata = []
    for pedigree_id, subpop in zip(ind_id, ind_population):
        individual_metadata.append(encode_individual({
            'pedigree_id': pedigree_id,
            'age': age,
            'subpopulation': int(subpop),
            'sex': sex,
            'flags': slim_flag
        }))
    imb, imo = tskit.pack_bytes(individual_metadata)
    tables.individuals.set_columns(flags=ind_flags,
                                   location=loc_vec,
                                   location_offset=loc_off,
                                   metadata=imb,
                                   metadata_offset=imo)
    assert (tables.individuals.num_rows == num_individuals)

    default_node = default_slim_metadata("node")
    node_is_null = default_node["is_null"]
    node_type = default_node["genome_type"]
    encode_node = tables.nodes.metadata_schema.encode_row
    # Only sample nodes get metadata; every other row stays empty.
    node_metadata = [b''] * num_nodes
    for j in samples:
        node_metadata[j] = encode_node({
            'slim_id': slim_node_id[j],
            'is_null': node_is_null,
            'genome_type': node_type
        })
    nmb, nmo = tskit.pack_bytes(node_metadata)
    tables.nodes.set_columns(flags=tables.nodes.flags,
                             time=tables.nodes.time,
                             population=tables.nodes.population,
                             individual=node_ind,
                             metadata=nmb,
                             metadata_offset=nmo)
示例#11
0
def _set_nodes_individuals(
        tables, node_ind=None, location=(0, 0, 0), age=0, ind_id=None,
        ind_population=None, ind_sex=INDIVIDUAL_TYPE_HERMAPHRODITE,
        ind_flags=INDIVIDUAL_ALIVE, slim_ind_flags=0, node_id=None,
        node_is_null=False, node_type=GENOME_TYPE_AUTOSOME):
    '''
    Adds to a TableCollection the information relevant to individuals required
    for SLiM to load in a tree sequence, that is found in Node and Individual
    tables.  This will replace any existing Individual table, and will replace
    any information already in the individual and metadata columns of the
    Node table.

    Every argument other than ``tables`` may be omitted, given as a single
    value applied to all rows, or given as a sequence with one entry per
    individual (``location``, ``age``, ``ind_id``, ``ind_population``,
    ``ind_sex``, ``ind_flags``, ``slim_ind_flags``) or per node (``node_ind``,
    ``node_id``, ``node_is_null``, ``node_type``).  The defaults are:
    - (node_ind) the 2*j-th and (2*j+1)-st `sample` nodes to individual j
    - (location) individual locations to (0, 0, 0)
    - (age) individual age to 0
    - (ind_id) SLiM individual pedigree IDs to sequential integers starting from 0
    - (ind_population) the population of the individual's nodes
    - (ind_sex) INDIVIDUAL_TYPE_HERMAPHRODITE
    - (node_id) SLiM genome IDs to sequential integers, samples first, starting from 0
    - (node_is_null) genomes to be non-null
    - (node_type) genome type to GENOME_TYPE_AUTOSOME
    - (ind_flags) INDIVIDUAL_ALIVE
    - (slim_ind_flags) 0

    Raises ValueError if the number of sample nodes is odd, if a sample
    node's population differs from its individual's, or if some individual
    does not have exactly two sample nodes.

    If you have other situations, like non-alive "remembered" individuals, you
    will need to edit the tables by hand, afterwards.
    '''
    num_nodes = tables.nodes.num_rows
    # Read each column once rather than on every loop iteration.
    node_flags = tables.nodes.flags
    node_population = tables.nodes.population

    samples = [j for j in range(num_nodes)
               if node_flags[j] & tskit.NODE_IS_SAMPLE]
    if (len(samples) % 2) != 0:
        raise ValueError("There must be an even number of sampled nodes, "
                         "since organisms are diploid.")

    if node_ind is None:
        node_ind = [tskit.NULL for _ in range(num_nodes)]
        # The j-th sample node (node ID k) belongs to individual j // 2, so
        # the result is indexed by node ID, not by position among samples.
        for j, k in enumerate(samples):
            node_ind[k] = j // 2

    # An empty node table yields zero individuals instead of an error.
    num_individuals = max(node_ind, default=tskit.NULL) + 1

    # Broadcast scalar arguments to one entry per individual / per node.
    if isinstance(location, tuple):
        location = [location for _ in range(num_individuals)]
    assert(len(location) == num_individuals)

    if isinstance(age, (int, float)):
        age = [age for _ in range(num_individuals)]
    assert(len(age) == num_individuals)

    if ind_id is None:
        ind_id = list(range(num_individuals))
    assert(len(ind_id) == num_individuals)

    if isinstance(ind_sex, int):
        ind_sex = [ind_sex for _ in range(num_individuals)]
    assert(len(ind_sex) == num_individuals)

    if isinstance(slim_ind_flags, int):
        slim_ind_flags = [slim_ind_flags for _ in range(num_individuals)]
    assert(len(slim_ind_flags) == num_individuals)

    if isinstance(ind_flags, int):
        ind_flags = [ind_flags for _ in range(num_individuals)]
    assert(len(ind_flags) == num_individuals)

    if node_id is None:
        # Number the samples first, then the remaining nodes in ID order.
        node_id = [-1 for _ in range(num_nodes)]
        non_samples = sorted(set(range(num_nodes)) - set(samples))
        for j, k in enumerate(samples + non_samples):
            node_id[k] = j
    assert(len(node_id) == num_nodes)

    if isinstance(node_is_null, bool):
        node_is_null = [node_is_null for _ in range(num_nodes)]
    assert(len(node_is_null) == num_nodes)

    if isinstance(node_type, int):
        node_type = [node_type for _ in range(num_nodes)]
    assert(len(node_type) == num_nodes)

    if ind_population is None:
        # set the individual populations based on what's in the nodes
        ind_population = [tskit.NULL for _ in range(num_individuals)]
        for j, u in enumerate(node_ind):
            if u >= 0:
                ind_population[u] = node_population[j]
    assert(len(ind_population) == num_individuals)

    # check for consistency: every individual has two nodes, and populations agree
    ploidy = [0 for _ in range(num_individuals)]
    for j in samples:
        u = node_ind[j]
        assert(u >= 0)
        ploidy[u] += 1
        if node_population[j] != ind_population[u]:
            raise ValueError("Inconsistent populations: nodes and individuals do not agree.")

    if any(p != 2 for p in ploidy):
        raise ValueError("Not all individuals have two assigned nodes.")

    tables.nodes.set_columns(flags=node_flags, time=tables.nodes.time,
                             population=node_population, individual=node_ind,
                             metadata=tables.nodes.metadata,
                             metadata_offset=tables.nodes.metadata_offset)

    # Locations are coordinates, not bytes: flatten them into one list of
    # floats plus the running offsets that mark each individual's slice.
    loc_vec = [float(x) for loc in location for x in loc]
    loc_off = [0]
    for loc in location:
        loc_off.append(loc_off[-1] + len(loc))
    tables.individuals.set_columns(
            flags=ind_flags, location=loc_vec, location_offset=loc_off)

    individual_metadata = [IndividualMetadata(*x) for x in
                           zip(ind_id, age, ind_population, ind_sex, slim_ind_flags)]
    # Only sample nodes carry SLiM metadata; the rest stay None.
    node_metadata = [None for _ in range(num_nodes)]
    for j in samples:
        node_metadata[j] = NodeMetadata(slim_id=node_id[j], is_null=node_is_null[j],
                                        genome_type=node_type[j])

    annotate_individual_metadata(tables, individual_metadata)
    annotate_node_metadata(tables, node_metadata)