Пример #1
0
 def test_with_mutations(self):
     """
     Checks that a mutated Wright-Fisher simulation survives simplification.

     Builds tables with ``wf_sim`` (no deep history), adds mutations with
     ``tsutil.jukes_cantor`` and simplifies down to the nodes flagged as
     samples. Nodes, edges, sites and mutations must all remain non-empty,
     the sample size must equal ``N`` and every haplotype must have one
     entry per site.
     """
     N = 10
     ngens = 100
     tables = wf_sim(N=N,
                     ngens=ngens,
                     deep_history=False,
                     seed=self.random_seed)
     tables.sort()
     ts = msprime.load_tables(**tables.asdict())
     ts = tsutil.jukes_cantor(ts, 10, 0.1, seed=self.random_seed)
     tables = ts.tables
     self.assertGreater(tables.sites.num_rows, 0)
     self.assertGreater(tables.mutations.num_rows, 0)
     samples = np.where(
         tables.nodes.flags == msprime.NODE_IS_SAMPLE)[0].astype(np.int32)
     tables.sort()
     tables.simplify(samples)
     # Simplification must not have emptied any of the tables.
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertGreater(tables.sites.num_rows, 0)
     self.assertGreater(tables.mutations.num_rows, 0)
     ts = msprime.load_tables(**tables.asdict())
     self.assertEqual(ts.sample_size, N)
     for hap in ts.haplotypes():
         self.assertEqual(len(hap), ts.num_sites)
Пример #2
0
 def test_with_recurrent_mutations(self):
     """
     Checks simplification when all mutations pile up on a single site.

     ``tsutil.jukes_cantor`` is asked for exactly one site, so the site
     table must hold one row both before and after simplifying down to
     the nodes flagged as samples.
     """
     pop_size = 10
     num_generations = 100
     tables = wf_sim(
         N=pop_size,
         ngens=num_generations,
         deep_history=False,
         seed=self.random_seed)
     msprime.sort_tables(**tables.asdict())
     ts = msprime.load_tables(**tables.asdict())
     ts = tsutil.jukes_cantor(ts, 1, 10, seed=self.random_seed)
     tables = ts.tables
     self.assertEqual(tables.sites.num_rows, 1)
     self.assertGreater(tables.mutations.num_rows, 0)
     is_sample = tables.nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.where(is_sample)[0].astype(np.int32)
     # Before simplifying: every haplotype covers the single site.
     for haplotype in ts.haplotypes():
         self.assertEqual(len(haplotype), 1)
     # After simplifying: the tables are modified in place.
     msprime.simplify_tables(
         samples=samples,
         nodes=tables.nodes,
         edges=tables.edges,
         sites=tables.sites,
         mutations=tables.mutations)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 1)
     self.assertGreater(tables.mutations.num_rows, 0)
     ts = msprime.load_tables(**tables.asdict())
     self.assertEqual(ts.sample_size, pop_size)
     for haplotype in ts.haplotypes():
         self.assertEqual(len(haplotype), ts.num_sites)
Пример #3
0
def run_vcf(args):
    """
    Estimates the size of the VCF output for the tree sequence in
    ``args.file``.

    The tree sequence is cut down to its first ``args.num_sites`` sites,
    written as VCF to an in-memory buffer, and the size of that buffer is
    scaled up to the total number of sites in the file.

    NOTE(review): every site and mutation column, including the flattened
    state columns, is sliced with the same ``[:args.num_sites]``. That is
    only consistent if each site has exactly one mutation and all states
    are single characters -- confirm for the intended inputs.

    :param args: Object providing ``file`` (path to load) and ``num_sites``
        (number of sites to keep).
    """
    ts = msprime.load(args.file)
    total_sites = ts.num_sites
    # Subset the tree sequence down to num_sites.
    num_sites = args.num_sites
    t = ts.dump_tables()
    t.sites.set_columns(
        position=t.sites.position[:num_sites],
        ancestral_state=t.sites.ancestral_state[:num_sites],
        ancestral_state_length=t.sites.ancestral_state_length[:num_sites])
    t.mutations.set_columns(
        site=t.mutations.site[:num_sites],
        node=t.mutations.node[:num_sites],
        derived_state=t.mutations.derived_state[:num_sites],
        derived_state_length=t.mutations.derived_state_length[:num_sites])
    ts = msprime.load_tables(**t.asdict())
    # The number of sites actually kept can differ from the request (for
    # example when the file has fewer sites), so scale by what was written.
    written_sites = ts.num_sites
    print("subset down to ", written_sites, "sites")
    megabyte = 1024 * 1024
    terabyte = megabyte * 1024 * 1024
    with io.StringIO() as output:
        ts.write_vcf(output)
        size = output.tell()
    print("Wrote {:.2f} MiB".format(size / megabyte))
    if written_sites == 0:
        # Nothing to extrapolate from; avoid dividing by zero.
        return
    projected = (size / written_sites) * total_sites
    print("Estimate {:.2f} TiB".format(projected / terabyte))
Пример #4
0
def insert_branch_mutations(ts, mutations_per_branch=1):
    """
    Returns a copy of the specified tree sequence with
    ``mutations_per_branch`` mutations on every branch in every tree.

    One site is added per tree, at the left end of the tree's interval, with
    ancestral state '0'. Along each branch the state flips between '0' and
    '1' once per mutation, and every mutation records the previous mutation
    on the path to the root as its parent (-1 when there is none).
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site = len(sites)
        sites.add_row(position=tree.interval[0], ancestral_state='0')
        for root in tree.roots:
            state = {root: 0}
            mutation = {root: -1}
            stack = [root]
            while stack:
                u = stack.pop()
                stack.extend(tree.children(u))
                v = tree.parent(u)
                if v == msprime.NULL_NODE:
                    continue
                state[u] = state[v]
                parent = mutation[v]
                # Inherit the parent's last mutation up front so that the
                # descendants of u can always look it up, even when no
                # mutation is placed on this branch.
                mutation[u] = parent
                for _ in range(mutations_per_branch):
                    state[u] = (state[u] + 1) % 2
                    # The new row's ID is the current table length.
                    mutation[u] = len(mutations)
                    mutations.add_row(
                        site=site, node=u, derived_state=str(state[u]),
                        parent=parent)
                    parent = mutation[u]
    tables = ts.tables
    add_provenance(tables.provenances, "insert_branch_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Пример #5
0
def get_common_mutations_ts(args, tree_sequence, log):
    """
    Returns a tree sequence restricted to common sites.

    A site is kept when the number of leaves below the node of its first
    mutation, divided by the sample size, lies strictly between
    ``args.maf`` and ``1 - args.maf``. Only that first mutation is copied;
    nodes and edges are taken unchanged from ``tree_sequence``.
    """
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()

    # Get the mutations > MAF.
    num_haplotypes = tree_sequence.get_sample_size()
    log.log('Determining sites > MAF cutoff {m}'.format(m=args.maf))

    for tree in tree_sequence.trees():
        for site in tree.sites():
            first_mutation = site.mutations[0]
            frequency = tree.get_num_leaves(first_mutation.node) / num_haplotypes
            if not (args.maf < frequency < 1 - args.maf):
                continue
            new_site_id = kept_sites.add_row(
                position=site.position,
                ancestral_state=site.ancestral_state)
            kept_mutations.add_row(
                site=new_site_id,
                node=first_mutation.node,
                derived_state=first_mutation.derived_state)

    source_tables = tree_sequence.dump_tables()
    return msprime.load_tables(
        nodes=source_tables.nodes,
        edges=source_tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
Пример #6
0
def resolve_polytomies(ts, polytomy_func):
    """
    Rebuilds ``ts`` with the edgesets of each parent passed through
    ``polytomy_func``.

    polytomy_func should take a set of edge records, an edgesets table and a
    nodes object to be added to, in that order. The edge records all share
    one parent and are given as a list of lists, each inner list being one
    contiguous block of records.
    """
    new_edgesets = msprime.EdgesetTable()
    nodes, mutations = get_nodes_and_mutations(ts)
    # Store the edge records per parent, split into contiguous blocks.
    edge_records = [[]]
    for e in ts.edgesets():    # assume records are in order
        if not edge_records[0]:
            # Very first record: nothing to compare against yet.
            edge_records[-1].append(e)
        elif e.parent == edge_records[0][0].parent:
            # NOTE(review): adjacency is tested as e.right == previous.left,
            # as in the original; if edgesets arrive sorted by increasing
            # left coordinate this should probably be
            # e.left == previous.right -- confirm the iteration order.
            if e.right == edge_records[-1][-1].left:
                # contiguous with the last record
                edge_records[-1].append(e)
            else:
                # this is the same parent, but not contiguous
                edge_records.append([e])
        else:
            # Submit records for polytomy resolution - may require new nodes
            # to be created.
            polytomy_func(edge_records, new_edgesets, nodes)
            edge_records = [[e]]
    if edge_records[0]:
        # Flush the records collected for the last parent.
        polytomy_func(edge_records, new_edgesets, nodes)

    return msprime.load_tables(nodes=nodes, edgesets=new_edgesets, mutations=mutations)
Пример #7
0
 def test_pickle(self):
     """
     Round-trips pickled Python objects through the node metadata column,
     both in memory and via a dump to ``self.temp_file``.
     """
     ts = msprime.simulate(10, random_seed=1)
     tables = ts.dump_tables()
     nodes = tables.nodes
     # For each node, we create some Python metadata that can be pickled
     metadata = []
     for j in range(len(nodes)):
         metadata.append({"one": j, "two": 2 * j, "three": list(range(j))})
     pickled = [pickle.dumps(item) for item in metadata]
     encoded, offset = msprime.pack_bytes(pickled)
     nodes.set_columns(
         flags=nodes.flags,
         time=nodes.time,
         population=nodes.population,
         metadata_offset=offset,
         metadata=encoded)
     self.assertTrue(np.array_equal(nodes.metadata_offset, offset))
     self.assertTrue(np.array_equal(nodes.metadata, encoded))
     ts1 = msprime.load_tables(nodes=nodes, edges=tables.edges)
     for index, node in enumerate(ts1.nodes()):
         self.assertEqual(pickle.loads(node.metadata), metadata[index])
     ts1.dump(self.temp_file)
     ts2 = msprime.load(self.temp_file)
     self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
Пример #8
0
def strip_singletons(ts, maf):
    """
    Returns a copy of ``ts`` without singleton and low-frequency sites.

    A site is kept when the node of its (single) mutation has more than one
    sample below it and the number of leaves below that node, divided by the
    sample size, is greater than ``maf``.

    :param ts: The tree sequence to filter; every site must have exactly one
        mutation.
    :param maf: Frequency threshold that a kept site must exceed.
    :return: A new tree sequence with the same nodes and edges and the
        filtered sites and mutations.
    """
    sample_size = ts.get_sample_size()
    kept_sites = msprime.SiteTable()
    kept_mutations = msprime.MutationTable()
    for tree in ts.trees():
        for site in tree.sites():
            assert len(site.mutations) == 1  # Only supports infinite sites muts.
            mut = site.mutations[0]
            frequency = tree.get_num_leaves(mut.node) / sample_size
            is_shared = tree.num_samples(mut.node) > 1
            if is_shared and frequency > maf:
                new_site_id = kept_sites.add_row(
                    position=site.position,
                    ancestral_state=site.ancestral_state)
                kept_mutations.add_row(
                    site=new_site_id,
                    node=mut.node,
                    derived_state=mut.derived_state)
    source_tables = ts.dump_tables()
    return msprime.load_tables(
        nodes=source_tables.nodes,
        edges=source_tables.edges,
        sites=kept_sites,
        mutations=kept_mutations)
Пример #9
0
def add_random_metadata(ts, seed=1, max_length=10):
    """
    Returns a copy of the specified tree sequence with random metadata assigned
    to the nodes, sites, mutations, individuals and populations.

    The global numpy random state is seeded with ``seed``. Each row gets a
    metadata entry whose length is drawn from ``[0, max_length)``.
    """
    tables = ts.dump_tables()
    np.random.seed(seed)

    def random_metadata(num_rows):
        # Draw one length per row, then the packed bytes for all rows.
        length = np.random.randint(0, max_length, num_rows)
        offset = np.cumsum(np.hstack(([0], length)), dtype=np.uint32)
        # Older versions of numpy didn't have a dtype argument for randint, so
        # must use astype instead.
        metadata = np.random.randint(-127, 127, offset[-1]).astype(np.int8)
        return offset, metadata

    # The tables are processed in a fixed order so that the stream of random
    # numbers, and therefore the output, is reproducible for a given seed.
    offset, metadata = random_metadata(ts.num_nodes)
    nodes = tables.nodes
    nodes.set_columns(
        flags=nodes.flags, population=nodes.population, time=nodes.time,
        metadata_offset=offset, metadata=metadata,
        individual=nodes.individual)

    offset, metadata = random_metadata(ts.num_sites)
    sites = tables.sites
    sites.set_columns(
        position=sites.position,
        ancestral_state=sites.ancestral_state,
        ancestral_state_offset=sites.ancestral_state_offset,
        metadata_offset=offset, metadata=metadata)

    offset, metadata = random_metadata(ts.num_mutations)
    mutations = tables.mutations
    mutations.set_columns(
        site=mutations.site,
        node=mutations.node,
        parent=mutations.parent,
        derived_state=mutations.derived_state,
        derived_state_offset=mutations.derived_state_offset,
        metadata_offset=offset, metadata=metadata)

    offset, metadata = random_metadata(ts.num_individuals)
    individuals = tables.individuals
    individuals.set_columns(
        flags=individuals.flags,
        location=individuals.location,
        location_offset=individuals.location_offset,
        metadata_offset=offset, metadata=metadata)

    offset, metadata = random_metadata(ts.num_populations)
    populations = tables.populations
    populations.set_columns(metadata_offset=offset, metadata=metadata)

    add_provenance(tables.provenances, "add_random_metadata")
    ts = msprime.load_tables(**tables.asdict())
    return ts
Пример #10
0
    def simplify(self):
        """
        Runs the simplification and returns ``(ts, node_map)``.

        Edges are handed to ``process_parent_edges`` in runs of consecutive
        edges sharing a parent, then the mutations lying on the segments still
        held in ``self.A`` are recorded and the sites are finalised.
        ``node_map`` has one entry per input node: the output ID from
        ``self.node_id_map``, or -1 for nodes that are not in it.
        """
        if self.ts.num_edges > 0:
            pending = []
            for edge in list(self.ts.edges()):
                # A change of parent closes the current run of edges.
                if pending and edge.parent != pending[0].parent:
                    self.process_parent_edges(pending)
                    pending = []
                pending.append(edge)
            self.process_parent_edges(pending)

        # Record any final mutations over the roots.
        for input_id in list(self.A.keys()):
            segment = self.A[input_id]
            while segment is not None:
                mutation_ids = self.get_mutations(
                    input_id, segment.left, segment.right)
                for mutation_id in mutation_ids:
                    self.record_mutation(segment.node, mutation_id)
                segment = segment.next

        self.finalise_sites()

        node_map = np.zeros(self.ts.num_nodes, np.int32) - 1
        for input_id, output_id in self.node_id_map.items():
            node_map[input_id] = output_id

        simplified = msprime.load_tables(
            nodes=self.node_table,
            edges=self.edge_table,
            sites=self.site_table,
            mutations=self.mutation_table,
            sequence_length=self.sequence_length)
        return simplified, node_map
Пример #11
0
def strip_singletons(ts):
    """
    Returns a copy of the specified tree sequence with singletons removed.

    A site is kept only when the sum of its genotypes is greater than one.
    The first mutation of a kept site is copied without a parent; each
    further mutation at the site is given the first mutation as its parent
    if it had any parent in the input, and no parent otherwise.
    """
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for variant in ts.variants():
        if np.sum(variant.genotypes) <= 1:
            continue
        site = variant.site
        site_id = sites.add_row(
            position=site.position,
            ancestral_state=site.ancestral_state)
        assert len(site.mutations) >= 1
        mutation = site.mutations[0]
        parent_id = mutations.add_row(site=site_id,
                                      node=mutation.node,
                                      derived_state=mutation.derived_state)
        for error in site.mutations[1:]:
            parent = parent_id if error.parent != -1 else -1
            mutations.add_row(site=site_id,
                              node=error.node,
                              derived_state=error.derived_state,
                              parent=parent)
    tables = ts.dump_tables()
    return msprime.load_tables(nodes=tables.nodes,
                               edges=tables.edges,
                               sites=sites,
                               mutations=mutations)
Пример #12
0
def single_childify(ts):
    """
    Builds a new equivalent tree sequence which contains an extra node in the
    middle of all existing branches.

    Every edge of ``ts`` is replaced by two edges over the same interval that
    pass through a new node placed halfway in time between child and parent.
    """
    tables = ts.dump_tables()
    edges = tables.edges
    nodes = tables.nodes
    sites = tables.sites
    mutations = tables.mutations

    # Times of the original nodes; new rows are appended to ``nodes`` below.
    time = nodes.time[:]
    edges.reset()
    for edge in ts.edges():
        # Insert a new node in between the parent and child.
        u = len(nodes)
        t = time[edge.child] + (time[edge.parent] - time[edge.child]) / 2
        nodes.add_row(time=t)
        edges.add_row(
            left=edge.left, right=edge.right, parent=u, child=edge.child)
        edges.add_row(
            left=edge.left, right=edge.right, parent=edge.parent, child=u)
    msprime.sort_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations)
    # Record this function's own name in the provenance.
    add_provenance(tables.provenances, "single_childify")
    new_ts = msprime.load_tables(
        nodes=nodes, edges=edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
    return new_ts
Пример #13
0
def get_ancestral_haplotypes(ts):
    """
    Returns a numpy array of the haplotypes of the ancestors in the
    specified tree sequence.

    The result has one row per node and one column per site. Entries are
    ``inference.UNKNOWN_ALLELE`` except where the node is the parent of an
    edge covering the site, and for the first ``ts.num_samples`` rows, which
    are copied in full.
    """
    nodes = ts.tables.nodes
    flags = nodes.flags[:]
    # Set the flags of every node to 1 (presumably the sample flag -- TODO
    # confirm) so that the genotype matrix has a row for each node.
    flags[:] = 1
    nodes.set_columns(time=nodes.time, flags=flags)

    sites = [site.position for site in ts.sites()]
    tsp = msprime.load_tables(nodes=nodes,
                              edges=ts.tables.edges,
                              sites=ts.tables.sites,
                              mutations=ts.tables.mutations)
    B = tsp.genotype_matrix().T

    A = np.zeros((ts.num_nodes, ts.num_sites), dtype=np.uint8)
    A[:] = inference.UNKNOWN_ALLELE
    for edge in ts.edges():
        # Sites with edge.left <= position < edge.right. Site positions are
        # sorted and distinct, so bisect_left on the right end excludes a
        # site sitting exactly at edge.right and also copes with an empty
        # site list.
        start = bisect.bisect_left(sites, edge.left)
        end = bisect.bisect_left(sites, edge.right)
        A[edge.parent, start:end] = B[edge.parent, start:end]
    A[:ts.num_samples] = B[:ts.num_samples]
    return A
Пример #14
0
def insert_multichar_mutations(ts, seed=1, max_len=10):
    """
    Returns a copy of the specified tree sequence with a multiple character
    mutation on a randomly chosen non-root node in every tree.

    Each state is one random letter repeated between 0 and ``max_len``
    times; the derived state is redrawn until it differs from the ancestral
    state.
    """
    rng = random.Random(seed)
    letters = ["A", "C", "T", "G"]

    def random_state():
        # The letter is drawn before the repeat count.
        letter = rng.choice(letters)
        return letter * rng.randint(0, max_len)

    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    for tree in ts.trees():
        site_id = len(sites)
        ancestral_state = random_state()
        sites.add_row(position=tree.interval[0], ancestral_state=ancestral_state)
        candidates = list(tree.nodes())
        candidates.remove(tree.root)
        chosen_node = rng.choice(candidates)
        derived_state = random_state()
        while derived_state == ancestral_state:
            derived_state = random_state()
        mutations.add_row(
            site=site_id, node=chosen_node, derived_state=derived_state)
    tables = ts.tables
    add_provenance(tables.provenances, "insert_multichar_mutations")
    return msprime.load_tables(
        nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations,
        provenances=tables.provenances)
Пример #15
0
def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None):
    """
    Returns a copy of the specified tree sequence with Jukes-Cantor mutations
    applied at the specified rate at the specified number of sites. Site
    positions are chosen uniformly along the sequence.

    The module-level ``random`` generator is seeded with ``seed``; the
    mutations for each site are produced by ``generate_site_mutations``.
    """
    random.seed(seed)
    positions = sorted(
        ts.sequence_length * random.random() for _ in range(num_sites))
    sites = msprime.SiteTable(num_sites)
    mutations = msprime.MutationTable(num_sites)

    # Positions are sorted, so the trees only ever need to move forwards.
    tree_iter = ts.trees()
    current_tree = next(tree_iter)
    for position in positions:
        while position >= current_tree.interval[1]:
            current_tree = next(tree_iter)
        generate_site_mutations(
            current_tree, position, mu, sites, mutations,
            multiple_per_node=multiple_per_node)

    tables = ts.dump_tables()
    add_provenance(tables.provenances, "jukes_cantor")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=sites,
        mutations=mutations,
        provenances=tables.provenances)
Пример #16
0
 def finalise(self, simplify=True, stabilise_node_ordering=False):
     """
     Returns the final tree sequence, optionally simplified.

     :param simplify: When true, simplify with respect to
         ``self.sample_ids`` (keeping sites without mutations).
     :param stabilise_node_ordering: When true, spread out equal node
         times before simplifying; see the comment in the body.
     """
     logger.info("Finalising tree sequence")
     ts = self.get_tree_sequence(all_sites=True)
     if not simplify:
         return ts

     logger.info("Running simplify on {} nodes and {} edges".format(
         ts.num_nodes, ts.num_edges))
     if stabilise_node_ordering:
         # Ensure all the node times are distinct so that they will have
         # stable IDs after simplifying. This could possibly also be done
         # by reversing the IDs within a time slice. This is used for comparing
         # tree sequences produced by perfect inference.
         tables = ts.tables
         time = tables.nodes.time
         for t in range(1, int(time[0])):
             same_time = np.where(time == t)[0]
             count = same_time.shape[0]
             # Offsets in [0, 1), largest first.
             time[same_time] += np.arange(count)[::-1] / count
         tables.nodes.set_columns(flags=tables.nodes.flags, time=time)
         msprime.sort_tables(**tables.asdict())
         ts = msprime.load_tables(**tables.asdict())
     ts = ts.simplify(
         samples=self.sample_ids, filter_zero_mutation_sites=False)
     logger.info(
         "Finished simplify; now have {} nodes and {} edges".format(
             ts.num_nodes, ts.num_edges))
     return ts
Пример #17
0
 def test_one_generation_no_deep_history(self):
     """
     Checks a single generation of ``wf_sim`` without deep history: the
     tables have ``2 * N`` nodes and some edges but no sites, mutations or
     migrations, and after simplifying every tree's roots partition the
     samples.
     """
     N = 20
     tables = wf_sim(
         N=N, ngens=1, deep_history=False, seed=self.random_seed)
     self.assertEqual(tables.nodes.num_rows, 2 * N)
     self.assertGreater(tables.edges.num_rows, 0)
     self.assertEqual(tables.sites.num_rows, 0)
     self.assertEqual(tables.mutations.num_rows, 0)
     self.assertEqual(tables.migrations.num_rows, 0)
     nodes = tables.nodes
     edges = tables.edges
     is_sample = nodes.flags == msprime.NODE_IS_SAMPLE
     samples = np.where(is_sample)[0].astype(np.int32)
     msprime.sort_tables(nodes=nodes, edges=edges)
     msprime.simplify_tables(samples=samples, nodes=nodes, edges=edges)
     self.assertGreater(tables.nodes.num_rows, 0)
     self.assertGreater(tables.edges.num_rows, 0)
     ts = msprime.load_tables(nodes=nodes, edges=edges)
     for tree in ts.trees():
         seen = set()
         for root in tree.roots:
             below_root = set(tree.samples(root))
             # No sample may sit below two different roots.
             self.assertEqual(len(below_root & seen), 0)
             seen |= below_root
         self.assertEqual(seen, set(ts.samples()))
Пример #18
0
def wright_fisher(N, delta, L, T):
    """
    Direct implementation of Algorithm W.

    Each time step, every one of the ``N`` slots is replaced with
    probability ``delta`` by a new node that inherits ``[0, x)`` from one
    randomly chosen member of the previous generation and ``[x, L)`` from
    another, with ``x`` drawn uniformly from ``[0, L]``. Nodes still
    occupying a slot at the end are given flag 1.
    """
    edges = msprime.EdgeTable()
    tau = [T] * N
    P = list(range(N))
    t = T
    n = N
    while t > 0:
        t -= 1
        # Parents are always drawn from the previous generation, P.
        Pp = list(P)
        for j in range(N):
            if random.random() >= delta:
                continue
            Pp[j] = n
            tau.append(t)
            a = random.randint(0, N - 1)
            b = random.randint(0, N - 1)
            x = random.uniform(0, L)
            edges.add_row(0, x, P[a], n)
            edges.add_row(x, L, P[b], n)
            n += 1
        P = Pp
    nodes = msprime.NodeTable()
    alive = set(P)
    for node_id in range(n):
        nodes.add_row(time=tau[node_id], flags=int(node_id in alive))
    msprime.sort_tables(nodes=nodes, edges=edges)
    return msprime.load_tables(nodes=nodes, edges=edges)
Пример #19
0
def _load_legacy_hdf5_v3(root, remove_duplicate_positions):
    """
    Converts the contents of a version 3 legacy HDF5 file into a tree
    sequence.

    Each legacy record (a parent node with ``num_children`` children over
    the interval between two breakpoints) is expanded into one edge per
    child.

    :param root: The HDF5 root group. Must contain ``trees``; ``mutations``
        and ``provenance`` are used when present.
    :param remove_duplicate_positions: Passed through to
        ``_convert_hdf5_mutations``.
    :return: The tree sequence built by ``msprime.load_tables``.
    """
    # get the trees group for the records and samples
    trees_group = root["trees"]
    nodes_group = trees_group["nodes"]
    time = np.array(nodes_group["time"])

    breakpoints = np.array(trees_group["breakpoints"])
    records_group = trees_group["records"]
    left_indexes = np.array(records_group["left"])
    right_indexes = np.array(records_group["right"])
    record_node = np.array(records_group["node"], dtype=np.int32)
    num_nodes = time.shape[0]
    # The smallest parent ID in the records is used as the sample count:
    # all nodes below it are flagged as samples.
    sample_size = np.min(record_node)
    flags = np.zeros(num_nodes, dtype=np.uint32)
    flags[:sample_size] = msprime.NODE_IS_SAMPLE

    children_length = np.array(records_group["num_children"], dtype=np.uint32)
    # Repeat each record's interval and parent once per child, so that row k
    # of left/right/parent lines up with the k-th entry of the flattened
    # children array.
    repeats = children_length.astype(np.intp)
    left = np.repeat(breakpoints[left_indexes], repeats).astype(np.float64)
    right = np.repeat(breakpoints[right_indexes], repeats).astype(np.float64)
    parent = np.repeat(record_node, repeats).astype(np.int32)

    nodes = msprime.NodeTable()
    nodes.set_columns(flags=flags,
                      time=nodes_group["time"],
                      population=nodes_group["population"])
    edges = msprime.EdgeTable()
    edges.set_columns(left=left,
                      right=right,
                      parent=parent,
                      child=records_group["children"])
    sites = msprime.SiteTable()
    mutations = msprime.MutationTable()
    if "mutations" in root:
        _convert_hdf5_mutations(root["mutations"], sites, mutations,
                                remove_duplicate_positions)
    # Legacy provenance records carry no timestamp; use the minimum datetime.
    old_timestamp = datetime.datetime.min.isoformat()
    provenances = msprime.ProvenanceTable()
    if "provenance" in root:
        for record in root["provenance"]:
            provenances.add_row(timestamp=old_timestamp, record=record)
    provenances.add_row(_get_upgrade_provenance(root))
    msprime.sort_tables(nodes=nodes,
                        edges=edges,
                        sites=sites,
                        mutations=mutations)
    return msprime.load_tables(nodes=nodes,
                               edges=edges,
                               sites=sites,
                               mutations=mutations,
                               provenances=provenances)
Пример #20
0
def make_tree_add_mutations(nodes, edges, mutrate):
    """
    Generates mutations at rate ``mutrate`` over the given nodes and edges
    using a generator seeded with 42.

    :return: A tuple of the loaded tree sequence and the site table.
    """
    rng = msprime.RandomGenerator(42)
    mutation_table = msprime.MutationTable()
    site_table = msprime.SiteTable()
    generator = msprime.MutationGenerator(rng, mutrate)
    generator.generate(nodes, edges, site_table, mutation_table)
    tree_sequence = msprime.load_tables(
        nodes=nodes,
        edgesets=edges,
        sites=site_table,
        mutations=mutation_table)
    return (tree_sequence, site_table)
Пример #21
0
def provenance_timestamp_only_example():
    """
    Returns a simulated tree sequence whose only provenance row has a
    timestamp but an empty record.
    """
    simulated = msprime.simulate(10, random_seed=1)
    source_tables = simulated.dump_tables()
    provenance_table = msprime.ProvenanceTable()
    provenance_table.add_row(timestamp="12345", record="")
    return msprime.load_tables(
        nodes=source_tables.nodes,
        edges=source_tables.edges,
        provenances=provenance_table)
Пример #22
0
 def get_multiroot_example(self):
     """
     Returns a simulated tree sequence from which the second half of the
     edge table has been dropped.
     """
     ts = msprime.simulate(
         sample_size=50, recombination_rate=5, random_seed=self.random_seed)
     tables = ts.dump_tables()
     edges = tables.edges
     # Keep only the first half of the edge rows.
     half = len(edges) // 2
     edges.set_columns(
         left=edges.left[:half],
         right=edges.right[:half],
         parent=edges.parent[:half],
         child=edges.child[:half])
     return msprime.load_tables(nodes=tables.nodes, edges=edges)
Пример #23
0
def mutation_metadata_example():
    """
    Returns a simulated tree sequence with one site at position 0 and a
    mutation carrying metadata on each of the nodes 0 to 9.
    """
    simulated = msprime.simulate(10, length=10, random_seed=2)
    tables = simulated.dump_tables()
    tables.sites.add_row(0, ancestral_state="a")
    for node_id in range(10):
        tables.mutations.add_row(
            site=0, node=node_id, derived_state="t", metadata=b"1234")
    return msprime.load_tables(**tables.asdict())
Пример #24
0
def general_mutation_example():
    """
    Returns a simulated tree sequence with two sites (positions 0 and 1,
    each with metadata) and one mutation on node 0 at each site.
    """
    simulated = msprime.simulate(
        10, recombination_rate=1, length=10, random_seed=2)
    tables = simulated.dump_tables()
    site_rows = ((0, "A", b"{}"), (1, "C", b"{'id':1}"))
    for position, ancestral_state, metadata in site_rows:
        tables.sites.add_row(
            position=position,
            ancestral_state=ancestral_state,
            metadata=metadata)
    for site_id, derived_state in ((0, "T"), (1, "G")):
        tables.mutations.add_row(
            site=site_id, node=0, derived_state=derived_state)
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations)
Пример #25
0
 def store_output(self):
     """
     Returns the resulting tree sequence, dumping it to
     ``self.output_path`` first when a path is set.

     Without any ancestors an empty tree sequence of length 1 is returned.
     """
     has_ancestors = self.num_ancestors > 0
     if has_ancestors:
         ts = self.get_tree_sequence(rescale_positions=False)
     else:
         # Allocate an empty tree sequence.
         empty_nodes = msprime.NodeTable()
         empty_edges = msprime.EdgeTable()
         ts = msprime.load_tables(
             nodes=empty_nodes, edges=empty_edges, sequence_length=1)
     if self.output_path is not None:
         ts.dump(self.output_path)
     return ts
Пример #26
0
def node_metadata_example():
    """
    Returns a simulated tree sequence in which node ``u`` carries the
    metadata string ``n_<u>``.
    """
    ts = msprime.simulate(
        sample_size=100, recombination_rate=0.1, length=10, random_seed=1)
    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    ts.dump_tables(nodes=nodes, edges=edges)
    labelled_nodes = msprime.NodeTable()
    labels = []
    for u in range(ts.num_nodes):
        labels.append("n_{}".format(u))
    packed, offset = msprime.pack_strings(labels)
    labelled_nodes.set_columns(
        flags=nodes.flags,
        time=nodes.time,
        metadata=packed,
        metadata_offset=offset)
    return msprime.load_tables(nodes=labelled_nodes, edges=edges)
Пример #27
0
def decapitate(ts, num_edges):
    """
    Returns a copy of the specified tree sequence in which only the first
    ``num_edges`` rows of the edge table have been retained.
    """
    tables = ts.dump_tables()
    kept_left = tables.edges.left[:num_edges]
    kept_right = tables.edges.right[:num_edges]
    kept_parent = tables.edges.parent[:num_edges]
    kept_child = tables.edges.child[:num_edges]
    tables.edges.set_columns(
        left=kept_left, right=kept_right,
        parent=kept_parent, child=kept_child)
    add_provenance(tables.provenances, "decapitate")
    return msprime.load_tables(
        nodes=tables.nodes,
        edges=tables.edges,
        sites=tables.sites,
        mutations=tables.mutations,
        provenances=tables.provenances,
        sequence_length=ts.sequence_length)
Пример #28
0
 def test_nodes(self):
     """
     Stores a pickled ``ExampleMetadata`` object as node metadata and
     checks that it is returned unchanged and unpickles to equal fields.
     """
     original = ExampleMetadata(one="node1", two="node2")
     pickled = pickle.dumps(original)
     node_table = msprime.NodeTable()
     edge_table = msprime.EdgeTable()
     node_table.add_row(time=0.125, metadata=pickled)
     ts = msprime.load_tables(
         nodes=node_table, edges=edge_table, sequence_length=1)
     node = ts.node(0)
     self.assertEqual(node.time, 0.125)
     self.assertEqual(node.metadata, pickled)
     restored = pickle.loads(node.metadata)
     self.assertEqual(restored.one, original.one)
     self.assertEqual(restored.two, original.two)
Пример #29
0
def insert_redundant_breakpoints(ts):
    """
    Builds a new tree sequence containing redundant breakpoints.

    Every edge is split at the midpoint of its interval into two edges with
    the same parent and child.
    """
    tables = ts.dump_tables()
    tables.edges.reset()
    for edge in ts.edges():
        midpoint = edge.left + (edge.right - edge.left) / 2
        halves = ((edge.left, midpoint), (midpoint, edge.right))
        for left, right in halves:
            tables.edges.add_row(
                left=left, right=right, child=edge.child, parent=edge.parent)
    add_provenance(tables.provenances, "insert_redundant_breakpoints")
    new_ts = msprime.load_tables(**tables.asdict())
    assert new_ts.num_edges == 2 * ts.num_edges
    return new_ts
Пример #30
0
 def get_multiroot_tree(self):
     """
     Returns the first tree with more than one root from a simulated tree
     sequence whose last quarter of edge rows has been removed.
     """
     simulated = msprime.simulate(15, random_seed=1)
     # Take off the top quarter of edges
     tables = simulated.dump_tables()
     edges = tables.edges
     num_kept = len(edges) - len(edges) // 4
     edges.set_columns(
         left=edges.left[:num_kept],
         right=edges.right[:num_kept],
         parent=edges.parent[:num_kept],
         child=edges.child[:num_kept])
     truncated = msprime.load_tables(nodes=tables.nodes, edges=edges)
     for tree in truncated.trees():
         if tree.num_roots > 1:
             return tree
     # Reaching this point means no multi-root tree was produced.
     assert False
    def test4(self):
        """
        Places a mutation on node 3 at position 0.4, simplifies with
        respect to samples 1 and 2 and checks that the genotype matrix
        sums to zero.
        """
        self.n.set_columns(
            time=[1, 0, 0, 2], flags=[msprime.NODE_IS_SAMPLE] * 4)

        edge_rows = (
            (0, 1, 0, 0.4),
            (0, 1, 0.6, 1.0),
            (0, 2, 0, 1),
            (3, 0, 0, 0.4),
        )
        for parent, child, left, right in edge_rows:
            self.e.add_row(parent=parent, child=child, left=left, right=right)

        self.s.add_row(position=0.4, ancestral_state='0')
        self.m.add_row(site=0, node=3, derived_state='1')

        msprime.sort_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
        id_map = msprime.simplify_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m,
            samples=[1, 2])
        ts = msprime.load_tables(
            nodes=self.n, edges=self.e, sites=self.s, mutations=self.m)
        genotypes = ts.genotype_matrix()
        self.assertEqual(genotypes[0:].sum(), 0)
Пример #32
0
def simplify(S, Ni, Ei, L):
    """
    This is an implementation of the simplify algorithm described in Appendix A
    of the paper.

    The input edges are consumed in table order, one run of consecutive rows
    sharing the same parent at a time, so that edges are collected for merging
    in proper time order. NOTE(review): this presumably requires ``Ei`` to be
    sorted so that all rows of a parent are adjacent and children are
    processed before their parents -- confirm against the callers.

    :param S: The IDs (indexes into ``Ni``) of the sample nodes. Each one
        becomes a node in the output with ``flags=1`` and the same time.
    :param Ni: The input node table. Only its length and its ``time`` column
        are read.
    :param Ei: The input edge table.
    :param L: The right-hand end of the interval ``[0, L)`` that is initially
        assigned to every sample.
    :return: The result of ``msprime.load_tables`` applied to the output node
        and edge tables. If the algorithm produces no edges, the output edge
        table is simply left empty.
    """
    No = msprime.NodeTable()
    Eo = msprime.EdgeTable()
    # A[u] holds the ancestral segments currently attributed to input node u.
    A = [[] for _ in range(len(Ni))]
    # Heap of segments awaiting merging for the parent being processed.
    Q = []

    for u in S:
        v = No.add_row(time=Ni.time[u], flags=1)
        assert(v == len(No)-1)
        A[u] = [Segment(0, L, v)]

    ei = 0
    while ei < len(Ei):
        u = Ei.parent[ei]
        # Gather every overlap between this parent's edges and the segments
        # already attributed to the edges' children.
        while ei < len(Ei) and Ei.parent[ei] == u:
            e = Ei[ei]
            for x in A[e.child]:
                if x.right > e.left and e.right > x.left:
                    y = Segment(max(x.left, e.left), min(
                        x.right, e.right), x.node)
                    heapq.heappush(Q, y)
            ei += 1

        # v stays at -1 until u actually needs an output node.
        v = -1
        while len(Q) > 0:
            left = Q[0].left
            right = L
            X = []
            # Pop every segment that starts at the same left coordinate.
            while len(Q) > 0 and Q[0].left == left:
                x = heapq.heappop(Q)
                X.append(x)
                right = min(right, x.right)
            if len(Q) > 0:
                right = min(right, Q[0].left)

            if len(X) == 1:
                # A single segment: pass it through unchanged, cutting it
                # where the next queued segment begins.
                x = X[0]
                alpha = x
                if len(Q) > 0 and Q[0].left < x.right:
                    alpha = Segment(x.left, Q[0].left, x.node)
                    x.left = Q[0].left
                    heapq.heappush(Q, x)
            else:
                # Several segments overlap on [left, right): record an edge
                # from the output node for u to each of them.
                if v == -1:
                    v = No.add_row(time=Ni.time[u])
                alpha = Segment(left, right, v)
                for x in X:
                    Eo.add_row(left, right, v, x.node)
                    if x.right > right:
                        # Requeue the part of x beyond the merged interval.
                        x.left = right
                        heapq.heappush(Q, x)

            A[u].append(alpha)

    # Sort the output edges and compact them as much as possible into
    # the output table. We skip this for the algorithm listing as it's pretty mundane.
    # TODO replace this with a calls to squash_edges() and sort_tables()
    E = list(Eo)
    Eo.clear()
    E.sort(key=lambda e: (e.parent, e.child, e.right, e.left))
    # With no output edges there is nothing to squash, and indexing E below
    # would raise IndexError.
    if E:
        start = 0
        for j in range(1, len(E)):
            condition = (
                E[j - 1].right != E[j].left or
                E[j - 1].parent != E[j].parent or
                E[j - 1].child != E[j].child)
            if condition:
                Eo.add_row(E[start].left, E[j - 1].right,
                           E[j - 1].parent, E[j - 1].child)
                start = j
        # Flush the final run of adjacent edges.
        Eo.add_row(E[start].left, E[-1].right, E[-1].parent, E[-1].child)

    return msprime.load_tables(nodes=No, edges=Eo)
Пример #33
0
def simplify(S, Ni, Ei, L):
    """
    This is an implementation of the simplify algorithm described in Appendix A
    of the paper.

    Input nodes are visited in increasing ID order and, for each one, the
    edges having that node as parent are processed in the order in which
    they appear in ``Ei``. NOTE(review): this presumably requires children
    to have smaller IDs than their parents -- confirm against the callers.

    :param S: The IDs (indexes into ``Ni``) of the sample nodes. Each one
        becomes a node in the output with ``flags=1`` and the same time.
    :param Ni: The input node table. Only its length and its ``time`` column
        are read.
    :param Ei: The input edge table.
    :param L: The right-hand end of the interval ``[0, L)`` that is initially
        assigned to every sample.
    :return: The result of ``msprime.load_tables`` applied to the output node
        and edge tables. If the algorithm produces no edges, the output edge
        table is simply left empty.
    """
    No = msprime.NodeTable()
    Eo = msprime.EdgeTable()
    # A[u] holds the ancestral segments currently attributed to input node u.
    A = [[] for _ in range(len(Ni))]
    # Heap of segments awaiting merging for the parent being processed.
    Q = []

    for u in S:
        v = No.add_row(time=Ni.time[u], flags=1)
        assert(v == len(No)-1)
        A[u] = [Segment(0, L, v)]

    # Group the edges by parent once, keeping table order within each group,
    # instead of rescanning the whole edge table for every node.
    edges_by_parent = {}
    for e in Ei:
        edges_by_parent.setdefault(e.parent, []).append(e)

    for u in range(len(Ni)):
        # Gather every overlap between this parent's edges and the segments
        # already attributed to the edges' children.
        for e in edges_by_parent.get(u, ()):
            for x in A[e.child]:
                if x.right > e.left and e.right > x.left:
                    y = Segment(max(x.left, e.left), min(x.right, e.right), x.node)
                    heapq.heappush(Q, y)

        # v stays at -1 until u actually needs an output node.
        v = -1
        while len(Q) > 0:
            left = Q[0].left
            right = L
            X = []
            # Pop every segment that starts at the same left coordinate.
            while len(Q) > 0 and Q[0].left == left:
                x = heapq.heappop(Q)
                X.append(x)
                right = min(right, x.right)
            if len(Q) > 0:
                right = min(right, Q[0].left)

            if len(X) == 1:
                # A single segment: pass it through unchanged, cutting it
                # where the next queued segment begins.
                x = X[0]
                alpha = x
                if len(Q) > 0 and Q[0].left < x.right:
                    alpha = Segment(x.left, Q[0].left, x.node)
                    x.left = Q[0].left
                    heapq.heappush(Q, x)
            else:
                # Several segments overlap on [left, right): record an edge
                # from the output node for u to each of them.
                if v == -1:
                    v = No.add_row(time=Ni.time[u])
                alpha = Segment(left, right, v)
                for x in X:
                    Eo.add_row(left, right, v, x.node)
                    if x.right > right:
                        # Requeue the part of x beyond the merged interval.
                        x.left = right
                        heapq.heappush(Q, x)

            A[u].append(alpha)

    # Sort the output edges and compact them as much as possible into
    # the output table. We skip this for the algorithm listing as it's pretty mundane.
    # TODO replace this with a calls to squash_edges() and sort_tables()
    E = list(Eo)
    Eo.clear()
    E.sort(key=lambda e: (e.parent, e.child, e.right, e.left))
    # With no output edges there is nothing to squash, and indexing E below
    # would raise IndexError.
    if E:
        start = 0
        for j in range(1, len(E)):
            condition = (
                E[j - 1].right != E[j].left or
                E[j - 1].parent != E[j].parent or
                E[j - 1].child != E[j].child)
            if condition:
                Eo.add_row(E[start].left, E[j - 1].right, E[j - 1].parent, E[j - 1].child)
                start = j
        # Flush the final run of adjacent edges.
        Eo.add_row(E[start].left, E[-1].right, E[-1].parent, E[-1].child)

    return msprime.load_tables(nodes=No, edges=Eo)