예제 #1
0
    def test_example(self):
        """Attach struct-codec schemas and packed metadata to every table."""
        tables = get_example_tables()
        tables.metadata_schema = tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {"top-level": {"type": "string", "binaryFormat": "50p"}},
            }
        )
        tables.metadata = {"top-level": "top-level-metadata"}
        table_names = (
            "individuals",
            "nodes",
            "edges",
            "migrations",
            "sites",
            "mutations",
            "populations",
        )
        for name in table_names:
            table = getattr(tables, name)
            packed = [f"{name}-{row}".encode() for row in range(table.num_rows)]
            table.packset_metadata(packed)
            table.metadata_schema = tskit.MetadataSchema(
                {
                    "codec": "struct",
                    "type": "object",
                    "properties": {name: {"type": "string", "binaryFormat": "50p"}},
                }
            )

        self.verify(tables)
예제 #2
0
    def test_example(self, tables):
        """Set a struct top-level schema plus per-table schemas/metadata."""
        tables.metadata_schema = tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {
                    "top-level": {"type": "string", "binaryFormat": "50p"}
                },
            }
        )
        tables.metadata = {"top-level": "top-level-metadata"}
        for name in tskit.TABLE_NAMES:
            table = getattr(tables, name)
            # Provenance has no metadata_schema attribute; skip it.
            if not hasattr(table, "metadata_schema"):
                continue
            table.packset_metadata(
                [f"{name}-{row}".encode() for row in range(table.num_rows)]
            )
            table.metadata_schema = tskit.MetadataSchema(
                {
                    "codec": "struct",
                    "type": "object",
                    "properties": {name: {"type": "string", "binaryFormat": "50p"}},
                }
            )

        self.verify(tables)
예제 #3
0
 def test_all_fields(self):
     """Build a tree sequence populated in every table and verify it."""
     demography = msprime.Demography()
     for label, size in [("A", 10_000), ("B", 5_000), ("C", 1_000)]:
         demography.add_population(name=label, initial_size=size)
     demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
     ts = msprime.sim_ancestry(
         samples={"A": 1, "B": 1},
         demography=demography,
         random_seed=42,
         record_migrations=True,
     )
     ts = msprime.sim_mutations(ts, rate=1, random_seed=42)
     tables = ts.dump_tables()
     for name, table in tables.table_name_map.items():
         # Provenance has no metadata; edges are left without it here.
         if name in ("provenances", "edges"):
             continue
         table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
         packed, offsets = tskit.pack_strings(
             [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
         )
         columns = table.asdict()
         columns["metadata"] = packed
         columns["metadata_offset"] = offsets
         table.set_columns(**columns)
     tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
     tables.metadata = "Test metadata"
     self.verify(tables.tree_sequence())
예제 #4
0
def full_ts():
    """
    Return a tree sequence that has data in all fields.

    Duplicated from tskit's conftest.py as other test suites using this file
    will not have that fixture defined.
    """
    # Fix: the original had two stacked triple-quoted strings; only the first
    # was the docstring and the second was a dead expression statement. They
    # are merged here (and the "duplcated" typo corrected).
    n = 10
    t = 1
    population_configurations = [
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(0),
    ]
    demographic_events = [
        msprime.MassMigration(time=t, source=0, destination=2),
        msprime.MassMigration(time=t, source=1, destination=2),
    ]
    ts = msprime.simulate(
        population_configurations=population_configurations,
        demographic_events=demographic_events,
        random_seed=1,
        mutation_rate=1,
        record_migrations=True,
    )
    tables = ts.dump_tables()
    # TODO replace this with properly linked up individuals using sim_ancestry
    # once 1.0 is released.
    for j in range(n):
        tables.individuals.add_row(flags=j,
                                   location=(j, j),
                                   parents=(j - 1, j - 1))

    # Give every metadata-bearing table a JSON schema and per-row metadata.
    for name, table in tables.name_map.items():
        if name != "provenances":
            table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
            metadatas = [f"n_{name}_{u}" for u in range(len(table))]
            metadata, metadata_offset = tskit.pack_strings(metadatas)
            table.set_columns(
                **{
                    **table.asdict(),
                    "metadata": metadata,
                    "metadata_offset": metadata_offset,
                })
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"

    # Add some more provenance so we have enough rows for the offset deletion test.
    for j in range(10):
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    return tables.tree_sequence()
예제 #5
0
def full_ts():
    """
    A tree sequence with data in all fields - duplicated from tskit's conftest.py
    as other test suites using this file will not have that fixture defined.
    """
    demography = msprime.Demography()
    for label in ("A", "B", "C"):
        demography.add_population(initial_size=100, name=label)
    demography.add_population_split(time=10, ancestral="C", derived=["A", "B"])

    ts = msprime.sim_ancestry(
        {"A": 5, "B": 5},
        demography=demography,
        random_seed=1,
        sequence_length=10,
        record_migrations=True,
    )
    assert ts.num_migrations > 0
    assert ts.num_individuals > 0
    ts = msprime.sim_mutations(ts, rate=0.1, random_seed=2)
    assert ts.num_mutations > 0
    tables = ts.dump_tables()
    tables.individuals.clear()

    # Re-add individuals with location and parents columns populated.
    for ind in ts.individuals():
        tables.individuals.add_row(flags=0, location=[ind.id, ind.id], parents=[-1, -1])

    for name, table in tables.table_name_map.items():
        if name == "provenances":
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        packed, offsets = tskit.pack_strings(
            [f"n_{name}_{u}" for u in range(len(table))]
        )
        table.set_columns(
            **dict(table.asdict(), metadata=packed, metadata_offset=offsets)
        )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = {"A": "Test metadata"}

    tables.reference_sequence.data = "A" * int(tables.sequence_length)
    tables.reference_sequence.url = "https://example.com/sequence"
    tables.reference_sequence.metadata_schema = tskit.MetadataSchema.permissive_json()
    tables.reference_sequence.metadata = {"A": "Test metadata"}

    # Add some more provenance so we have enough rows for the offset deletion test.
    for j in range(10):
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    return tables.tree_sequence()
예제 #6
0
def ts_fixture():
    """
    A tree sequence with data in all fields
    """
    demography = msprime.Demography()
    for label, size in [
        ("A", 10_000),
        ("B", 5_000),
        ("C", 1_000),
        ("D", 500),
        ("E", 100),
    ]:
        demography.add_population(name=label, initial_size=size)
    demography.add_population_split(time=1000, derived=["A", "B"], ancestral="C")
    ts = msprime.sim_ancestry(
        samples={"A": 10, "B": 10},
        demography=demography,
        sequence_length=5,
        random_seed=42,
        record_migrations=True,
        record_provenance=True,
    )
    ts = msprime.sim_mutations(ts, rate=0.001, random_seed=42)
    tables = ts.dump_tables()
    # Give each individual a distinct location and chained parents.
    originals = tables.individuals.copy()
    tables.individuals.clear()
    for index, row in enumerate(originals):
        tables.individuals.append(
            row.replace(location=[index, index + 1], parents=[index - 1, index - 1])
        )
    for name, table in tables.name_map.items():
        if name == "provenances":
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        packed, offsets = tskit.pack_strings(
            [f'{{"foo":"n_{name}_{u}"}}' for u in range(len(table))]
        )
        table.set_columns(
            **dict(table.asdict(), metadata=packed, metadata_offset=offsets)
        )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"

    # Add some more rows to provenance to have enough for testing.
    for _ in range(3):
        tables.provenances.add_row(record="A")

    return tables.tree_sequence()
예제 #7
0
 def test_set_tree_sequence_metadata_keeps(self):
     """Existing top-level metadata and schema keys must survive the update."""
     dummy_schema = tskit.MetadataSchema(
         {
             'codec': 'json',
             'type': 'object',
             'properties': {'abc': {'type': 'string'}},
         }
     )
     dummy_metadata = {'abc': 'foo'}
     for ts in self.get_slim_examples():
         tables = ts.tables
         tables.metadata_schema = dummy_schema
         tables.metadata = dummy_metadata
         pyslim.set_tree_sequence_metadata(tables, "nonWF", 0)
         schema = tables.metadata_schema.schema
         for key in dummy_metadata:
             self.assertTrue(key in schema['properties'])
             self.assertTrue(key in tables.metadata)
             self.assertEqual(tables.metadata[key], dummy_metadata[key])
         self.validate_slim_metadata(tables)
         self.assertEqual(tables.metadata['SLiM']['model_type'], "nonWF")
         self.assertEqual(tables.metadata['SLiM']['generation'], 0)
         # One SLiM example is enough for this check.
         break
예제 #8
0
 def test_recover_metadata(self, recipe):
     # msprime <=0.7.5 discards metadata, but we can recover it from provenance
     original = recipe["ts"]
     tables = original.dump_tables()
     tables.metadata_schema = tskit.MetadataSchema(None)
     tables.metadata = b''
     reloaded = pyslim.load_tables(tables)
     assert reloaded.metadata == original.metadata
예제 #9
0
 def test_bad_metadata(self):
     """An empty top-level metadata dict must be rejected by SlimTreeSequence."""
     clean_tables = self.clean_example()
     tables = clean_tables.copy()
     tables.metadata_schema = tskit.MetadataSchema({"type": "object", "codec": "json"})
     tables.metadata = {}
     bad_ts = tables.tree_sequence()
     with pytest.raises(ValueError):
         _ = pyslim.SlimTreeSequence(bad_ts)
예제 #10
0
 def test_set_tree_sequence_metadata_errors(self):
     """Raw (schema-less) non-empty metadata must raise ValueError."""
     for ts in self.get_slim_examples():
         tables = ts.tables
         tables.metadata_schema = tskit.MetadataSchema(None)
         self.assertGreater(len(tables.metadata), 0)
         with self.assertRaises(ValueError):
             pyslim.set_tree_sequence_metadata(tables, "nonWF", 0)
         # One example suffices.
         break
예제 #11
0
 def test_recover_metadata(self):
     # msprime <=0.7.5 discards metadata, but we can recover it from provenance
     for ts in self.get_slim_examples():
         tables = ts.tables
         tables.metadata_schema = tskit.MetadataSchema(None)
         tables.metadata = b''
         reloaded = pyslim.load_tables(tables)
         self.assertEqual(reloaded.metadata, ts.metadata)
예제 #12
0
 def verify_0_3_3(self, ts):
     """Check the 0.3.3 fixture's JSON metadata on every metadata-bearing table."""
     expected_schema = tskit.MetadataSchema({"codec": "json"})
     for name in tskit.TABLE_NAMES:
         current = getattr(ts.tables, name)
         assert current.num_rows > 0
         if hasattr(current, "metadata_schema"):
             assert current.metadata_schema == expected_schema
             assert current[2].metadata == f"n_{name}_2"
     assert ts.tables.has_index()
예제 #13
0
 def verify_mutation_decoding(self, t):
     """decode_mutation on raw bytes must match schema-decoded metadata."""
     raw = t.copy()
     raw.metadata_schema = tskit.MetadataSchema(None)
     for decoded_row, raw_row in zip(t, raw):
         expected = decoded_row.metadata
         with self.assertWarns(DeprecationWarning):
             mutation_list = pyslim.decode_mutation(raw_row.metadata)
         self.assertEqual(
             expected, {"mutation_list": [m.asdict() for m in mutation_list]}
         )
예제 #14
0
File: conftest.py  Project: MIzzo-IDM/tskit
def ts_fixture():
    """
    A tree sequence with data in all fields
    """
    n = 10
    t = 1
    population_configurations = [
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(n // 2),
        msprime.PopulationConfiguration(0),
    ]
    demographic_events = [
        msprime.MassMigration(time=t, source=0, destination=2),
        msprime.MassMigration(time=t, source=1, destination=2),
    ]
    ts = msprime.simulate(
        population_configurations=population_configurations,
        demographic_events=demographic_events,
        random_seed=1,
        mutation_rate=1,
        record_migrations=True,
    )
    tables = ts.dump_tables()
    # TODO replace this with properly linked up individuals using sim_ancestry
    # once 1.0 is released.
    for j in range(n):
        tables.individuals.add_row(
            flags=j, location=(j, j), parents=(j - 1, j - 1)
        )

    for name, table in tables.name_map.items():
        if name == "provenances":
            continue
        table.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        packed, offsets = tskit.pack_strings(
            [f"n_{name}_{u}" for u in range(len(table))]
        )
        table.set_columns(
            **dict(table.asdict(), metadata=packed, metadata_offset=offsets)
        )
    tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
    tables.metadata = "Test metadata"
    return tables.tree_sequence()
예제 #15
0
 def verify_decoding(self, t, decoder):
     """Legacy decoder output must agree with schema-decoded metadata."""
     raw = t.copy()
     raw.metadata_schema = tskit.MetadataSchema(None)
     for decoded_row, raw_row in zip(t, raw):
         expected = decoded_row.metadata
         with self.assertWarns(FutureWarning):
             legacy = decoder(raw_row.metadata)
         if expected is None:
             self.assertTrue(legacy is None)
         else:
             self.assertEqual(expected, legacy.asdict())
예제 #16
0
 def test_small_msprime_top_level_metadata(self):
     """Round-trip a simple JSON top-level metadata schema."""
     ts = msprime.simulate(10, recombination_rate=2, mutation_rate=2, random_seed=2)
     self.assertGreater(ts.num_sites, 2)
     self.assertGreater(ts.num_trees, 2)
     tables = ts.dump_tables()
     tables.metadata_schema = tskit.MetadataSchema(
         {
             "codec": "json",
             "properties": {"my_int": {"type": "integer"}},
         }
     )
     tables.metadata = {"my_int": 1234}
     self.verify(tables.tree_sequence())
예제 #17
0
 def verify_0_3_3(self, ts):
     """Check JSON metadata on each explicitly-listed table of the fixture."""
     expected_schema = tskit.MetadataSchema({"codec": "json"})
     table_names = (
         "populations",
         "individuals",
         "nodes",
         "edges",
         "sites",
         "migrations",
         "mutations",
     )
     for name in table_names:
         current = getattr(ts.tables, name)
         assert current.num_rows > 0
         assert current.metadata_schema == expected_schema
         assert current[2].metadata == f"n_{name}_2"
     assert ts.tables.has_index()
예제 #18
0
def test_tskit(selenium):
    """Smoke-test tskit: build, round-trip, and draw a tiny tree sequence."""
    import tskit

    # Build a minimal two-node, two-edge tree sequence.
    tc = tskit.TableCollection(2)
    assert tc.sequence_length == 2
    tc.nodes.add_row(flags=tskit.NODE_IS_SAMPLE)
    tc.nodes.add_row(time=1)
    for left, right in ((0, 1), (1, 2)):
        tc.edges.add_row(left=left, right=right, parent=1, child=0)
    ts = tc.tree_sequence()
    assert ts.num_nodes == 2

    # Round-trip through a file on disk.
    ts.dump("/tmp/tskit.trees")
    reloaded = tskit.load("/tmp/tskit.trees")
    ts.tables.assert_equals(reloaded.tables)

    # Exercise functionality that pulls in optional dependencies.
    ts.draw_svg(size=(200, 200))
    tskit.MetadataSchema({"codec": "json"})
예제 #19
0
def add_default_schemas(ts):
    """
    Returns a copy of the specified tree sequence with permissive JSON
    schemas on the tables that are used for round-tripping data in tsinfer.
    """
    tables = ts.dump_tables()
    schema = tskit.MetadataSchema(tsinfer.permissive_json_schema())
    # Make sure we're not overwriting existing metadata. This will probably
    # fail when msprime 1.0 comes along, but we can fix it then.
    assert len(tables.metadata) == 0
    tables.metadata_schema = schema
    tables.metadata = {}
    targets = [
        (tables.populations, ts.num_populations),
        (tables.individuals, ts.num_individuals),
        (tables.sites, ts.num_sites),
    ]
    for table, num_rows in targets:
        table.metadata_schema = schema
        assert len(table.metadata) == 0
        table.packset_metadata([b"{}"] * num_rows)
    return tables.tree_sequence()
예제 #20
0
def set_tree_sequence_metadata(tables,
                               model_type,
                               generation,
                               spatial_dimensionality='',
                               spatial_periodicity='',
                               separate_sexes=False,
                               nucleotide_based=False,
                               stage='late',
                               file_version=None):
    """
    Set or update the top-level 'SLiM' entry in the metadata of ``tables``,
    merging it into any existing top-level metadata and schema.

    Raises ValueError if the tables carry raw (schema-less) non-empty
    metadata, since there is no way to merge into opaque bytes.

    :param tables: a tskit.TableCollection; modified in place.
    :param model_type: the SLiM model type string (e.g. "nonWF").
    :param generation: the SLiM generation counter.
    :param spatial_dimensionality: spatial dimensions of the model.
    :param spatial_periodicity: which spatial dimensions are periodic.
    :param separate_sexes: whether the model has separate sexes.
    :param nucleotide_based: whether the model is nucleotide-based.
    :param stage: the SLiM stage at which the file was written.
    :param file_version: SLiM file version; defaults to slim_file_version.
    """
    if file_version is None:
        file_version = slim_file_version
    if isinstance(tables.metadata, bytes):
        # No top-level schema: metadata is raw bytes. We can only proceed if
        # there is nothing there to clobber.
        if len(tables.metadata) > 0:
            raise ValueError(
                "Tree sequence has top-level metadata but no schema: this is a problem "
                "since pyslim is trying to add to the metadata.")
        schema_dict = slim_metadata_schemas['tree_sequence'].schema
        metadata_dict = {}
    else:
        # we need to keep other keys in the metadata (and schema) if there are any
        schema_dict = tables.metadata_schema.schema
        metadata_dict = tables.metadata
    # NOTE(review): a non-JSON or non-object existing schema fails with a
    # bare AssertionError rather than a descriptive error.
    assert (schema_dict['codec'] == 'json')
    assert (schema_dict['type'] == 'object')
    # Graft the canonical SLiM sub-schema into the (possibly user-provided)
    # top-level schema, overwriting any previous 'SLiM' property.
    schema_dict['properties']['SLiM'] = slim_metadata_schemas[
        'tree_sequence'].schema['properties']['SLiM']
    tables.metadata_schema = tskit.MetadataSchema(schema_dict)
    metadata_dict['SLiM'] = {
        "model_type": model_type,
        "generation": generation,
        "file_version": file_version,
        "spatial_dimensionality": spatial_dimensionality,
        "spatial_periodicity": spatial_periodicity,
        "separate_sexes": separate_sexes,
        "nucleotide_based": nucleotide_based,
        "stage": stage,
    }
    tables.metadata = metadata_dict
    # Also (re)set the per-table SLiM metadata schemas.
    _set_metadata_schemas(tables)
예제 #21
0
 def test_set_tree_sequence_metadata_keeps(self, recipe):
     # make sure doesn't overwrite other stuff
     for extra in [{}, {'properties': {'abc': {'type': 'string'}}}]:
         schema_dict = {'codec': 'json', 'type': 'object'}
         schema_dict.update(extra)
         dummy_schema = tskit.MetadataSchema(schema_dict)
         dummy_metadata = {'abc': 'foo'}
         tables = recipe["ts"].dump_tables()
         tables.metadata_schema = dummy_schema
         tables.metadata = dummy_metadata
         pyslim.set_tree_sequence_metadata(tables, "nonWF", 0)
         schema = tables.metadata_schema.schema
         for key in dummy_metadata:
             # Only the variant with declared properties keeps them in schema.
             if len(extra) > 0:
                 assert key in schema['properties']
             assert key in tables.metadata
             assert tables.metadata[key] == dummy_metadata[key]
         self.validate_slim_metadata(tables)
         assert tables.metadata['SLiM']['model_type'] == "nonWF"
         assert tables.metadata['SLiM']['generation'] == 0
예제 #22
0
 def assertTableCollectionsEqual(self, t1, t2,
         skip_provenance=False, check_metadata_schema=True,
         reordered_individuals=False):
     """
     Assert that two table collections (or tree sequences) hold the same data
     after both are simplified to a common, slim_id-sorted sample order.

     :param t1, t2: tskit.TableCollection or tskit.TreeSequence inputs.
     :param skip_provenance: if True, ignore provenance entirely; if -1,
         require t2 to have exactly one extra provenance row, which is
         truncated away before comparison.
     :param check_metadata_schema: if True, require identical schemas at the
         top level and on every table; if False, compare raw metadata after
         clearing all schemas.
     :param reordered_individuals: if True, match individuals across the two
         collections by their 'pedigree_id' metadata instead of row order.
     """
     if isinstance(t1, tskit.TreeSequence):
         t1 = t1.dump_tables()
     if isinstance(t2, tskit.TreeSequence):
         t2 = t2.dump_tables()
     # Simplify both to the same sample set ordered by slim_id, so that
     # differing node order does not cause spurious failures.
     t1_samples = [(n.metadata['slim_id'], j) for j, n in enumerate(t1.nodes) if (n.flags & tskit.NODE_IS_SAMPLE)]
     t1_samples.sort()
     t2_samples = [(n.metadata['slim_id'], j) for j, n in enumerate(t2.nodes) if (n.flags & tskit.NODE_IS_SAMPLE)]
     t2_samples.sort()
     t1.simplify([j for (_, j) in t1_samples], record_provenance=False)
     t2.simplify([j for (_, j) in t2_samples], record_provenance=False)
     if skip_provenance is True:
         t1.provenances.clear()
         t2.provenances.clear()
     if skip_provenance == -1:
         # t2 must carry exactly one extra provenance record.
         assert t1.provenances.num_rows + 1 == t2.provenances.num_rows
         t2.provenances.truncate(t1.provenances.num_rows)
         assert t1.provenances.num_rows == t2.provenances.num_rows
     if check_metadata_schema:
         # this is redundant now, but will help diagnose if things go wrong
         assert t1.metadata_schema.schema == t2.metadata_schema.schema
         assert t1.populations.metadata_schema.schema == t2.populations.metadata_schema.schema
         assert t1.individuals.metadata_schema.schema == t2.individuals.metadata_schema.schema
         assert t1.nodes.metadata_schema.schema == t2.nodes.metadata_schema.schema
         assert t1.edges.metadata_schema.schema == t2.edges.metadata_schema.schema
         assert t1.sites.metadata_schema.schema == t2.sites.metadata_schema.schema
         assert t1.mutations.metadata_schema.schema == t2.mutations.metadata_schema.schema
         assert t1.migrations.metadata_schema.schema == t2.migrations.metadata_schema.schema
     if not check_metadata_schema:
         # need to pull out metadata to compare as dicts before zeroing the schema
         m1 = t1.metadata
         m2 = t2.metadata
         ms = tskit.MetadataSchema(None)
         for t in (t1, t2):
             t.metadata_schema = ms
             t.populations.metadata_schema = ms
             t.individuals.metadata_schema = ms
             t.nodes.metadata_schema = ms
             t.edges.metadata_schema = ms
             t.sites.metadata_schema = ms
             t.mutations.metadata_schema = ms
             t.migrations.metadata_schema = ms
         t1.metadata = b''
         t2.metadata = b''
         assert m1 == m2
     if reordered_individuals:
         # Match individuals across collections by pedigree_id, printing any
         # mismatch before the assert to ease debugging.
         ind1 = {i.metadata['pedigree_id']: j for j, i in enumerate(t1.individuals)}
         ind2 = {i.metadata['pedigree_id']: j for j, i in enumerate(t2.individuals)}
         for pid in ind1:
             if not pid in ind2:
                 print("not in t2:", ind1[pid])
             assert pid in ind2
             if t1.individuals[ind1[pid]] != t2.individuals[ind2[pid]]:
                 print("t1:", t1.individuals[ind1[pid]])
                 print("t2:", t2.individuals[ind2[pid]])
             assert t1.individuals[ind1[pid]] == t2.individuals[ind2[pid]]
         for pid in ind2:
             if not pid in ind1:
                 print("not in t1:", ind2[pid])
             assert pid in ind1
         # Individuals were already compared by pedigree_id above; clear them
         # so the row-order comparison below does not re-check them.
         t1.individuals.clear()
         t2.individuals.clear()
     # go through one-by-one so we know which fails
     self.assertTablesEqual(t1.populations, t2.populations, "populations")
     self.assertTablesEqual(t1.individuals, t2.individuals, "individuals")
     self.assertTablesEqual(t1.nodes, t2.nodes, "nodes")
     self.assertTablesEqual(t1.edges, t2.edges, "edges")
     self.assertTablesEqual(t1.sites, t2.sites, "sites")
     self.assertTablesEqual(t1.mutations, t2.mutations, "mutations")
     self.assertTablesEqual(t1.migrations, t2.migrations, "migrations")
     self.assertTablesEqual(t1.provenances, t2.provenances, "provenances")
     self.assertMetadataEqual(t1, t2)
     assert t1.sequence_length == t2.sequence_length
     if t1.reference_sequence.data != t2.reference_sequence.data:
         print(t1.reference_sequence.data, " != ", t2.reference_sequence.data)
     assert t1.reference_sequence.data == t2.reference_sequence.data
예제 #23
0
def get_example_tables():
    """
    Return a tree sequence that has data in all fields.
    """
    pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)]
    migration_matrix = [[0, 1], [1, 0]]
    ts = msprime.simulate(
        population_configurations=pop_configs,
        migration_matrix=migration_matrix,
        mutation_rate=1,
        record_migrations=True,
        random_seed=1,
    )

    tables = ts.dump_tables()
    # One individual per sample, with increasingly long location/metadata.
    for j in range(ts.num_samples):
        tables.individuals.add_row(
            flags=j, location=np.arange(j), metadata=b"x" * j
        )
    # Rebuild nodes so the samples link to the individuals added above.
    tables.nodes.clear()
    for node in ts.nodes():
        tables.nodes.add_row(
            flags=node.flags,
            time=node.time,
            population=node.population,
            individual=node.id if node.id < ts.num_samples else -1,
            metadata=b"y" * node.id,
        )
    tables.edges.clear()
    for edge in ts.edges():
        tables.edges.add_row(
            left=edge.left,
            right=edge.right,
            child=edge.child,
            parent=edge.parent,
            metadata=b"y" * edge.id,
        )
    tables.sites.clear()
    for site in ts.sites():
        tables.sites.add_row(
            position=site.position,
            ancestral_state="A" * site.id,
            metadata=b"q" * site.id,
        )
    tables.mutations.clear()
    for mutation in ts.mutations():
        mut_id = tables.mutations.add_row(
            site=mutation.site,
            node=mutation.node,
            time=0,
            parent=-1,
            derived_state="C" * mutation.id,
            metadata=b"x" * mutation.id,
        )
        # Add another mutation on the same branch.
        tables.mutations.add_row(
            site=mutation.site,
            node=mutation.node,
            time=0,
            parent=mut_id,
            derived_state="G" * mutation.id,
            metadata=b"y" * mutation.id,
        )
    tables.migrations.clear()
    for migration in ts.migrations():
        tables.migrations.add_row(
            left=migration.left,
            right=migration.right,
            node=migration.node,
            source=migration.source,
            dest=migration.dest,
            time=migration.time,
            metadata=b"y" * migration.id,
        )
    for j in range(10):
        tables.populations.add_row(metadata=b"p" * j)
        tables.provenances.add_row(timestamp="x" * j, record="y" * j)
    # Top-level metadata uses a struct codec with an exhaust-buffer array.
    tables.metadata_schema = tskit.MetadataSchema(
        {
            "codec": "struct",
            "type": "object",
            "properties": {
                "top-level": {
                    "type": "array",
                    "items": {"type": "integer", "binaryFormat": "B"},
                    "noLengthEncodingExhaustBuffer": True,
                }
            },
        }
    )
    tables.metadata = {"top-level": [1, 2, 3, 4]}
    for name in (
        "individuals",
        "nodes",
        "edges",
        "migrations",
        "sites",
        "mutations",
        "populations",
    ):
        getattr(tables, name).metadata_schema = tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {name: {"type": "string", "binaryFormat": "50p"}},
            }
        )
    return tables
예제 #24
0
class TestEquals:
    """Tests for equality comparison of reference sequences."""

    def test_equal_self(self, ts_fixture):
        """A reference sequence compares equal to itself by every API."""
        ts_fixture.reference_sequence.assert_equals(ts_fixture.reference_sequence)
        assert ts_fixture.reference_sequence == ts_fixture.reference_sequence
        assert not ts_fixture.reference_sequence != ts_fixture.reference_sequence
        assert ts_fixture.reference_sequence.equals(ts_fixture.reference_sequence)

    def test_equal_empty(self):
        """An empty reference sequence also compares equal to itself."""
        tables = tskit.TableCollection(1)
        tables.reference_sequence.assert_equals(tables.reference_sequence)
        assert tables.reference_sequence == tables.reference_sequence
        assert tables.reference_sequence.equals(tables.reference_sequence)

    @pytest.mark.parametrize("attr", ["url", "data"])
    def test_unequal_attr_missing(self, ts_fixture, attr):
        """Dropping an attribute from the dict form makes comparison fail,
        symmetrically, and assert_equals names the differing attribute."""
        t1 = ts_fixture.tables
        d = t1.asdict()
        del d["reference_sequence"][attr]
        t2 = tskit.TableCollection.fromdict(d)
        with pytest.raises(AssertionError, match=attr):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        with pytest.raises(AssertionError, match=attr):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)

    @pytest.mark.parametrize(
        ("attr", "val"),
        [
            ("url", "foo"),
            ("data", "bar"),
            ("metadata", {"json": "runs the world"}),
            ("metadata_schema", tskit.MetadataSchema(None)),
        ],
    )
    def test_different_not_equal(self, ts_fixture, attr, val):
        """Changing any attribute makes comparison fail in both directions."""
        t1 = ts_fixture.dump_tables()
        t2 = t1.copy()
        setattr(t1.reference_sequence, attr, val)

        with pytest.raises(AssertionError):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        with pytest.raises(AssertionError):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)

    @pytest.mark.parametrize(
        ("attr", "val"),
        [
            ("metadata", {"json": "runs the world"}),
            ("metadata_schema", tskit.MetadataSchema(None)),
        ],
    )
    def test_different_but_ignore(self, ts_fixture, attr, val):
        """Metadata differences fail by default but pass with ignore_metadata."""
        t1 = ts_fixture.dump_tables()
        t2 = t1.copy()
        setattr(t1.reference_sequence, attr, val)

        with pytest.raises(AssertionError):
            t1.reference_sequence.assert_equals(t2.reference_sequence)
        assert t1.reference_sequence != t2.reference_sequence
        assert not t1.reference_sequence.equals(t2.reference_sequence)
        with pytest.raises(AssertionError):
            t2.reference_sequence.assert_equals(t1.reference_sequence)
        assert t2.reference_sequence != t1.reference_sequence
        assert not t2.reference_sequence.equals(t1.reference_sequence)

        t2.reference_sequence.assert_equals(t1.reference_sequence, ignore_metadata=True)
        assert t2.reference_sequence.equals(t1.reference_sequence, ignore_metadata=True)
def simulate_stdpopsim(
    species,
    model,
    contig,
    num_samples,
    mutation_file=None,
    seed=123,
    skip_existing=False,
    num_procs=1,
):
    """
    Simulate a tree sequence with stdpopsim and dump it to ``<tree_fn>.trees``.

    If ``mutation_file`` (a tsinfer SampleData file) is given, the simulation
    is run with a mutation rate of zero and mutations are instead inserted at
    that file's site positions, on branches chosen proportionally to branch
    length. The tree sequence is then trimmed to a 10Mb region near the start
    of the contig, and KC-distance statistics are stored in the top-level
    metadata under ``user_data``.

    :param str species: stdpopsim species identifier.
    :param str model: Demographic model identifier within the species.
    :param str contig: Contig (chromosome) identifier.
    :param int num_samples: Total sample count; must be a positive integer
        multiple of the model's number of sampling populations.
    :param str mutation_file: Optional tsinfer SampleData file providing the
        site positions at which to insert mutations.
    :param int seed: Random seed for the simulation and the KC replicates.
    :param bool skip_existing: If True, return early when the output file
        already exists.
    :param int num_procs: Number of parallel processes for the KC calculation.
    :return: A ``(base_fn, tree_fn)`` pair of filename prefixes.
    :raises ValueError: If ``num_samples`` is incompatible with the model, or
        the mutation file's sequence length mismatches the contig's.
    :raises RuntimeError: If non-JSON top-level metadata already exists.
    """
    base_fn = f"{model}_{contig}_n{num_samples}"
    tree_fn = f"{base_fn}_seed{seed}"
    logger.info(
        f"Using {species}:{contig} from stdpopsim using the {model} model")
    if skip_existing and os.path.exists(tree_fn + ".trees"):
        logger.info(
            f"Simulation file {tree_fn}.trees already exists, returning that.")
        return base_fn, tree_fn

    sample_data = None
    species = stdpopsim.get_species(species)
    model = species.get_demographic_model(model)
    num_pops = model.num_sampling_populations
    if num_samples < num_pops or num_samples % num_pops != 0:
        raise ValueError(
            f"num_samples must be an integer multiple of {num_pops} "
            f"(or 2 x {num_pops} if diploid sequencing error is injected)")
    pop_n = num_samples // num_pops
    logger.info(
        f"Simulating {num_pops}x{pop_n} samples, seed {seed}, file prefix '{tree_fn}'."
    )
    contig = species.get_contig(contig)
    # Renamed from `l` (E741: ambiguous single-letter variable name).
    seq_len = contig.recombination_map.get_sequence_length()
    if mutation_file is not None:
        logger.debug(f"Loading {mutation_file}")
        sample_data = tsinfer.load(mutation_file)
        if sample_data.sequence_length != seq_len:
            raise ValueError(
                f"Mismatching sequence_length between simulation and {mutation_file}"
            )
        # Reduce mutation rate to 0, as we will insert mutations later
        contig = stdpopsim.Contig(
            mutation_rate=0,
            recombination_map=contig.recombination_map,
            genetic_map=contig.genetic_map,
        )
    r_map = contig.recombination_map
    assert len(r_map.get_rates()) == 2  # Ensure a single rate over chr
    samples = model.get_samples(*([pop_n] * num_pops))
    engine = stdpopsim.get_engine('msprime')
    ts = engine.simulate(model, contig, samples, seed=seed)
    tables = ts.dump_tables()
    if sample_data is not None:
        pos = sample_data.sites_position[:]
        logger.info(
            f"Inserting {len(pos)} mutations at variable sites from {mutation_file}"
        )
        for tree in ts.trees():
            positions = pos[np.logical_and(pos >= tree.interval[0],
                                           pos < tree.interval[1])]
            if len(positions) == 0:
                continue
            # Pair each position with a uniform draw on [0, total branch
            # length); walking the nodes while accumulating branch length
            # then assigns mutations to branches proportionally to length.
            muts = list(
                zip(
                    np.random.uniform(0,
                                      tree.total_branch_length,
                                      size=len(positions)), positions))
            muts.sort()
            tot = 0
            # place a mutation on a random branch, proportional to branch length
            try:
                for n in tree.nodes():
                    tot += tree.branch_length(n)
                    while muts[0][0] < tot:
                        _, position = muts.pop(0)
                        s = tables.sites.add_row(position=position,
                                                 ancestral_state="0")
                        tables.mutations.add_row(node=n,
                                                 site=s,
                                                 derived_state="1")
            except IndexError:
                # No more mutations - go to next tree
                continue
        tables.sort()
        # BUG FIX: the original reported ts.num_mutations, but `ts` was
        # simulated with mutation_rate=0 here; the inserted mutations live
        # in `tables`, so the reported density was always 0.
        logger.debug(
            "Inserted mutations at density "
            f"{tables.mutations.num_rows / tables.sequence_length}"
        )
    interval = [int(seq_len * 2 / 20),
                int(seq_len * 2 / 20) + 1e7]  # 10Mb near the start, not centromeric
    tables.keep_intervals([interval])
    tables.trim()
    logger.debug(
        f"Cut down tree seq to  {interval} ({tables.sites.num_rows} sites) for speed"
    )

    # Add info to the top-level metadata
    user_data = {}

    logger.info(
        "Calculating the kc distance of the simulation against a flat tree")
    star_tree = tskit.Tree.generate_star(ts.num_samples,
                                         span=tables.sequence_length,
                                         record_provenance=False)
    user_data['kc_max'] = tables.tree_sequence().kc_distance(
        star_tree.tree_sequence)
    kc_array = []
    max_reps = 100
    ts = tables.tree_sequence()
    logger.info(
        f"Calculating KC distance of the sim against at most {max_reps} * {ts.num_trees}"
        f" random trees using {num_procs} parallel threads. This could take a while."
    )
    seeds = range(seed, seed + max_reps)
    with multiprocessing.Pool(num_procs) as pool:
        for i, kc in enumerate(
                pool.imap_unordered(rnd_kc, zip(itertools.repeat(ts), seeds))):
            kc_array.append(kc)
            if i > 10:
                se_mean = np.std(kc_array, ddof=1) / np.sqrt(i)
                # break if SEM < 1/100th of mean KC. This can take a long time
                if se_mean / np.average(kc_array) < 0.01:
                    logger.info(
                        f"Stopped after {i} replicates as kc_max_split deemed accurate."
                    )
                    break
        user_data['kc_max_split'] = np.average(kc_array)

    # Store user_data in the top-level metadata, which requires JSON codec.
    if tables.metadata_schema != tskit.MetadataSchema({"codec": "json"}):
        if tables.metadata:
            raise RuntimeError("Metadata already exists, and is not JSON")
        tables.metadata_schema = tskit.MetadataSchema({"codec": "json"})
        tables.metadata = {}
    tables.metadata = {"user_data": user_data, **tables.metadata}
    tables.tree_sequence().dump(tree_fn + ".trees")
    return base_fn, tree_fn
예제 #26
0
    num_replicates=1,
)

# NOTE(review): `status`, `sim` (apparently an iterator of simulation
# replicates) and `ped` (a pedigree object) are defined earlier in the
# file — confirm against the full script.
status("Converting tables...")

# record individual id metadata
tables = next(sim).dump_tables()

# JSON metadata schema for the individual table: records each individual's
# name from the pedigree file and whether it is a sample.
individual_metadata_schema = tskit.MetadataSchema({
    "codec":
    "json",
    "type":
    "object",
    "properties": {
        # Name of the individual in the pedigree file
        "individual_name": {
            "type": "integer"
        },
        "is_sample": {
            "type": "boolean"
        },
    },
    "required": ["individual_name", "is_sample"],
})
# Fresh table to receive the re-annotated individuals.
meta_individuals = tskit.IndividualTable()

meta_individuals.metadata_schema = individual_metadata_schema
for i, ind in enumerate(tables.individuals):
    ind_name = int(ped.individual[i]) if i < ped.num_individuals else -1
    is_sample = bool(ped.is_sample[i]) if i < ped.num_individuals else False
    meta_individuals.add_row(metadata={
        "individual_name": ind_name,
예제 #27
0
def parse_fam(fam_file):
    """
    Parse a PLINK .fam file and convert it to a tskit IndividualTable.

    Assumes the fam file contains five columns: FID, IID, PAT, MAT, SEX.
    Each row becomes one individual whose metadata records the PLINK family
    and individual IDs plus sex, and whose ``parents`` column references the
    table rows of PAT and MAT (-1, tskit's "missing" value, for "0").

    :param fam_file: PLINK .fam file object (or path) readable by
        :func:`numpy.loadtxt`.
    :return: The populated, sorted :class:`tskit.IndividualTable`.
    :raises ValueError: If a duplicate PLINK ID is present, or a SEX value
        is not 0 (unknown), 1 (male) or 2 (female).
    """
    individuals = np.loadtxt(
        fname=fam_file,
        dtype=str,
        ndmin=2,  # read file as 2-D table
        usecols=(0, 1, 2, 3, 4),  # only keep FID, IID, PAT, MAT, SEX columns
    )  # requires same number of columns in each row, i.e. not ragged

    id_map = {}  # dict for translating PLINK ID to tskit IndividualTable ID
    for tskit_id, (plink_fid, plink_iid, _pat, _mat,
                   _sex) in enumerate(individuals):
        # include space between strings to ensure uniqueness
        plink_id = f"{plink_fid} {plink_iid}"
        if plink_id in id_map:
            # BUG FIX: the original string lacked the f-prefix, so the
            # literal text "{plink_id}" was printed instead of the ID.
            raise ValueError(f"Duplicate PLINK ID: {plink_id}")
        id_map[plink_id] = tskit_id
    id_map["0"] = -1  # -1 is used in tskit to denote "missing"

    tc = tskit.TableCollection(1)
    tb = tc.individuals
    tb.metadata_schema = tskit.MetadataSchema({
        "codec": "json",
        "type": "object",
        "properties": {
            "plink_fid": {"type": "string"},
            "plink_iid": {"type": "string"},
            "sex": {"type": "integer"},
        },
        "required": ["plink_fid", "plink_iid", "sex"],
        "additionalProperties": True,
    })
    for plink_fid, plink_iid, pat, mat, sex in individuals:
        sex = int(sex)
        if sex not in (0, 1, 2):
            raise ValueError(
                "Sex must be one of the following: 0 (unknown), 1 (male), 2 (female)"
            )
        metadata_dict = {
            "plink_fid": plink_fid,
            "plink_iid": plink_iid,
            "sex": sex
        }
        # Parent IDs are scoped to the family; "0" means unknown parent.
        pat_id = f"{plink_fid} {pat}" if pat != "0" else pat
        mat_id = f"{plink_fid} {mat}" if mat != "0" else mat
        tb.add_row(
            parents=[
                id_map[pat_id],
                id_map[mat_id],
            ],
            metadata=metadata_dict,
        )
    tc.sort()

    return tb
예제 #28
0
            0.0,
            "bounds_z1":
            100.0,
            "migration_records": [{
                "source_subpop": 1,
                "migration_rate": 0.9
            }, {
                "source_subpop": 2,
                "migration_rate": 0.1
            }]
        }]
    },
}

# Compiled MetadataSchema objects, one per raw schema definition above.
slim_metadata_schemas = {
    name: tskit.MetadataSchema(raw_schema)
    for name, raw_schema in _raw_slim_metadata_schemas.items()
}

default_slim_metadata = {
    "tree_sequence": {
        "SLiM": {
            "model_type": "nonWF",
            "generation": 1,
            "file_version": slim_file_version,
            "spatial_dimensionality": "",
            "spatial_periodicity": "",
            "separate_sexes": False,
            "nucleotide_based": False,
            "stage": "late"
        }
예제 #29
0
def from_newick(string, min_edge_length=0):
    """
    Returns a tree sequence representation of the specified newick string.

    The tree sequence will contain a single tree, as specified by the newick. All
    leaf nodes will be marked as samples (``tskit.NODE_IS_SAMPLE``). Newick names and
    comments will be written to the node metadata.

    :param string string: Newick string
    :param float min_edge_length: Replace any edge length shorter than this value by this
        value. Unlike newick, tskit doesn't support zero or negative edge lengths, so
        setting this argument to a small value is necessary when importing trees with
        zero or negative lengths.
    """
    parsed = newick.loads(string)
    if len(parsed) > 1:
        raise ValueError("Only one tree can be imported from a newick string")
    if len(parsed) == 0:
        raise ValueError("Newick string was empty")
    tree = parsed[0]
    tables = tskit.TableCollection(1)
    node_table = tables.nodes
    node_table.metadata_schema = tskit.MetadataSchema(
        {
            "codec": "json",
            "type": "object",
            "properties": {
                "name": {
                    "type": ["string"],
                    "description": "Name from newick file",
                },
                "comment": {
                    "type": ["string"],
                    "description": "Comment from newick file",
                },
            },
        }
    )

    node_ids = {}

    def register(newick_node, time):
        # Add the node on first sight; leaves become sample nodes, and
        # any newick name/comment is carried into the node metadata.
        if newick_node not in node_ids:
            is_leaf = len(newick_node.descendants) == 0
            metadata = {}
            if newick_node.name:
                metadata["name"] = newick_node.name
            if newick_node.comment:
                metadata["comment"] = newick_node.comment
            node_ids[newick_node] = tables.nodes.add_row(
                flags=tskit.NODE_IS_SAMPLE if is_leaf else 0,
                time=time,
                metadata=metadata,
            )
        return node_ids[newick_node]

    root = next(tree.walk())
    register(root, 0)
    for parent in tree.walk():
        parent_id = node_ids[parent]
        for child in parent.descendants:
            length = max(child.length, min_edge_length)
            if length <= 0:
                raise ValueError(
                    "tskit tree sequences cannot contain edges with lengths"
                    " <= 0. Set min_edge_length to force lengths to a"
                    " minimum size"
                )
            child_id = register(child, node_table[parent_id].time - length)
            tables.edges.add_row(0, 1, parent_id, child_id)
    # Rewrite node times to fit the tskit convention of zero at the youngest leaf
    original_nodes = tables.nodes.copy()
    youngest = min(tables.nodes.time)
    tables.nodes.clear()
    for node in original_nodes:
        tables.nodes.append(node.replace(time=node.time - youngest + root.length))
    tables.sort()
    return tables.tree_sequence()
예제 #30
0
# NOTE(review): `both` (a tree sequence), `tables` (the table collection
# being rebuilt from it) and `ind_map` are defined earlier in the file —
# confirm against the full script.
# Assign each individual a fresh sequential pedigree_id and record, per
# new table row, which pedigree_id it received.
next_id = 0
for ind in both.individuals():
    md = ind.metadata
    md['pedigree_id'] = next_id
    j = tables.individuals.add_row(flags=ind.flags,
                                   location=ind.location,
                                   parents=ind.parents,
                                   metadata=md)
    ind_map[j] = md['pedigree_id']
    next_id += 1

tables.nodes.clear()
# hack because of https://github.com/tskit-dev/tskit/issues/1256
# (which is fixed in github main)
schema = tables.nodes.metadata_schema
tables.nodes.metadata_schema = tskit.MetadataSchema(None)
# Rewrite each node's slim_id for the renumbered pedigree ids, preserving
# the node's offset relative to 2 * pedigree_id (presumably the two nodes
# of a diploid individual — verify against the SLiM metadata convention).
for n in both.nodes():
    md = n.metadata
    if md is not None:
        assert n.individual != tskit.NULL
        ind = both.individual(n.individual)
        offset = md['slim_id'] - 2 * ind.metadata['pedigree_id']
        md['slim_id'] = 2 * ind_map[ind.id] + offset
    # Metadata is pre-encoded by hand (the issue-1256 workaround above),
    # then the schema is restored after the loop.
    tables.nodes.add_row(time=n.time,
                         population=n.population,
                         individual=n.individual,
                         flags=n.flags,
                         metadata=schema.validate_and_encode_row(md))
tables.nodes.metadata_schema = schema

both = tables.tree_sequence()