def test_json(self):
    """Node metadata encoded as JSON survives set_columns, load_tables and dump/load."""
    ts = msprime.simulate(10, random_seed=1)
    tables = ts.dump_tables()
    nodes = tables.nodes
    # One JSON-encodable dict per node.
    expected = [
        {"one": j, "two": 2 * j, "three": list(range(j))}
        for j in range(len(nodes))
    ]
    encoded, offset = msprime.pack_strings(json.dumps(d) for d in expected)
    nodes.set_columns(
        flags=nodes.flags,
        time=nodes.time,
        population=nodes.population,
        metadata=encoded,
        metadata_offset=offset,
    )
    self.assertTrue(np.array_equal(nodes.metadata_offset, offset))
    self.assertTrue(np.array_equal(nodes.metadata, encoded))
    # The stored metadata must decode back to the original dicts.
    ts1 = msprime.load_tables(nodes=nodes, edges=tables.edges)
    for node, want in zip(ts1.nodes(), expected):
        self.assertEqual(json.loads(node.metadata.decode()), want)
    # Metadata must also survive a round trip through a file.
    ts1.dump(self.temp_file)
    ts2 = msprime.load(self.temp_file)
    self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
def test_simple_case(self):
    """Two five-character strings pack into one flat length-10 buffer."""
    strings = ["hello", "world"]
    packed, length = msprime.pack_strings(strings)
    self.assertEqual(list(length), [5, 5])
    self.assertEqual(packed.shape, (10, ))
    # Unpacking must reproduce the input exactly.
    self.assertEqual(msprime.unpack_strings(packed, length), strings)
def verify_packing(self, strings):
    """Check that pack_strings/unpack_strings round-trip the given strings."""
    packed, lengths = msprime.pack_strings(strings)
    # Packed characters are int8; per-string lengths are uint32.
    self.assertEqual(packed.dtype, np.int8)
    self.assertEqual(lengths.dtype, np.uint32)
    self.assertEqual(list(lengths), [len(s) for s in strings])
    # Total packed size equals the sum of the individual lengths.
    self.assertEqual(packed.shape[0], np.sum(lengths))
    self.assertEqual(strings, msprime.unpack_strings(packed, lengths))
def node_metadata_example():
    """Return a simulated tree sequence whose nodes carry "n_<id>" metadata."""
    ts = msprime.simulate(
        sample_size=100, recombination_rate=0.1, length=10, random_seed=1)
    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    ts.dump_tables(nodes=nodes, edges=edges)
    # Rebuild the node table with a packed metadata column attached.
    labels = ["n_{}".format(u) for u in range(ts.num_nodes)]
    packed, offset = msprime.pack_strings(labels)
    annotated = msprime.NodeTable()
    annotated.set_columns(
        metadata=packed, metadata_offset=offset,
        flags=nodes.flags, time=nodes.time)
    return msprime.load_tables(nodes=annotated, edges=edges)
def test_optional_population(self):
    """Omitting population from set_columns defaults every row to -1."""
    for num_rows in [0, 10, 100]:
        names = [str(j) for j in range(num_rows)]
        name, name_length = msprime.pack_strings(names)
        flags = list(range(num_rows))
        time = list(range(num_rows))
        table = msprime.NodeTable()
        table.set_columns(
            name=name, name_length=name_length, flags=flags, time=time)
        # Unset population column is filled with -1.
        self.assertEqual(list(table.population), [-1] * num_rows)
        # All explicitly supplied columns come back unchanged.
        self.assertEqual(list(table.flags), flags)
        self.assertEqual(list(table.time), time)
        self.assertEqual(list(table.name), list(name))
        self.assertEqual(list(table.name_length), list(name_length))
def test_random_names(self):
    """Randomly generated names round-trip through a NodeTable."""
    for num_rows in [0, 10, 100]:
        names = [random_string(10) for _ in range(num_rows)]
        packed, lengths = msprime.pack_strings(names)
        flags = list(range(num_rows))
        time = list(range(num_rows))
        table = msprime.NodeTable()
        table.set_columns(
            name=packed, name_length=lengths, flags=flags, time=time)
        self.assertEqual(list(table.flags), flags)
        self.assertEqual(list(table.time), time)
        # Stored columns must match what was packed in.
        self.assertEqual(list(table.name), list(packed))
        self.assertEqual(list(table.name_length), list(lengths))
        # Unpacking the stored columns must recover the original names.
        unpacked = msprime.unpack_strings(table.name, table.name_length)
        self.assertEqual(names, unpacked)
def node_name_example():
    """Return a simulated tree sequence whose nodes are named "n_<id>"."""
    ts = msprime.simulate(
        sample_size=100, recombination_rate=0.1, length=10, random_seed=1)
    nodes = msprime.NodeTable()
    edgesets = msprime.EdgesetTable()
    ts.dump_tables(nodes=nodes, edgesets=edgesets)
    # Rebuild the node table with a packed name column attached.
    names = ["n_{}".format(u) for u in range(ts.num_nodes)]
    packed, lengths = msprime.pack_strings(names)
    named_nodes = msprime.NodeTable()
    named_nodes.set_columns(
        name=packed, name_length=lengths,
        flags=nodes.flags, time=nodes.time)
    return msprime.load_tables(
        nodes=named_nodes, edgesets=edgesets, provenance_strings=[b"sdf"])
def finalise(self):
    """Flush all buffered site data to the backing store and finish writing.

    Raises:
        ValueError: if called in read-mode (the genotypes buffer has
            already been released) or if no sites have been added.
    """
    # Write-mode is signalled by a live genotypes buffer; it is released
    # at the end of this method, so a second call raises.
    if self.genotypes_buffer is None:
        raise ValueError("Cannot call finalise in read-mode")
    variant_sites = []
    num_samples = self.num_samples
    num_sites = len(self.site_buffer)
    if num_sites == 0:
        raise ValueError("Must have at least one site")
    position = np.empty(num_sites)
    frequency = np.empty(num_sites, dtype=np.uint32)
    ancestral_states = []
    derived_states = []
    for j, site in enumerate(self.site_buffer):
        position[j] = site.position
        frequency[j] = site.frequency
        # A site is "variant" when its derived-allele frequency is
        # strictly between singleton (1) and fixed (num_samples).
        if site.frequency > 1 and site.frequency < num_samples:
            variant_sites.append(j)
        ancestral_states.append(site.alleles[0])
        # Sites with only one allele get an empty derived state.
        derived_states.append(
            "" if len(site.alleles) < 2 else site.alleles[1])
    # NOTE(review): self.data looks like a zarr group (create_group /
    # array with chunks+compressor) — confirm against the class header.
    sites_group = self.data.create_group("sites")
    sites_group.array(
        "position", data=position, chunks=(num_sites, ),
        compressor=self.compressor)
    sites_group.array(
        "frequency", data=frequency, chunks=(num_sites, ),
        compressor=self.compressor)
    # Allele strings are stored packed: a flat character array plus an
    # offset array (hence the num_sites + 1 chunk for the offsets).
    ancestral_state, ancestral_state_offset = msprime.pack_strings(
        ancestral_states)
    sites_group.array(
        "ancestral_state", data=ancestral_state, chunks=(num_sites, ),
        compressor=self.compressor)
    sites_group.array(
        "ancestral_state_offset", data=ancestral_state_offset,
        chunks=(num_sites + 1, ), compressor=self.compressor)
    derived_state, derived_state_offset = msprime.pack_strings(
        derived_states)
    sites_group.array(
        "derived_state", data=derived_state, chunks=(num_sites, ),
        compressor=self.compressor)
    sites_group.array(
        "derived_state_offset", data=derived_state_offset,
        chunks=(num_sites + 1, ), compressor=self.compressor)
    # Singletons are stored as parallel (site, sample) columns.
    num_singletons = len(self.singletons_buffer)
    singleton_sites = np.array(
        [site for site, _ in self.singletons_buffer], dtype=np.int32)
    singleton_samples = np.array(
        [sample for _, sample in self.singletons_buffer], dtype=np.int32)
    singletons_group = self.data.create_group("singletons")
    # Chunk size must be at least 1 even when the buffer is empty.
    chunks = max(num_singletons, 1),
    singletons_group.array(
        "site", data=singleton_sites, chunks=chunks,
        compressor=self.compressor)
    singletons_group.array(
        "sample", data=singleton_samples, chunks=chunks,
        compressor=self.compressor)
    num_invariants = len(self.invariants_buffer)
    invariant_sites = np.array(self.invariants_buffer, dtype=np.int32)
    invariants_group = self.data.create_group("invariants")
    chunks = max(num_invariants, 1),
    invariants_group.array(
        "site", data=invariant_sites, chunks=chunks,
        compressor=self.compressor)
    # Record summary counts as store attributes.
    num_variant_sites = len(variant_sites)
    self.data.attrs["num_sites"] = num_sites
    self.data.attrs["num_variant_sites"] = num_variant_sites
    self.data.attrs["num_singleton_sites"] = num_singletons
    self.data.attrs["num_invariant_sites"] = num_invariants
    chunks = max(num_variant_sites, 1),
    self.variants_group.create_dataset(
        "site", shape=(num_variant_sites, ), chunks=chunks,
        dtype=np.int32, data=variant_sites, compressor=self.compressor)
    # Flush only the filled prefix of the genotypes buffer.
    self.genotypes.append(
        self.genotypes_buffer[:self.genotypes_buffer_offset])
    # Release the buffers: this switches the object into read-mode.
    self.site_buffer = None
    self.genotypes_buffer = None
    super(SampleData, self).finalise()