def test_list_long_format_human(self):
    data = self.get_example_data()
    kas.dump(data, self.temp_file)
    stdout, stderr = self.get_output(["ls", self.temp_file, "-l", "-H"])
    self.assertEqual(len(stderr), 0)
    lines = stdout.splitlines()
    self.assertEqual(len(lines), len(data))

def test_dump_fileobj_single(self):
    data = {"a": np.arange(10)}
    with open(self.temp_file, "wb") as f:
        kas.dump(data, f, engine=self.engine)
    data_out = kas.load(self.temp_file, engine=self.engine)
    data2 = dict(data_out.items())
    self.verify_dicts_equal(data, data2)

def test_py_engine_single(self):
    data = {"a": np.arange(10), "b": np.zeros(100)}
    fileobj = io.BytesIO()
    kas.dump(data, fileobj, engine=kas.PY_ENGINE)
    fileobj.seek(0)
    data_2 = kas.load(fileobj, engine=kas.PY_ENGINE)
    self.verify_dicts_equal(data, data_2)

def test_dump(self):
    data = self.get_example_data()
    kas.dump(data, self.temp_file)
    for key in data.keys():
        stdout, stderr = self.get_output(["dump", self.temp_file, key])
        self.assertEqual(len(stderr), 0)
        self.assertEqual(stdout.splitlines(), list(map(str, data[key])))

def test_c_engine_fails(self):
    data = {"a": np.arange(10), "b": np.zeros(100)}
    fileobj = io.BytesIO()
    with self.assertRaises(io.UnsupportedOperation):
        kas.dump(data, fileobj, engine=kas.C_ENGINE)
    with self.assertRaises(io.UnsupportedOperation):
        kas.load(fileobj, engine=kas.C_ENGINE)

def validate_storage(self, data):
    kas.dump(data, self.temp_file, engine=self.engine)
    with open(self.temp_file, "rb") as f:
        contents = f.read()
    offset = store.HEADER_SIZE
    descriptors = []
    for _ in range(len(data)):
        descriptor = store.ItemDescriptor.unpack(
            contents[offset:offset + store.ItemDescriptor.size])
        descriptors.append(descriptor)
        offset += store.ItemDescriptor.size
    # Keys must be sorted lexicographically.
    sorted_keys = sorted(data.keys())
    # Keys should be packed sequentially immediately after the descriptors.
    offset = store.HEADER_SIZE + len(data) * store.ITEM_DESCRIPTOR_SIZE
    for d, key in zip(descriptors, sorted_keys):
        self.assertEqual(d.key_start, offset)
        unpacked_key = contents[d.key_start:d.key_start + d.key_len]
        self.assertEqual(key.encode("utf8"), unpacked_key)
        offset += d.key_len
    # Arrays should be packed sequentially immediately after the keys on
    # 8 byte boundaries.
    for d, key in zip(descriptors, sorted_keys):
        remainder = offset % 8
        if remainder != 0:
            offset += 8 - remainder
        self.assertEqual(d.array_start, offset)
        nbytes = d.array_len * store.type_size(d.type)
        array = np.frombuffer(
            contents[d.array_start:d.array_start + nbytes],
            dtype=store.type_to_np_dtype_map[d.type],
        )
        np.testing.assert_equal(data[key], array)
        offset += nbytes

def test_item_descriptor_format(self):
    for n in range(10):
        kas.dump(
            {str(j): j * np.ones(j) for j in range(n)},
            self.temp_file,
            engine=self.engine,
        )
        with open(self.temp_file, "rb") as f:
            contents = f.read()
        self.assertEqual(struct.unpack("<I", contents[12:16])[0], n)
        offset = store.HEADER_SIZE
        for _ in range(n):
            descriptor = contents[offset:offset + store.ITEM_DESCRIPTOR_SIZE]
            offset += store.ITEM_DESCRIPTOR_SIZE
            type_ = struct.unpack("<B", descriptor[0:1])[0]
            key_start, key_len, array_start, array_len = struct.unpack(
                "<QQQQ", descriptor[8:40])
            trailer = descriptor[40:store.ITEM_DESCRIPTOR_SIZE]
            # The remainder should be zeros.
            self.assertEqual(
                trailer,
                bytearray(0 for _ in range(store.ITEM_DESCRIPTOR_SIZE - 40)),
            )
            self.assertEqual(descriptor[1:4], bytearray([0, 0, 0]))
            self.assertEqual(type_, store.FLOAT64)
            self.assertGreater(key_start, 0)
            self.assertGreater(key_len, 0)
            self.assertGreater(array_start, 0)
            self.assertGreaterEqual(array_len, 0)

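# The two format tests above pin down kastore's on-disk layout: a fixed-size
# header (item count stored as a little-endian uint32 at bytes 12:16) followed
# by fixed-size item descriptors, with the type byte at descriptor offset 0 and
# the key_start/key_len/array_start/array_len fields at bytes 8:40 as "<QQQQ".
# The helper below is a minimal, illustrative sketch of parsing the descriptors
# by hand; it is not part of kastore's API, and the 64-byte header and
# descriptor sizes are assumptions standing in for store.HEADER_SIZE and
# store.ITEM_DESCRIPTOR_SIZE.
def parse_descriptors_sketch(path, header_size=64, descriptor_size=64):
    with open(path, "rb") as f:
        contents = f.read()
    # Number of items recorded in the header.
    num_items = struct.unpack("<I", contents[12:16])[0]
    descriptors = []
    offset = header_size
    for _ in range(num_items):
        block = contents[offset:offset + descriptor_size]
        # Type code, then the four 64-bit key/array offset and length fields.
        type_ = struct.unpack("<B", block[0:1])[0]
        key_start, key_len, array_start, array_len = struct.unpack(
            "<QQQQ", block[8:40])
        descriptors.append((type_, key_start, key_len, array_start, array_len))
        offset += descriptor_size
    return descriptors
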
def verify_offset_columns(self, ts):
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file) as store:
        all_data = dict(store)
    offset_col_pairs = []
    for col in all_data.keys():
        if col.endswith("_offset"):
            main_col = col[:col.index("_offset")]
            offset_col_pairs.append((main_col, col))
    for col, offset_col in offset_col_pairs:
        num_rows = len(all_data[offset_col]) - 1
        data = dict(all_data)
        # Check bad lengths of the offset col.
        for bad_col_length in [[], range(2 * num_rows)]:
            data[offset_col] = bad_col_length
            kastore.dump(data, self.temp_file)
            with pytest.raises(exceptions.FileFormatError):
                tskit.load(self.temp_file)
        # Check for a bad offset.
        data = dict(all_data)
        original_offset = data[offset_col]
        original_col = data[col]
        data[offset_col] = np.zeros_like(original_offset)
        data[col] = np.zeros(10, dtype=original_col.dtype)
        kastore.dump(data, self.temp_file)
        with pytest.raises(exceptions.LibraryError):
            tskit.load(self.temp_file)

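# verify_offset_columns relies on tskit's ragged-column convention: a column
# "x" stores all rows' values concatenated, and "x_offset" holds num_rows + 1
# cumulative indices so that row i is x[x_offset[i]:x_offset[i + 1]]. The
# sketch below illustrates that encoding under those assumptions; it is not
# tskit library code.
def encode_ragged_column_sketch(rows):
    # Offsets start at 0 and end at the total number of stored values.
    offsets = np.zeros(len(rows) + 1, dtype=np.uint64)
    offsets[1:] = np.cumsum([len(row) for row in rows])
    flat = np.concatenate([np.asarray(row) for row in rows]) if rows else np.array([])
    return flat, offsets
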
def test_dump_fileobj_multi(self):
    with open(self.temp_file, "wb") as f:
        for i in range(10):
            data = {
                "i" + str(i): np.arange(i, dtype=int),
                "f" + str(i): np.arange(i, dtype=float),
            }
            kas.dump(data, f, engine=self.engine)

def verify(self, data):
    kas.dump(data, self.temp_file, engine=kas.C_ENGINE)
    with open(self.temp_file, "rb") as f:
        c_file = f.read()
    kas.dump(data, self.temp_file, engine=kas.PY_ENGINE)
    with open(self.temp_file, "rb") as f:
        py_file = f.read()
    self.assertEqual(c_file, py_file)

def test_bad_arrays(self):
    kas.dump(data={"a": []}, filename=self.temp_file, engine=self.engine)
    for bad_array in [kas, lambda x: x, "1234", None, [[0, 1], [0, 2]]]:
        self.assertRaises(
            ValueError, kas.dump, data={"a": bad_array},
            filename=self.temp_file, engine=self.engine)

def test_dump(self): data = {"a": np.zeros(1)} try: kas._kastore_loaded = False with self.assertRaises(RuntimeError): kas.dump(data, self.temp_file, engine=kas.C_ENGINE) finally: kas._kastore_loaded = True
def test_context_manager(self):
    N = 100
    data = {"a": np.arange(N)}
    kas.dump(data, self.temp_file)
    with kas.load(self.temp_file) as store:
        self.assertIn("a", store)
        self.assertTrue(np.array_equal(store["a"], np.arange(N)))
    self.verify_closed(store)

def test_manual_close(self):
    N = 100
    data = {"a": np.arange(N)}
    kas.dump(data, self.temp_file)
    store = kas.load(self.temp_file)
    self.assertIn("a", store)
    self.assertTrue(np.array_equal(store["a"], np.arange(N)))
    store.close()
    self.verify_closed(store)

def simple_example():
    data = {
        "one": np.arange(5, dtype=np.int8),
        "two": np.arange(5, dtype=np.uint64),
    }
    kastore.dump(data, "tmp.kas")
    d2 = kastore.load("tmp.kas")
    print(list(d2.items()))

def test_list_empty(self):
    kas.dump({}, self.temp_file)
    stdout, stderr = self.get_output(["ls", self.temp_file])
    self.assertEqual(len(stderr), 0)
    self.assertEqual(len(stdout), 0)
    for opts in ["-l", "-lH"]:
        stdout, stderr = self.get_output(["ls", opts, self.temp_file])
        self.assertEqual(len(stderr), 0)
        self.assertEqual(len(stdout), 0)

def test_load_and_dump_file_single_rw(self):
    data = {"a": np.arange(10)}
    with open(self.temp_file, "r+b") as f:
        kas.dump(data, f, engine=self.engine)
        for read_all in [True, False]:
            f.seek(0)
            data_out = kas.load(f, read_all=read_all, engine=self.engine)
            data2 = dict(data_out.items())
            self.verify_dicts_equal(data, data2)

def test_py_engine_multi(self):
    data = {"a": np.arange(10), "b": np.zeros(100)}
    n = 10
    fileobj = io.BytesIO()
    for _ in range(n):
        kas.dump(data, fileobj, engine=kas.PY_ENGINE)
    fileobj.seek(0)
    for _ in range(n):
        data_2 = kas.load(fileobj, read_all=True, engine=kas.PY_ENGINE)
        self.verify_dicts_equal(data, data_2)

def test_old_version_load_error(self):
    ts = msprime.simulate(10, random_seed=1)
    for bad_version in [(0, 1), (0, 8), (2, 0), (CURRENT_FILE_MAJOR - 1, 0)]:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file) as store:
            data = dict(store)
        data["format/version"] = np.array(bad_version, dtype=np.uint32)
        kastore.dump(data, self.temp_file)
        with pytest.raises(tskit.VersionTooOldError):
            tskit.load(self.temp_file)

def test_format_name_error(self):
    ts = msprime.simulate(10)
    for bad_name in ["tskit.tree", "tskit.treesAndOther", "", "x" * 100]:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file) as store:
            data = dict(store)
        data["format/name"] = np.array(bytearray(bad_name.encode()), dtype=np.int8)
        kastore.dump(data, self.temp_file)
        with pytest.raises(exceptions.FileFormatError):
            tskit.load(self.temp_file)

def test_new_version_load_error(self):
    ts = msprime.simulate(10, random_seed=1)
    for bad_version in [(CURRENT_FILE_MAJOR + j, 0) for j in range(1, 5)]:
        ts.dump(self.temp_file)
        with kastore.load(self.temp_file, use_mmap=False) as store:
            data = dict(store)
        data["format/version"] = np.array(bad_version, dtype=np.uint32)
        kastore.dump(data, self.temp_file)
        self.assertRaises(msprime.VersionTooNewError, msprime.load, self.temp_file)

def verify_fields(self, ts):
    ts.dump(self.temp_file)
    with kastore.load(self.temp_file, use_mmap=False) as store:
        all_data = dict(store)
    for key in all_data.keys():
        data = dict(all_data)
        del data[key]
        kastore.dump(data, self.temp_file)
        self.assertRaises(exceptions.FileFormatError, msprime.load, self.temp_file)

def test_load_and_dump_fd_single_rw(self):
    data = {"a": np.arange(10)}
    with open(self.temp_file, "r+b") as f:
        fd = f.fileno()
        kas.dump(data, fd, engine=self.engine)
        for read_all in [True, False]:
            os.lseek(fd, 0, os.SEEK_SET)
            data_out = kas.load(fd, read_all=read_all, engine=self.engine)
            data2 = dict(data_out.items())
            self.verify_dicts_equal(data, data2)

def verify(self, data):
    kas.dump(data, self.temp_file)
    for read_all in [True, False]:
        new_data = kas.load(self.temp_file, read_all=read_all)
        for key, array in new_data.items():
            info = new_data.info(key)
            s = str(info)
            self.assertGreater(len(s), 0)
            self.assertEqual(array.nbytes, info.size)
            self.assertEqual(array.shape, info.shape)
            self.assertEqual(array.dtype, np.dtype(info.dtype))

def test_load_fileobj_single(self):
    data = {"a": np.arange(10)}
    kas.dump(data, self.temp_file, engine=self.engine)
    file_size = os.stat(self.temp_file).st_size
    for read_all in [True, False]:
        with open(self.temp_file, "rb") as f:
            data_out = kas.load(f, read_all=read_all, engine=self.engine)
            data2 = dict(data_out.items())
            file_offset = f.tell()
        self.verify_dicts_equal(data, data2)
        self.assertEqual(file_offset, file_size)

def verify_logging(self, args, level):
    # We don't actually check the output here as we're mocking out the
    # call to logging config, but it's convenient to reuse the machinery
    # here in this class.
    data = self.get_example_data()
    kas.dump(data, self.temp_file)
    log_format = '%(asctime)s %(message)s'
    with mock.patch("logging.basicConfig") as mocked_config:
        stdout, stderr = self.get_output(args + ["ls", self.temp_file])
        mocked_config.assert_called_once_with(level=level, format=log_format)
    return stderr

def test_missing_attr(self, ts_fixture, tmp_path, attr):
    ts1 = ts_fixture
    temp_file = tmp_path / "tmp.trees"
    ts1.dump(temp_file)
    with kastore.load(temp_file) as store:
        all_data = dict(store)
    del all_data[f"reference_sequence/{attr}"]
    kastore.dump(all_data, temp_file)
    ts2 = tskit.load(temp_file)
    assert ts2.has_reference_sequence
    assert getattr(ts2.reference_sequence, attr) == ""

def handle(self):
    while True:
        try:
            data = kas.load(
                self.request.fileno(), engine=self.engine, read_all=True)
        except EOFError:
            break
        kas.dump(dict(data), self.request.fileno(), engine=self.engine)
    # We only read one list, so shut down the server straight away.
    self.server.shutdown()

def test_missing_metadata_schema(self, ts_fixture, tmp_path):
    ts1 = ts_fixture
    temp_file = tmp_path / "tmp.trees"
    ts1.dump(temp_file)
    with kastore.load(temp_file) as store:
        all_data = dict(store)
    del all_data["reference_sequence/metadata_schema"]
    kastore.dump(all_data, temp_file)
    ts2 = tskit.load(temp_file)
    assert ts2.has_reference_sequence
    assert repr(ts2.reference_sequence.metadata_schema) == ""

def test_load_from_pathlib_Path(self):
    data = {"a": np.arange(10)}
    kas.dump(data, str(self.temp_file), engine=self.engine)
    file_size = self.temp_file.stat().st_size
    for read_all in [True, False]:
        data_out = kas.load(self.temp_file, read_all=read_all, engine=self.engine)
        data2 = dict(data_out.items())
        file_size2 = self.temp_file.stat().st_size
        self.verify_dicts_equal(data, data2)
        self.assertEqual(file_size, file_size2)