def _get_store(path):
    url = "hfs://{}".format(path)
    store = storefact.get_store_from_url(url)
    store.delete = partial(_check_and_delete, store=store, delete_orig=store.delete)
    return store

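# `_check_and_delete` is referenced above but not defined in this section. A
# minimal sketch of what such a wrapper could look like, assuming it only
# verifies the key exists before delegating to the original delete (the name's
# behavior here is an assumption, not the actual implementation):
def _check_and_delete(key, store, delete_orig):
    # Hypothetical: fail loudly on missing keys, then delegate.
    if key not in store:
        raise KeyError(key)
    return delete_orig(key)
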
def test_complete():
    url, expected = ACTUAL_URL
    store = storefact.get_store_from_url(url)
    assert store.bucket_name == expected["bucket_name"]
    assert store._client.project == "central-splice-296415"
    with pytest.raises(RefreshError):
        store.get("somekey")

def setup(self, num_partitions, max_depth, num_leafs):
    self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))
    dataset_metadata = generate_metadata(max_depth, num_leafs)
    self.partitions = [generate_mp(dataset_metadata) for _ in range(num_partitions)]
    self.dataset_uuid = "dataset_uuid"
    self.user_dataset_metadata = {}

def reference_store():
    path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "..",
        "reference-data",
        "arrow-compat",
    )
    return get_store_from_url("hfs://{}".format(path))

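# A hedged usage sketch (assumption, not from the source): this store pairs
# with the generation script further below, which writes one Parquet reference
# file per pyarrow version. Assuming kartothek's
# ParquetSerializer.restore_dataframe and that a file named after the
# installed pyarrow version exists, reading one back could look like this:
def read_reference_dataframe():
    import pyarrow as pa

    from kartothek.serialization import ParquetSerializer

    return ParquetSerializer.restore_dataframe(reference_store(), pa.__version__)
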
def test_dask_partitions(metadata_version):
    """
    Create partitions for one table with dask and check that it can be read
    with kartothek.
    """
    import dask.dataframe

    bucket_dir = tempfile.mkdtemp()
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    os.mkdir("{}/{}".format(bucket_dir, dataset_uuid))
    table_dir = "{}/{}/core".format(bucket_dir, dataset_uuid)
    os.mkdir(table_dir)
    store = storefact.get_store_from_url("hfs://{}".format(bucket_dir))

    locations = ["L-{}".format(i) for i in range(2)]
    df = pd.DataFrame()
    for location in locations:
        core = pd.DataFrame(
            data={
                "date": np.array(
                    ["2017-11-23", "2017-11-23", "2017-11-24", "2017-11-24"]
                ),
                "product": np.array(["P-0", "P-1", "P-0", "P-1"]),
                "location": location,
                "value": np.array(random.sample(range(1, 100), 4)),
            }
        )
        df = pd.concat([df, core])

    ddf = dask.dataframe.from_pandas(df, npartitions=1)
    dask.dataframe.to_parquet(ddf, table_dir, partition_on=["location"])

    partition0 = "{}/core/location=L-0/part.0.parquet".format(dataset_uuid)
    partition1 = "{}/core/location=L-1/part.0.parquet".format(dataset_uuid)
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "partitions": {
            "location=L-0": {"files": {"core": partition0}},
            "location=L-1": {"files": {"core": partition1}},
        }
    }
    expected_tables = {"tables": {"core": ["date", "product", "value"]}}
    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(metadata).encode(),
    )
    metadata.update(expected_partitions)
    metadata.update(expected_tables)

    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    actual_partitions = dmd.to_dict()["partitions"]
    # we partition on location ID which has two values
    assert len(actual_partitions) == 2
    assert dmd.partition_keys == ["location"]

def store_input_types(request, tmpdir):
    url = f"hfs://{tmpdir}"

    if request.param == "URL":
        return url
    elif request.param == "KeyValue":
        return get_store_from_url(url)
    elif request.param == "Callable":
        return no_pickle_factory(url)
    else:
        raise RuntimeError(f"Encountered unknown store type {type(request.param)}")

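# `no_pickle_factory` is used above but not defined in this section. One
# plausible sketch, assuming it simply defers construction of a store that
# refuses to pickle (mirroring `no_pickle_store` further below); the exact
# shape is an assumption:
def no_pickle_factory(url):
    # Hypothetical: a zero-argument callable producing a non-picklable store.
    return partial(no_pickle_store, url)
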
def setup(self, number_values, number_partitions, dtype):
    py_type, arrow_type = dtype
    index_dct = {
        py_type(val): [str(part) for part in range(number_partitions)]
        for val in range(number_values)
    }
    self.column_name = "column"
    self.ktk_index = ExplicitSecondaryIndex(
        column=self.column_name, index_dct=index_dct, dtype=arrow_type
    )
    self.tmp_dir = tempfile.mkdtemp()
    self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
    self.dataset_uuid = "some_uuid"
    self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)
    self.ktk_index_not_loaded = ExplicitSecondaryIndex(
        column=self.column_name, index_storage_key=self.storage_key
    )
    self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

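# A hedged companion sketch (assumption, not part of the original benchmark):
# an asv-style timing method that queries the lazily loaded index, assuming
# `ExplicitSecondaryIndex.query` returns the partition labels stored for a
# given value.
def time_query(self, number_values, number_partitions, dtype):
    py_type, _ = dtype
    self.ktk_index_loaded.query(py_type(0))
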
def test_normalize_store(tmpdir, _type):
    store_url = f"hfs://{tmpdir}"
    store = get_store_from_url(store_url)
    store.put("test", b"")

    @normalize_args
    def func(store):
        assert isinstance(store, Callable)
        return store().keys()

    if _type == "callable":
        store_test = partial(get_store_from_url, store_url)
    elif _type == "url":
        store_test = store_url
    elif _type == "simplekv":
        store_test = store
    else:
        raise AssertionError(f"unknown parametrization {_type}")

    assert func(store_test)

def setup(self, num_partitions):
    self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))
    self.partitions = [generate_mp() for _ in range(num_partitions)]
    self.dataset_uuid = "dataset_uuid"

#!/usr/bin/env python
import os

import pyarrow as pa
from storefact import get_store_from_url

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

if __name__ == "__main__":
    ser = ParquetSerializer()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    store = get_store_from_url(f"hfs://{dir_path}")
    df = get_dataframe_alltypes()
    df["byte"] = b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    ref_file = f"{pa.__version__}"
    ser.store(store, ref_file, df)

def test_lazy_store_accepts_decorated_store():
    store = get_store_from_url("memory://")
    pstore = PrefixDecorator("pre", store)
    assert lazy_store(pstore)() is pstore

def store2(tmpdir):
    path = tmpdir.join("store2").strpath
    url = "hfs://{}".format(path)
    return storefact.get_store_from_url(url)

def no_pickle_store_from_url(url):
    store = storefact.get_store_from_url(url)
    return NoPickleDecorator(store)

def no_pickle_store(url):
    store = get_store_from_url(url)
    mark_nopickle(store)
    return store

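# `mark_nopickle` (and the `NoPickleDecorator` used above) appear here without
# their definitions. A minimal sketch of the marking variant, assuming the
# intent is only to make accidental pickling fail loudly in tests; the
# implementation details are assumptions:
def mark_nopickle(store):
    # Hypothetical: shadow __reduce_ex__ on the instance so pickling raises.
    def _no_pickle(*args, **kwargs):
        raise RuntimeError("store is intentionally not picklable")

    store.__reduce_ex__ = _no_pickle
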
def test_roundtrip():
    assert isinstance(
        storefact.get_store_from_url("memory://#wrap:readonly"),
        simplekv.decorator.ReadOnlyDecorator,
    )

def test_ensure_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert ensure_store(lambda: store) is store

def test_lazy_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert lazy_store(lambda: store)() is store

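# Taken together, the two tests above pin down the contract: `ensure_store`
# normalizes its argument to a concrete store, while `lazy_store` normalizes
# it to a zero-argument factory. A short usage sketch under that assumption:
store = get_store_from_url("memory://")
factory = lazy_store(lambda: store)
assert factory() is store              # the factory materializes the same store
assert ensure_store(factory) is store  # ensure_store unwraps the factory
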
def setup(self, num_rows, chunk_size):
    self.df = get_dataframe_not_nested(num_rows)
    self.serialiser = ParquetSerializer(chunk_size=chunk_size)
    self.store = get_store_from_url("memory://")
    self.key = self.serialiser.store(self.store, "key_prefix", self.df)
    self.predicates = [[("int16", "==", 123)]]

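# A hedged companion to the setup above (assumption: the benchmark body reads
# the serialized DataFrame back with predicate pushdown; `restore_dataframe`
# with a `predicates` argument is part of kartothek's serializer API):
def time_restore_with_predicates(self, num_rows, chunk_size):
    self.serialiser.restore_dataframe(
        self.store, self.key, predicates=self.predicates
    )
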
def store_session(tmpdir_factory):
    path = tmpdir_factory.mktemp("fsstore_test")
    path = path.realpath()
    url = "hfs://{}".format(path)
    return storefact.get_store_from_url(url)