def test_records_to_db(url):
    """Round-trip Records from local python memory into a temp database table."""
    storage: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = storage.storage_engine.get_api_cls()
    if not storage.get_api().dialect_is_supported():
        # Client library for this dialect isn't installed: warn and skip.
        warnings.warn(
            f"Skipping tests for database engine {storage.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        # Records
        # NOTE(review): `records` resolves from an enclosing scope (fixture or
        # module-level sample data) — confirm where it is defined.
        mem_api.put(name, as_records(records))
        conversion = Conversion(
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
        )
        copy_records_to_db.copy(
            name, name, conversion, mem_api, db_api, schema=TestSchema4
        )
        with db_api.execute_sql_result(f"select * from {name}") as res:
            assert [dict(row) for row in res] == records
def test_data_copy_lookup():
    """Lookup should return the copier whose constraints match the conversion."""

    @datacopy(
        cost=NoOpCost,
        from_storage_classes=[FileSystemStorageClass],
        unregistered=True,
    )
    def noop_all(*args):
        pass

    @datacopy(
        from_storage_classes=[DatabaseStorageClass],
        from_data_formats=[DatabaseTableFormat],
        to_storage_classes=[PythonStorageClass],
        to_data_formats=[RecordsFormat],
        cost=NetworkToMemoryCost,
        unregistered=True,
    )
    def db_to_mem(*args):
        pass

    lookup = get_datacopy_lookup(copiers=[noop_all, db_to_mem])
    requested = Conversion(
        StorageFormat(PostgresStorageEngine, DatabaseTableFormat),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )
    chosen = lookup.get_lowest_cost(requested)
    # db_to_mem covers database -> python memory; noop_all only covers filesystem sources.
    assert chosen is db_to_mem
def test_mem_to_mem(from_fmt, to_fmt):
    """Convert between in-memory formats along the lowest-cost conversion path."""
    from_fmt, obj = from_fmt
    to_fmt, expected = to_fmt
    if from_fmt == to_fmt:
        # Identity conversion: nothing to verify.
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    src_name = "_from_test"
    dst_name = "_to_test"
    mem_api.put(src_name, as_records(obj(), data_format=from_fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, from_fmt),
        StorageFormat(LocalPythonStorageEngine, to_fmt),
    )
    path = get_datacopy_lookup().get_lowest_cost_path(conversion)
    # Apply each hop in the path, chaining the output of one step into the next.
    for i, edge in enumerate(path.conversions):
        edge.copier.copy(
            src_name, dst_name, edge.conversion, mem_api, mem_api, schema=TestSchema4
        )
        src_name = dst_name
        dst_name = dst_name + str(i)
    final_name = src_name
    if isinstance(expected, pd.DataFrame):
        assert_dataframes_are_almost_equal(
            mem_api.get(final_name).records_object, expected
        )
    else:
        assert list(mem_api.get(final_name).records_object) == list(expected())
def copy_lowest_cost(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    target_storage: Storage,
    target_format: DataFormat,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    """Copy `sdb` to `target_storage` in `target_format` via the cheapest path.

    Defaults `eligible_storages` to all storages known to the environment.

    Raises:
        CopyPathDoesNotExist: when no conversion path reaches the target format.
    """
    storages = env.storages if eligible_storages is None else eligible_storages
    target = StorageFormat(target_storage.storage_engine, target_format)
    path = get_copy_path_for_sdb(sdb, target, storages)
    if path is None:
        raise CopyPathDoesNotExist(
            f"Copying {sdb} to format {target_format} on storage {target_storage}"
        )
    return convert_sdb(
        env,
        sess=sess,
        sdb=sdb,
        conversion_path=path,
        target_storage=target_storage,
        storages=storages,
    )
def test_file_to_mem():
    """Copy a delimited file on filesystem storage into memory as Records."""
    tmp_dir = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp_dir}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fs_api.write_lines_to_file(name, ["f1,f2", "hi,2"])
    # Records — expected result after parsing (f2 coerced per TestSchema4)
    expected = [{"f1": "hi", "f2": 2}]
    conversion = Conversion(
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )
    copy_delim_file_to_records.copy(
        name, name, conversion, fs_api, mem_api, schema=TestSchema4
    )
    assert mem_api.get(name).records_object == expected
def get_copy_path_for_sdb(
    sdb: StoredDataBlockMetadata,
    target_format: StorageFormat,
    storages: List[Storage],
) -> Optional[ConversionPath]:
    """Return the cheapest conversion path from `sdb`'s format to `target_format`.

    Returns an empty ConversionPath when the block already matches the target,
    or None when no path exists using the given storages' engines.
    """
    source_format = StorageFormat(sdb.storage.storage_engine, sdb.data_format)
    if source_format == target_format:
        # Already exists, do nothing
        return ConversionPath()
    available_engines = {s.storage_engine for s in storages}
    lookup = get_datacopy_lookup(available_storage_engines=available_engines)
    return lookup.get_lowest_cost_path(Conversion(source_format, target_format))
def test_obj_to_file():
    """Copy an in-memory delimited-file object out to filesystem storage."""
    tmp_dir = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp_dir}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = DelimitedFileObjectFormat

    # Factory so we can produce a fresh, unconsumed stream for the final comparison.
    def make_obj():
        return StringIO("f1,f2\nhi,2")

    mem_api.put(name, as_records(make_obj(), data_format=fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
    )
    copy_file_object_to_delim_file.copy(
        name, name, conversion, mem_api, fs_api, schema=TestSchema4
    )
    with fs_api.open(name) as f:
        assert f.read() == make_obj().read()
def test_db_to_mem(url):
    """Copy a database table into python memory as Records and as a cursor."""
    storage: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = storage.storage_engine.get_api_cls()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    if not storage.get_api().dialect_is_supported():
        # Client library for this dialect isn't installed: skip.
        return
    with api_cls.temp_local_database() as db_url:
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        name = "_test"
        db_api.execute_sql(f"create table {name} as select 1 a, 2 b")
        # Records
        conversion = Conversion(
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
        )
        copy_db_to_records.copy(name, name, conversion, db_api, mem_api)
        assert mem_api.get(name).records_object == [{"a": 1, "b": 2}]
        # DatabaseCursor
        conversion = Conversion(
            StorageFormat(storage.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, DatabaseCursorFormat),
        )
        copy_db_to_records.copy(name, name, conversion, db_api, mem_api)
        assert list(mem_api.get(name).records_object) == [{"a": 1, "b": 2}]
def test_records_to_file():
    """Copy Records from memory to a delimited file and read them back."""
    tmp_dir = tempfile.gettempdir()
    storage: Storage = Storage.from_url(f"file://{tmp_dir}")
    fs_api: FileSystemStorageApi = storage.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = RecordsFormat
    original = [{"f1": "hi", "f2": 2}]
    mem_api.put(name, as_records(original, data_format=fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(storage.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(
        name, name, conversion, mem_api, fs_api, schema=TestSchema4
    )
    with fs_api.open(name) as f:
        # Conform read-back rows to the schema before comparing (csv yields strings).
        round_tripped = RecordsFormat.conform_records_to_schema(
            list(read_csv(f)), TestSchema4
        )
        assert round_tripped == original
lkup = get_datacopy_lookup(copiers=[noop_all, db_to_mem]) dcp = lkup.get_lowest_cost( Conversion( StorageFormat(PostgresStorageEngine, DatabaseTableFormat), StorageFormat(LocalPythonStorageEngine, RecordsFormat), )) assert dcp is db_to_mem @pytest.mark.parametrize( "conversion,length", [ # Memory to DB ( ( StorageFormat(LocalPythonStorageEngine, RecordsFormat), StorageFormat(PostgresStorageEngine, DatabaseTableFormat), ), 1, ), ( ( StorageFormat(LocalPythonStorageEngine, DataFrameFormat), StorageFormat(PostgresStorageEngine, DatabaseTableFormat), ), 2, ), ( ( StorageFormat(LocalPythonStorageEngine, DataFrameIteratorFormat),
def get_storage_format(self) -> StorageFormat:
    """Return the (storage engine, data format) pair describing this stored block."""
    engine = self.storage.storage_engine
    return StorageFormat(engine, self.data_format)
def ensure_data_block_on_storage(
    env: Environment,
    sess: Session,
    block: DataBlockMetadata,
    storage: Storage,
    fmt: Optional[DataFormat] = None,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    """Ensure `block` has a stored copy on `storage` (optionally in `fmt`).

    Returns the existing StoredDataBlockMetadata when one already matches the
    target storage (and format, if given); otherwise converts the existing
    copy with the cheapest conversion path and returns the resulting SDB.

    Raises:
        NotImplementedError: when no existing copy can be converted to the target.
    """
    if eligible_storages is None:
        eligible_storages = env.storages
    all_sdbs = sess.query(StoredDataBlockMetadata).filter(
        StoredDataBlockMetadata.data_block == block
    )
    on_target = all_sdbs.filter(
        StoredDataBlockMetadata.storage_url == storage.url
    )
    if fmt:
        on_target = on_target.filter(StoredDataBlockMetadata.data_format == fmt)
    existing_on_target = on_target.first()
    if existing_on_target is not None:
        return existing_on_target
    # DO NOT fetch memory SDBs that aren't of current runtime (since we can't get them!)
    # TODO: clean up memory SDBs when the memory goes away? Doesn't make sense to persist them really
    # Should be a separate in-memory lookup for memory SDBs, so they naturally expire?
    candidates = all_sdbs.filter(
        or_(
            ~StoredDataBlockMetadata.storage_url.startswith("python:"),
            StoredDataBlockMetadata.storage_url == storage.url,
        )
    )
    if eligible_storages:
        candidates = candidates.filter(
            StoredDataBlockMetadata.storage_url.in_(
                s.url for s in eligible_storages
            ),
        )
    fmt = fmt or storage.storage_engine.get_natural_format()
    target_storage_format = StorageFormat(storage.storage_engine, fmt)
    # Rank every candidate copy by the total cost of converting it to the target.
    existing_sdbs = list(candidates)
    ranked = []
    for sdb in existing_sdbs:
        path = get_copy_path_for_sdb(sdb, target_storage_format, eligible_storages)
        if path is not None:
            ranked.append((path.total_cost, path, sdb))
    if not ranked:
        raise NotImplementedError(
            f"No converter to {target_storage_format} for existing StoredDataBlocks {existing_sdbs}"
        )
    # Compare on cost only; paths/SDBs themselves may not be orderable.
    _, best_path, source_sdb = min(ranked, key=lambda entry: entry[0])
    return convert_sdb(
        env,
        sess=sess,
        sdb=source_sdb,
        conversion_path=best_path,
        target_storage=storage,
        storages=eligible_storages,
    )