def test_data_block_methods():
    """Exercise naming, schema lookup and record counting on a data block.

    Builds a DataBlockMetadata with three distinct schema keys and one
    StoredDataBlockMetadata for it on the environment's default local python
    storage, then checks inside a session that:

    * the stored block has no name until ``get_name()`` is called, after
      which the returned name is kept on ``name``;
    * each schema accessor returns the schema matching its key;
    * ``compute_record_count()`` sets ``record_count`` to 1 for the single
      record that was put into storage.
    """
    env = make_test_env()
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    local_storage = env.get_default_local_python_storage()
    payload = as_records([{"a": 1}])
    stored_block = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=RecordsFormat,
    )
    with env.session_scope() as sess:
        sess.add(block)
        sess.add(stored_block)

        # The name is only assigned once it is first requested.
        assert stored_block.name is None
        generated_name = stored_block.get_name()
        assert len(generated_name) > 10
        assert stored_block.name == generated_name

        # Put the records under the generated name so they can be counted below.
        local_storage.get_api().put(stored_block.name, payload)

        expected_schemas = (
            (block.inferred_schema, TestSchema1),
            (block.nominal_schema, TestSchema2),
            (block.realized_schema, TestSchema3),
        )
        for resolve_schema, expected in expected_schemas:
            assert resolve_schema(env, sess) == expected

        block.compute_record_count()
        assert block.record_count == 1
def convert_sdb(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    conversion_path: ConversionPath,
    target_storage: Storage,
    storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    """Apply every conversion in ``conversion_path`` to ``sdb``, hop by hop.

    For each conversion edge a storage is chosen with ``select_storage``, a
    new StoredDataBlockMetadata for the same data block is created and added
    to ``sess``, and the edge's copier copies the data from the previous
    stored block into the new one using the realized schema of ``sdb``.

    Args:
        env: Environment; supplies ``env.storages`` when ``storages`` is None
            and is passed to ``sdb.realized_schema``.
        sess: Session that new stored blocks are added to and that removed
            intermediates are expunged or deleted from.
        sdb: The stored data block to start from.
        conversion_path: Path whose ``conversions`` are applied in order.
        target_storage: Passed to ``select_storage`` for every hop.
        storages: Candidate storages for ``select_storage``; defaults to
            ``env.storages``.

    Returns:
        ``sdb`` itself when the path has no conversions, otherwise the stored
        data block created by the last conversion.
    """
    edges = conversion_path.conversions
    if not edges:
        # Nothing to convert: hand the input back untouched.
        return sdb

    candidate_storages = env.storages if storages is None else storages

    current_sdb = sdb
    current_storage = sdb.storage
    converted_sdb: Optional[StoredDataBlockMetadata] = None
    converted_storage: Optional[Storage] = None
    schema = sdb.realized_schema(env, sess)

    for edge in edges:
        conversion = edge.conversion
        to_format = conversion.to_storage_format
        converted_storage = select_storage(
            target_storage, candidate_storages, to_format
        )
        logger.debug(
            f"CONVERSION: {conversion.from_storage_format} -> {conversion.to_storage_format}"
        )

        # The new stored block belongs to the same data block as its source.
        converted_sdb = StoredDataBlockMetadata(  # type: ignore
            id=get_datablock_id(),
            data_block_id=current_sdb.data_block_id,
            data_block=current_sdb.data_block,
            data_format=to_format.data_format,
            storage_url=converted_storage.url,
        )
        sess.add(converted_sdb)

        run_copy = edge.copier.copy
        source_name = current_sdb.get_name()
        destination_name = converted_sdb.get_name()
        source_api = current_storage.get_api()
        destination_api = converted_storage.get_api()
        run_copy(
            from_name=source_name,
            to_name=destination_name,
            conversion=conversion,
            from_storage_api=source_api,
            to_storage_api=destination_api,
            schema=schema,
        )

        source_format = current_sdb.data_format
        if source_format.is_python_format() and not source_format.is_storable():
            # The source was a python, non-storable records object that the
            # copy above has just used, so drop it from its storage, from its
            # data block, and from the session.
            # TODO: Bit of a hack. Is there a central place we can do this?
            # Also, is "reusable" a better name than "storable"?
            current_storage.get_api().remove(current_sdb.get_name())
            current_sdb.data_block.stored_data_blocks.remove(current_sdb)
            # Objects still pending in the session are expunged; anything
            # else is marked for deletion.
            discard = sess.expunge if current_sdb in sess.new else sess.delete
            discard(current_sdb)

        # The block just written becomes the source of the next hop.
        current_sdb, current_storage = converted_sdb, converted_storage

    return converted_sdb
def create_stored_datablock(self) -> StoredDataBlockMetadata:
    """Build a fresh data block and a stored data block that wraps it.

    The data block gets a new id, no inferred or nominal schema key, the
    realized schema key ``"Any"``, no record count, and this node's key as
    its creator. The stored data block gets its own new id, points at the
    execution context's target storage URL, and has no data format yet.

    This method does not itself add either object to a session.

    Returns:
        The new StoredDataBlockMetadata; the data block is reachable through
        its ``data_block`` attribute.
    """
    data_block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=None,
        nominal_schema_key=None,
        realized_schema_key="Any",
        record_count=None,
        created_by_node_key=self.node.key,
    )
    stored_id = get_stored_datablock_id()
    owning_block_id = data_block.id
    target_url = self.execution_context.target_storage.url
    return StoredDataBlockMetadata(  # type: ignore
        id=stored_id,
        data_block_id=owning_block_id,
        data_block=data_block,
        storage_url=target_url,
        data_format=None,
    )