def test_data_block_methods():
    env = make_test_env()
    db = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    strg = env.get_default_local_python_storage()
    records = [{"a": 1}]
    sdb = StoredDataBlockMetadata(
        id=get_stored_datablock_id(),
        data_block_id=db.id,
        data_block=db,
        storage_url=strg.url,
        data_format=RecordsFormat,
    )
    with env.md_api.begin():
        env.md_api.add(db)
        env.md_api.add(sdb)
        assert sdb.name is None
        name = sdb.get_name_for_storage()
        assert len(name) > 10
        assert sdb.name == name
        strg.get_api().put(sdb.get_name_for_storage(), records)
        assert db.inferred_schema(env) == TestSchema1
        assert db.nominal_schema(env) == TestSchema2
        assert db.realized_schema(env) == TestSchema3
        db.compute_record_count()
        assert db.record_count == 1
def append_records_to_stored_datablock(
    self, name: str, storage: Storage, sdb: StoredDataBlockMetadata
):
    self.resolve_new_object_with_data_block(sdb, name, storage)
    if sdb.data_format is None:
        fmt = infer_format_for_name(name, storage)
        # if sdb.data_format and sdb.data_format != fmt:
        #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
        if fmt is None:
            raise Exception(f"Could not infer format {name} on {storage}")
        sdb.data_format = fmt
    # TODO: to_format
    # TODO: make sure this handles no-ops (empty object, same storage)
    # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
    result = dcp.copy(
        from_name=name,
        from_storage=storage,
        to_name=sdb.get_name_for_storage(),
        to_storage=sdb.storage,
        to_format=sdb.data_format,
        available_storages=self.execution_context.storages,
        if_exists="append",
    )
    logger.debug(f"Copied {result}")
    logger.debug(f"REMOVING NAME {name}")
    storage.get_api().remove(name)
def convert_sdb(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    conversion_path: ConversionPath,
    target_storage: Storage,
    storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    if not conversion_path.conversions:
        return sdb
    if storages is None:
        storages = env.storages
    prev_sdb = sdb
    next_sdb: Optional[StoredDataBlockMetadata] = None
    prev_storage = sdb.storage
    next_storage: Optional[Storage] = None
    realized_schema = sdb.realized_schema(env, sess)
    for conversion_edge in conversion_path.conversions:
        conversion = conversion_edge.conversion
        target_storage_format = conversion.to_storage_format
        next_storage = select_storage(target_storage, storages, target_storage_format)
        logger.debug(
            f"CONVERSION: {conversion.from_storage_format} -> {conversion.to_storage_format}"
        )
        next_sdb = StoredDataBlockMetadata(  # type: ignore
            id=get_stored_datablock_id(),
            data_block_id=prev_sdb.data_block_id,
            data_block=prev_sdb.data_block,
            data_format=target_storage_format.data_format,
            storage_url=next_storage.url,
        )
        sess.add(next_sdb)
        conversion_edge.copier.copy(
            from_name=prev_sdb.get_name(),
            to_name=next_sdb.get_name(),
            conversion=conversion,
            from_storage_api=prev_storage.get_api(),
            to_storage_api=next_storage.get_api(),
            schema=realized_schema,
        )
        if (
            prev_sdb.data_format.is_python_format()
            and not prev_sdb.data_format.is_storable()
        ):
            # If the records obj is in python and not storable, and we just used it,
            # then it can be reused
            # TODO: Bit of a hack. Is there a central place we can do this?
            #       Also, is "reusable" a better name than "storable"?
            prev_storage.get_api().remove(prev_sdb.get_name())
            prev_sdb.data_block.stored_data_blocks.remove(prev_sdb)
            if prev_sdb in sess.new:
                sess.expunge(prev_sdb)
            else:
                sess.delete(prev_sdb)
        prev_sdb = next_sdb
        prev_storage = next_storage
    return next_sdb
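# A hedged invariant sketch, not from the source: convert_sdb creates each hop's
# SDB with the same data_block_id, so the returned SDB should reference the same
# underlying DataBlock as the input (trivially true for an empty path, which
# returns the input unchanged). The function name and arguments are illustrative.
def example_convert_preserves_block(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    path: ConversionPath,
    target: Storage,
) -> StoredDataBlockMetadata:
    out = convert_sdb(env, sess, sdb, path, target)
    # Same logical block, possibly new storage and format
    assert out.data_block_id == sdb.data_block_id
    return out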
def resolve_new_object_with_data_block(
    self, sdb: StoredDataBlockMetadata, name: str, storage: Storage
):
    # Too expensive to infer the schema every time, so only do it the first time
    if sdb.data_block.realized_schema_key in (None, "Any", "core.Any"):
        handler = get_handler_for_name(name, storage)
        inferred_schema = handler().infer_schema(name, storage)
        logger.debug(
            f"Inferred schema: {inferred_schema.key} {inferred_schema.fields_summary()}"
        )
        self.env.add_new_generated_schema(inferred_schema)
        # Cast to nominal if no existing realized schema
        realized_schema = cast_to_realized_schema(
            self.env,
            inferred_schema=inferred_schema,
            nominal_schema=sdb.nominal_schema(self.env),
        )
        logger.debug(
            f"Realized schema: {realized_schema.key} {realized_schema.fields_summary()}"
        )
        self.env.add_new_generated_schema(realized_schema)
        sdb.data_block.realized_schema_key = realized_schema.key
    # # If already a realized schema, conform new inferred schema to existing realized
    # realized_schema = cast_to_realized_schema(
    #     self.env,
    #     inferred_schema=inferred_schema,
    #     nominal_schema=sdb.data_block.realized_schema(self.env),
    # )
    if sdb.data_block.nominal_schema_key:
        logger.debug(
            f"Nominal schema: {sdb.data_block.nominal_schema_key} {sdb.data_block.nominal_schema(self.env).fields_summary()}"
        )
def resolve_new_object_with_data_block(
    self, sdb: StoredDataBlockMetadata, name: str, storage: Storage
):
    handler = get_handler_for_name(name, storage)
    inferred_schema = handler().infer_schema(name, storage)
    self.env.add_new_generated_schema(inferred_schema)
    if sdb.data_block.realized_schema_key in (None, "Any"):
        # Cast to nominal if no existing realized schema
        realized_schema = cast_to_realized_schema(
            self.env,
            inferred_schema=inferred_schema,
            nominal_schema=sdb.nominal_schema(self.env),
        )
    else:
        # If already a realized schema, conform new inferred schema to existing realized
        realized_schema = cast_to_realized_schema(
            self.env,
            inferred_schema=inferred_schema,
            nominal_schema=sdb.data_block.realized_schema(self.env),
        )
    self.env.add_new_generated_schema(realized_schema)
    sdb.data_block.realized_schema_key = realized_schema.key
    logger.debug(
        f"Inferred schema: {inferred_schema.key} {inferred_schema.fields_summary()}"
    )
    logger.debug(
        f"Realized schema: {realized_schema.key} {realized_schema.fields_summary()}"
    )
    if sdb.data_block.nominal_schema_key:
        logger.debug(
            f"Nominal schema: {sdb.data_block.nominal_schema_key} {sdb.data_block.nominal_schema(self.env).fields_summary()}"
        )
def append_records_to_stored_datablock(
    self, name: str, storage: Storage, sdb: StoredDataBlockMetadata
):
    self.resolve_new_object_with_data_block(sdb, name, storage)
    if sdb.data_format is None:
        sdb.data_format = (
            self.execution_context.target_format
            or sdb.storage.storage_engine.get_natural_format()
        )
        # fmt = infer_format_for_name(name, storage)
        # # if sdb.data_format and sdb.data_format != fmt:
        # #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
        # if fmt is None:
        #     raise Exception(f"Could not infer format {name} on {storage}")
        # sdb.data_format = fmt
    # TODO: make sure this handles no-ops (empty object, same storage)
    # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
    # to_name = sdb.get_name_for_storage()
    # if storage == sdb.storage:
    #     # Same storage
    #     if name == to_name:
    #         # Nothing to do
    #         logger.debug("Output already on storage with same name, nothing to do")
    #         return
    #     else:
    #         # Same storage, just new name
    #         # TODO: should be "rename" ideally (as it is if tmp gets deleted we lose it)
    #         logger.debug("Output already on storage, creating alias")
    #         storage.get_api().create_alias(name, to_name)
    #         return
    logger.debug(
        f"Copying output from {name} {storage} to {sdb.get_name_for_storage()} {sdb.storage} ({sdb.data_format})"
    )
    result = dcp.copy(
        from_name=name,
        from_storage=storage,
        to_name=sdb.get_name_for_storage(),
        to_storage=sdb.storage,
        to_format=sdb.data_format,
        available_storages=self.execution_context.storages,
        if_exists="append",
    )
    logger.debug(f"Copied {result}")
    logger.debug(f"REMOVING NAME {name}")
    storage.get_api().remove(name)
def copy_sdb(
    env: Environment,
    request: CopyRequest,
    in_sdb: StoredDataBlockMetadata,
    out_sdb: StoredDataBlockMetadata,
    # target_storage: Storage,
    # storages: Optional[List[Storage]] = None,
    create_intermediate_sdbs: bool = True,
):
    result = execute_copy_request(request)
    if create_intermediate_sdbs:
        for name, storage, fmt in result.intermediate_created:
            i_sdb = StoredDataBlockMetadata(  # type: ignore
                id=get_stored_datablock_id(),
                data_block_id=in_sdb.data_block_id,
                data_block=in_sdb.data_block,
                data_format=fmt,
                storage_url=storage.url,
            )
            storage.get_api().create_alias(name, i_sdb.get_name_for_storage())
            env.md_api.add(i_sdb)
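# A hedged usage sketch, not from the source: driving copy_sdb with a hand-built
# CopyRequest, mirroring the field names used by ensure_data_block_on_storage
# below. The function name and the assumption that out_sdb already exists with a
# resolved format are illustrative.
def example_copy_between_sdbs(
    env: Environment,
    in_sdb: StoredDataBlockMetadata,
    out_sdb: StoredDataBlockMetadata,
):
    req = CopyRequest(
        from_name=in_sdb.get_name_for_storage(),
        from_storage=in_sdb.storage,
        to_name=out_sdb.get_name_for_storage(),
        to_storage=out_sdb.storage,
        to_format=out_sdb.data_format,
        schema=in_sdb.realized_schema(env),
        available_storages=env.storages,
    )
    copy_sdb(env, request=req, in_sdb=in_sdb, out_sdb=out_sdb)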
def create_stored_datablock(self) -> StoredDataBlockMetadata:
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=None,
        nominal_schema_key=None,
        realized_schema_key="Any",
        record_count=None,
        created_by_node_key=self.node.key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_stored_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=self.execution_context.target_storage.url,
        data_format=None,
    )
    return sdb
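# A hedged end-to-end sketch, assuming `mgr` is whatever object exposes
# create_stored_datablock and append_records_to_stored_datablock above (the
# source doesn't show its class); `records` and the temporary name are
# illustrative. Mirrors the flow in test_data_block_methods: put records under
# a temp name on the local python storage, then append them into a fresh SDB.
def example_write_records(mgr, env: Environment, records: List[dict]) -> StoredDataBlockMetadata:
    storage = env.get_default_local_python_storage()
    tmp_name = "_tmp_output"  # hypothetical temporary name
    storage.get_api().put(tmp_name, records)
    sdb = mgr.create_stored_datablock()
    # Resolves schemas, copies to the SDB's storage, and removes tmp_name
    mgr.append_records_to_stored_datablock(tmp_name, storage, sdb)
    return sdb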
def create_alias(self, sdb: StoredDataBlockMetadata) -> Optional[Alias]:
    self.metadata_api.flush([sdb.data_block, sdb])
    alias = sdb.create_alias(self.env, self.node.get_alias())
    self.metadata_api.flush([alias])
    return alias
def ensure_data_block_on_storage(
    env: Environment,
    block: DataBlockMetadata,
    storage: Storage,
    fmt: Optional[DataFormat] = None,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    if eligible_storages is None:
        eligible_storages = env.storages
    sdbs = select(StoredDataBlockMetadata).filter(
        StoredDataBlockMetadata.data_block == block
    )
    match = sdbs.filter(StoredDataBlockMetadata.storage_url == storage.url)
    if fmt:
        match = match.filter(StoredDataBlockMetadata.data_format == fmt)
    matched_sdb = env.md_api.execute(match).scalar_one_or_none()
    if matched_sdb is not None:
        return matched_sdb
    # logger.debug(f"{cnt} SDBs total")
    existing_sdbs = sdbs.filter(
        # DO NOT fetch memory SDBs that aren't of the current runtime (since we can't get them!)
        # TODO: clean up memory SDBs when the memory goes away? Doesn't make sense to persist them really.
        #       Should be a separate in-memory lookup for memory SDBs, so they naturally expire?
        or_(
            ~StoredDataBlockMetadata.storage_url.startswith("python:"),
            StoredDataBlockMetadata.storage_url == env._local_python_storage.url,
        ),
    )
    # logger.debug(
    #     f"{existing_sdbs.count()} SDBs on-disk or in local memory (local: {self.ctx.local_python_storage.url})"
    # )
    if eligible_storages:
        existing_sdbs = existing_sdbs.filter(
            StoredDataBlockMetadata.storage_url.in_(s.url for s in eligible_storages),
        )
    # logger.debug(f"{existing_sdbs.count()} SDBs in eligible storages")
    fmt = fmt or storage.storage_engine.get_natural_format()
    target_storage_format = StorageFormat(storage.storage_engine, fmt)

    # Compute conversion costs
    eligible_conversion_paths = []  #: List[List[Tuple[ConversionCostLevel, Type[Converter]]]]
    existing_sdbs = list(env.md_api.execute(existing_sdbs).scalars())
    for sdb in existing_sdbs:
        req = CopyRequest(
            from_name=sdb.get_name_for_storage(),
            from_storage=sdb.storage,
            to_name="placeholder",
            to_storage=storage,
            to_format=fmt,
            schema=sdb.realized_schema(env),
            available_storages=eligible_storages,
        )
        pth = get_copy_path(req)
        if pth is not None:
            eligible_conversion_paths.append((pth.total_cost, pth, sdb, req))
    if not eligible_conversion_paths:
        raise NotImplementedError(
            f"No copy path to {target_storage_format} for existing StoredDataBlocks {existing_sdbs}"
        )
    cost, pth, in_sdb, req = min(eligible_conversion_paths, key=lambda x: x[0])
    out_sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_stored_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        data_format=fmt,
        storage_url=storage.url,
    )
    env.md_api.add(out_sdb)
    req.to_name = out_sdb.get_name_for_storage()
    copy_sdb(
        env,
        request=req,
        in_sdb=in_sdb,
        out_sdb=out_sdb,
    )
    return out_sdb
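# A hedged usage sketch, not from the source: materialize an existing block on a
# database storage, letting ensure_data_block_on_storage pick the cheapest copy
# path and the storage engine's natural format. The URL and the
# Storage-from-url constructor are assumptions.
def example_materialize(env: Environment, block: DataBlockMetadata) -> StoredDataBlockMetadata:
    target = Storage("postgresql://localhost/mydb")  # hypothetical storage
    return ensure_data_block_on_storage(env, block, target)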
def ensure_alias(sess: Session, node: Node, sdb: StoredDataBlockMetadata) -> Alias:
    logger.debug(
        f"Creating alias {node.get_alias()} for node {node.key} on storage {sdb.storage_url}"
    )
    return sdb.create_alias(sess, node.get_alias())