示例#1
0
def test_data_block_methods():
    """Exercise schema lookup, lazy storage naming, and record counting."""
    env = make_test_env()
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    local_storage = env.get_default_local_python_storage()
    data = [{"a": 1}]
    stored = StoredDataBlockMetadata(
        id=get_stored_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=RecordsFormat,
    )
    with env.md_api.begin():
        env.md_api.add(block)
        env.md_api.add(stored)
        # The storage name is generated lazily on first request
        assert stored.name is None
        generated_name = stored.get_name_for_storage()
        assert len(generated_name) > 10
        assert stored.name == generated_name
        local_storage.get_api().put(stored.get_name_for_storage(), data)
        # Schema keys resolve to the actual schema objects via the env
        assert block.inferred_schema(env) == TestSchema1
        assert block.nominal_schema(env) == TestSchema2
        assert block.realized_schema(env) == TestSchema3
        block.compute_record_count()
        assert block.record_count == 1
示例#2
0
 def append_records_to_stored_datablock(self, name: str, storage: Storage,
                                        sdb: StoredDataBlockMetadata):
     """Copy the object `name` on `storage` into `sdb`'s storage, appending
     to any existing data, then remove the source object.

     If the SDB has no declared data format yet, it is inferred from the
     source object; failure to infer is an error.
     """
     self.resolve_new_object_with_data_block(sdb, name, storage)
     if sdb.data_format is None:
         fmt = infer_format_for_name(name, storage)
         if fmt is None:
             raise Exception(f"Could not infer format {name} on {storage}")
         sdb.data_format = fmt
     # TODO: to_format
     # TODO: make sure this handles no-ops (empty object, same storage)
     # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
     result = dcp.copy(
         from_name=name,
         from_storage=storage,
         to_name=sdb.get_name_for_storage(),
         to_storage=sdb.storage,
         to_format=sdb.data_format,
         available_storages=self.execution_context.storages,
         if_exists="append",
     )
     logger.debug(f"Copied {result}")
     logger.debug(f"REMOVING NAME {name}")
     # Source object is consumed by the copy; drop it from its storage
     storage.get_api().remove(name)
def convert_sdb(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    conversion_path: ConversionPath,
    target_storage: Storage,
    storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    """Materialize `sdb` through each edge of `conversion_path`, returning
    the final StoredDataBlockMetadata on the target storage.

    Non-storable python-format intermediates are cleaned up as soon as the
    next conversion step has consumed them. Returns `sdb` unchanged if the
    path is empty.
    """
    if not conversion_path.conversions:
        return sdb
    if storages is None:
        storages = env.storages
    prev_sdb = sdb
    next_sdb: Optional[StoredDataBlockMetadata] = None
    prev_storage = sdb.storage
    next_storage: Optional[Storage] = None
    realized_schema = sdb.realized_schema(env, sess)
    for conversion_edge in conversion_path.conversions:
        conversion = conversion_edge.conversion
        target_storage_format = conversion.to_storage_format
        next_storage = select_storage(target_storage, storages,
                                      target_storage_format)
        logger.debug(
            f"CONVERSION: {conversion.from_storage_format} -> {conversion.to_storage_format}"
        )
        next_sdb = StoredDataBlockMetadata(  # type: ignore
            # Fix: stored blocks get stored-datablock ids (was get_datablock_id())
            id=get_stored_datablock_id(),
            data_block_id=prev_sdb.data_block_id,
            data_block=prev_sdb.data_block,
            data_format=target_storage_format.data_format,
            storage_url=next_storage.url,
        )
        sess.add(next_sdb)
        conversion_edge.copier.copy(
            from_name=prev_sdb.get_name(),
            to_name=next_sdb.get_name(),
            conversion=conversion,
            from_storage_api=prev_storage.get_api(),
            to_storage_api=next_storage.get_api(),
            schema=realized_schema,
        )
        if (prev_sdb.data_format.is_python_format()
                and not prev_sdb.data_format.is_storable()):
            # If the records obj is in python and not storable, and we just used it, then it can be reused
            # TODO: Bit of a hack. Is there a central place we can do this?
            #       also is reusable a better name than storable?
            prev_storage.get_api().remove(prev_sdb.get_name())
            prev_sdb.data_block.stored_data_blocks.remove(prev_sdb)
            # Expunge if never flushed, otherwise delete the persisted row
            if prev_sdb in sess.new:
                sess.expunge(prev_sdb)
            else:
                sess.delete(prev_sdb)
        prev_sdb = next_sdb
        prev_storage = next_storage
    return next_sdb
示例#4
0
 def resolve_new_object_with_data_block(self, sdb: StoredDataBlockMetadata,
                                        name: str, storage: Storage):
     """Infer a schema for the new object `name` and record a realized
     schema on the block, but only the first time (schema inference is
     expensive, so an already-realized schema is left untouched).
     """
     # TOO expensive to infer schema every time, so just do first time
     if sdb.data_block.realized_schema_key in (None, "Any", "core.Any"):
         handler = get_handler_for_name(name, storage)
         inferred_schema = handler().infer_schema(name, storage)
         logger.debug(
             f"Inferred schema: {inferred_schema.key} {inferred_schema.fields_summary()}"
         )
         self.env.add_new_generated_schema(inferred_schema)
         # Cast to nominal if no existing realized schema
         realized_schema = cast_to_realized_schema(
             self.env,
             inferred_schema=inferred_schema,
             nominal_schema=sdb.nominal_schema(self.env),
         )
         logger.debug(
             f"Realized schema: {realized_schema.key} {realized_schema.fields_summary()}"
         )
         self.env.add_new_generated_schema(realized_schema)
         sdb.data_block.realized_schema_key = realized_schema.key
     if sdb.data_block.nominal_schema_key:
         logger.debug(
             f"Nominal schema: {sdb.data_block.nominal_schema_key} {sdb.data_block.nominal_schema(self.env).fields_summary()}"
         )
示例#5
0
 def resolve_new_object_with_data_block(self, sdb: StoredDataBlockMetadata,
                                        name: str, storage: Storage):
     """Infer a schema for the new object `name`, cast it to a realized
     schema (against the nominal schema, or the existing realized schema if
     one is set), and record the result on the block.
     """
     schema_handler = get_handler_for_name(name, storage)
     inferred_schema = schema_handler().infer_schema(name, storage)
     self.env.add_new_generated_schema(inferred_schema)
     has_realized = sdb.data_block.realized_schema_key not in (None, "Any")
     if has_realized:
         # If already a realized schema, conform new inferred schema to existing realized
         cast_target = sdb.data_block.realized_schema(self.env)
     else:
         # Cast to nominal if no existing realized schema
         cast_target = sdb.nominal_schema(self.env)
     realized_schema = cast_to_realized_schema(
         self.env,
         inferred_schema=inferred_schema,
         nominal_schema=cast_target,
     )
     self.env.add_new_generated_schema(realized_schema)
     sdb.data_block.realized_schema_key = realized_schema.key
     logger.debug(
         f"Inferred schema: {inferred_schema.key} {inferred_schema.fields_summary()}"
     )
     logger.debug(
         f"Realized schema: {realized_schema.key} {realized_schema.fields_summary()}"
     )
     if sdb.data_block.nominal_schema_key:
         logger.debug(
             f"Nominal schema: {sdb.data_block.nominal_schema_key} {sdb.data_block.nominal_schema(self.env).fields_summary()}"
         )
示例#6
0
 def append_records_to_stored_datablock(self, name: str, storage: Storage,
                                        sdb: StoredDataBlockMetadata):
     """Copy the object `name` on `storage` into `sdb`'s storage, appending
     to any existing data, then remove the source object.

     If the SDB has no declared data format yet, default to the execution
     context's target format, falling back to the storage engine's natural
     format.
     """
     self.resolve_new_object_with_data_block(sdb, name, storage)
     if sdb.data_format is None:
         sdb.data_format = (self.execution_context.target_format or
                            sdb.storage.storage_engine.get_natural_format())
     # TODO: make sure this handles no-ops (empty object, same storage)
     # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
     logger.debug(
         f"Copying output from {name} {storage} to {sdb.get_name_for_storage()} {sdb.storage} ({sdb.data_format})"
     )
     result = dcp.copy(
         from_name=name,
         from_storage=storage,
         to_name=sdb.get_name_for_storage(),
         to_storage=sdb.storage,
         to_format=sdb.data_format,
         available_storages=self.execution_context.storages,
         if_exists="append",
     )
     logger.debug(f"Copied {result}")
     logger.debug(f"REMOVING NAME {name}")
     # Source object is consumed by the copy; drop it from its storage
     storage.get_api().remove(name)
示例#7
0
def copy_sdb(
    env: Environment,
    request: CopyRequest,
    in_sdb: StoredDataBlockMetadata,
    out_sdb: StoredDataBlockMetadata,
    create_intermediate_sdbs: bool = True,
):
    """Execute the copy `request` and, optionally, persist metadata for any
    intermediate objects the copy engine created along the way.

    Each intermediate gets its own StoredDataBlockMetadata (sharing
    `in_sdb`'s underlying data block) plus a storage alias so it can be
    located under its SDB name.

    NOTE(review): `out_sdb` is part of the call contract but unused here —
    the destination is carried by `request`; kept for interface stability.
    """
    result = execute_copy_request(request)
    if create_intermediate_sdbs:
        for name, storage, fmt in result.intermediate_created:
            i_sdb = StoredDataBlockMetadata(  # type: ignore
                id=get_stored_datablock_id(),
                data_block_id=in_sdb.data_block_id,
                data_block=in_sdb.data_block,
                data_format=fmt,
                storage_url=storage.url,
            )
            storage.get_api().create_alias(name, i_sdb.get_name_for_storage())
            env.md_api.add(i_sdb)
示例#8
0
 def create_stored_datablock(self) -> StoredDataBlockMetadata:
     """Build a fresh, empty stored block on the target storage.

     The underlying data block starts with an "Any" realized schema and no
     data format; both are resolved later when records arrive.
     """
     new_block = DataBlockMetadata(
         id=get_datablock_id(),
         inferred_schema_key=None,
         nominal_schema_key=None,
         realized_schema_key="Any",
         record_count=None,
         created_by_node_key=self.node.key,
     )
     return StoredDataBlockMetadata(  # type: ignore
         id=get_stored_datablock_id(),
         data_block_id=new_block.id,
         data_block=new_block,
         storage_url=self.execution_context.target_storage.url,
         data_format=None,
     )
示例#9
0
 def create_alias(self, sdb: StoredDataBlockMetadata) -> Optional[Alias]:
     """Flush pending block metadata, then alias `sdb` under this node's
     alias name; the new alias is flushed before being returned."""
     self.metadata_api.flush([sdb.data_block, sdb])
     new_alias = sdb.create_alias(self.env, self.node.get_alias())
     self.metadata_api.flush([new_alias])
     return new_alias
示例#10
0
def ensure_data_block_on_storage(
    env: Environment,
    block: DataBlockMetadata,
    storage: Storage,
    fmt: Optional[DataFormat] = None,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    """Return an SDB for `block` on `storage` (in `fmt` if given), copying
    from the cheapest-to-convert existing SDB if none is already there.

    Raises NotImplementedError when no copy path exists from any eligible
    existing SDB to the requested storage/format.
    """
    if eligible_storages is None:
        eligible_storages = env.storages
    # All SDBs for this block; narrow to an exact storage (+ format) match first
    sdbs = select(StoredDataBlockMetadata).filter(
        StoredDataBlockMetadata.data_block == block)
    match = sdbs.filter(StoredDataBlockMetadata.storage_url == storage.url)
    if fmt:
        match = match.filter(StoredDataBlockMetadata.data_format == fmt)
    matched_sdb = env.md_api.execute(match).scalar_one_or_none()
    if matched_sdb is not None:
        # Already materialized where/how we need it — nothing to do
        return matched_sdb

    # logger.debug(f"{cnt} SDBs total")
    existing_sdbs = sdbs.filter(
        # DO NOT fetch memory SDBs that aren't of current runtime (since we can't get them!)
        # TODO: clean up memory SDBs when the memory goes away? Doesn't make sense to persist them really
        # Should be a separate in-memory lookup for memory SDBs, so they naturally expire?
        or_(
            ~StoredDataBlockMetadata.storage_url.startswith("python:"),
            StoredDataBlockMetadata.storage_url ==
            env._local_python_storage.url,
        ), )
    # logger.debug(
    #     f"{existing_sdbs.count()} SDBs on-disk or in local memory (local: {self.ctx.local_python_storage.url})"
    # )
    if eligible_storages:
        existing_sdbs = existing_sdbs.filter(
            StoredDataBlockMetadata.storage_url.in_(
                s.url for s in eligible_storages), )
    # logger.debug(f"{existing_sdbs.count()} SDBs in eligible storages")
    # Default to the storage engine's natural format when none requested
    fmt = fmt or storage.storage_engine.get_natural_format()
    target_storage_format = StorageFormat(storage.storage_engine, fmt)

    # Compute conversion costs
    eligible_conversion_paths = (
        [])  #: List[List[Tuple[ConversionCostLevel, Type[Converter]]]] = []
    existing_sdbs = list(env.md_api.execute(existing_sdbs).scalars())
    for sdb in existing_sdbs:
        # Build a provisional copy request per candidate source SDB;
        # to_name is a placeholder until the destination SDB exists
        req = CopyRequest(
            from_name=sdb.get_name_for_storage(),
            from_storage=sdb.storage,
            to_name="placeholder",
            to_storage=storage,
            to_format=fmt,
            schema=sdb.realized_schema(env),
            available_storages=eligible_storages,
        )
        pth = get_copy_path(req)
        if pth is not None:
            eligible_conversion_paths.append((pth.total_cost, pth, sdb, req))
    if not eligible_conversion_paths:
        raise NotImplementedError(
            f"No copy path to {target_storage_format} for existing StoredDataBlocks {existing_sdbs}"
        )
    # Pick the candidate with the lowest total conversion cost
    cost, pth, in_sdb, req = min(eligible_conversion_paths, key=lambda x: x[0])
    out_sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_stored_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        data_format=fmt,
        storage_url=storage.url,
    )
    env.md_api.add(out_sdb)
    # Now that the destination SDB exists, give the request its real name
    req.to_name = out_sdb.get_name_for_storage()
    copy_sdb(
        env,
        request=req,
        in_sdb=in_sdb,
        out_sdb=out_sdb,
    )
    return out_sdb
def ensure_alias(sess: Session, node: Node,
                 sdb: StoredDataBlockMetadata) -> Alias:
    """Create (or refresh) the node's alias pointing at `sdb`."""
    alias_name = node.get_alias()
    logger.debug(
        f"Creating alias {alias_name} for node {node.key} on storage {sdb.storage_url}"
    )
    return sdb.create_alias(sess, alias_name)