示例#1
0
def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    The output is read from the file named after `step_uuid` inside the
    step's data directory (as given by ``Config.get_step_data_dir``).

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found. The underlying ``FileNotFoundError`` is attached as
            the ``__cause__``.
        DeserializationError: If the data could not be deserialized.
            The underlying error is attached as the ``__cause__``.
    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    # NOTE: `FileNotFoundError` is a subclass of `OSError` (of which
    # `IOError` is an alias), so this handler has to come before the
    # more general one below to keep the two error cases apart.
    except FileNotFoundError as err:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from err
    # IOError is to try to catch pyarrow failures on opening the file.
    except (pickle.UnpicklingError, IOError) as err:
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" ({full_path}) '
            "could not be deserialized.") from err
示例#2
0
def _get_output_memory(step_uuid: str, consumer: Optional[str] = None) -> Any:
    """Gets data from memory.

    Args:
        step_uuid: The UUID of the step to get output data from.
        consumer: The consumer of the output data. This is put inside
            the metadata of an empty object to trigger a notification in
            the plasma store, which is then used to manage eviction of
            objects.

    Returns:
        Data from step identified by `step_uuid`.

    Raises:
        DeserializationError: If the data could not be deserialized.
            The underlying error is attached as the ``__cause__``.
        MemoryOutputNotFoundError: If output from `step_uuid` cannot be
            found. The underlying ``ObjectNotFoundError`` is attached as
            the ``__cause__``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist.
            Which might be because the specified value was wrong or the
            store died.
    """
    client = _PlasmaConnector().client

    obj_id = _convert_uuid_to_object_id(step_uuid)
    try:
        obj = _deserialize_output_memory(obj_id, client)
    except error.ObjectNotFoundError as err:
        raise error.MemoryOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it.") from err
    # IOError is to try to catch pyarrow deserialization errors.
    except (pickle.UnpicklingError, IOError) as err:
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" could not be deserialized.'
        ) from err
    else:
        # TODO: note somewhere (maybe in the docstring) that it might
        #       although very unlikely raise MemoryError, because the
        #       receive is now actually also outputting data.
        # NOTE: the "ORCHEST_MEMORY_EVICTION" ENV variable is set in the
        # orchest-api. Now we always know when we are running inside a
        # jupyter kernel interactively. And in that case we never want
        # to do eviction.
        if os.getenv("ORCHEST_MEMORY_EVICTION") is not None:
            # The notification is an empty payload whose metadata
            # carries "<identifier>;<step_uuid>,<consumer>".
            # NOTE: if `consumer` is None it is formatted as the
            # literal text "None" in that metadata.
            empty_obj, _ = _serialize("")
            msg = f"{Config.IDENTIFIER_EVICTION};{step_uuid},{consumer}"
            metadata = bytes(msg, "utf-8")
            _output_to_memory(empty_obj, client, metadata=metadata)

    return obj