def _get_output_disk(step_uuid: str, serialization: str) -> Any:
    """Gets data from disk.

    The output is read from the file named after the step's UUID inside
    the step's data directory, as given by
    ``Config.get_step_data_dir``.

    Args:
        step_uuid: The UUID of the step to get output data from.
        serialization: The serialization for the output. For possible
            values see :class:`Serialization`.

    Returns:
        Data from the step identified by `step_uuid`.

    Raises:
        DiskOutputNotFoundError: If output from `step_uuid` cannot be
            found.
        DeserializationError: If the data could not be deserialized.

    """
    step_data_dir = Config.get_step_data_dir(step_uuid)
    full_path = os.path.join(step_data_dir, step_uuid)

    try:
        return _deserialize_output_disk(full_path, serialization=serialization)
    # NOTE: ``FileNotFoundError`` is a subclass of ``OSError`` (of which
    # ``IOError`` is an alias), so this handler has to come before the
    # more general one below.
    except FileNotFoundError as exc:
        # TODO: Ideally we want to provide the user with the step's
        #       name instead of UUID.
        # Chain explicitly so the original error stays attached as the
        # direct cause of the domain error.
        raise error.DiskOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from exc
    # IOError is to try to catch pyarrow failures on opening the file.
    except (pickle.UnpicklingError, IOError) as exc:
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" ({full_path}) '
            "could not be deserialized."
        ) from exc
def _get_output_memory(step_uuid: str, consumer: Optional[str] = None) -> Any:
    """Gets data from memory.

    Args:
        step_uuid: The UUID of the step to get output data from.
        consumer: The consumer of the output data. This is put inside
            the metadata of an empty object to trigger a notification in
            the plasma store, which is then used to manage eviction of
            objects.

    Returns:
        Data from step identified by `step_uuid`.

    Raises:
        DeserializationError: If the data could not be deserialized.
        MemoryOutputNotFoundError: If output from `step_uuid` cannot be
            found.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist.
            Which might be because the specified value was wrong or the
            store died.
        MemoryError: Although very unlikely, receiving data may raise
            this when eviction is enabled, because an (empty) object is
            then also written to the store.

    """
    client = _PlasmaConnector().client
    obj_id = _convert_uuid_to_object_id(step_uuid)

    try:
        obj = _deserialize_output_memory(obj_id, client)
    except error.ObjectNotFoundError as exc:
        # Chain explicitly so the original error stays attached as the
        # direct cause of the domain error.
        raise error.MemoryOutputNotFoundError(
            f'Output from incoming step "{step_uuid}" cannot be found. '
            "Try rerunning it."
        ) from exc
    # IOError is to try to catch pyarrow deserialization errors.
    except (pickle.UnpicklingError, IOError) as exc:
        raise error.DeserializationError(
            f'Output from incoming step "{step_uuid}" could not be deserialized.'
        ) from exc

    # NOTE: the "ORCHEST_MEMORY_EVICTION" ENV variable is set in the
    # orchest-api. Now we always know when we are running inside a
    # jupyter kernel interactively. And in that case we never want
    # to do eviction.
    if os.getenv("ORCHEST_MEMORY_EVICTION") is None:
        return obj

    # Store an empty object whose metadata identifies the retrieved step
    # and its consumer; the message is formatted as
    # "<IDENTIFIER_EVICTION>;<step_uuid>,<consumer>".
    empty_obj, _ = _serialize("")
    msg = f"{Config.IDENTIFIER_EVICTION};{step_uuid},{consumer}"
    metadata = bytes(msg, "utf-8")
    _output_to_memory(empty_obj, client, metadata=metadata)

    return obj