def get_step_uuid(pipeline: Pipeline) -> str:
    """Gets the currently running script's step UUID.

    Args:
        pipeline: Pipeline object describing the pipeline and its steps.

    Returns:
        The UUID of the currently running step. Either through an
        active Jupyter kernel or as part of a partial run.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved.
    """
    # In case of partial runs, the step UUID can be obtained via the
    # environment.
    if "ORCHEST_STEP_UUID" in os.environ:
        return os.environ["ORCHEST_STEP_UUID"]

    # The KERNEL_ID environment variable is set by the Jupyter
    # Enterprise Gateway.
    kernel_id = os.environ.get("KERNEL_ID")
    if kernel_id is None:
        raise StepUUIDResolveError('Environment variable "KERNEL_ID" not present.')

    # Get JupyterLab sessions to resolve the step's UUID via the id of
    # the running kernel and the step's associated file path. This
    # requires an authenticated request, which is obtained by
    # requesting the token via the Orchest API.
    # Orchest API --token--> Jupyter sessions --notebook path--> UUID.
    launches_url = (
        f"http://orchest-api/api/sessions/"
        f'{Config.PROJECT_UUID}/{pipeline.properties["uuid"]}'
    )
    launch_data = _request_json(launches_url)

    jupyter_api_url = "http://{ip}:{port}/{proxy_prefix}/api/sessions?token={token}"
    jupyter_api_url = jupyter_api_url.format(
        ip=launch_data["jupyter_server_ip"],
        port=launch_data["notebook_server_info"]["port"],
        proxy_prefix="jupyter_" + launch_data["jupyter_server_ip"].replace(".", "_"),
        token=launch_data["notebook_server_info"]["token"],
    )
    jupyter_sessions = _request_json(jupyter_api_url)

    for session in jupyter_sessions:
        if session["kernel"]["id"] == kernel_id:
            notebook_path = session["notebook"]["path"]
            break
    else:
        raise StepUUIDResolveError(
            'Jupyter session data has no "kernel" with "id" equal to the '
            f'"KERNEL_ID" of this step: {kernel_id}.'
        )

    for step in pipeline.steps:
        if step.properties["file_path"] == notebook_path:
            # NOTE: the UUID cannot be cached here. If the notebook is
            # assigned to a different step, the environment variable
            # does not change and the notebook would wrongly think it
            # is a different step.
            return step.properties["uuid"]

    raise StepUUIDResolveError(f'No step with "notebook_path": {notebook_path}.')
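# The `_request_json` helper used above is not defined in this section. Below
# is a minimal, illustrative sketch of what such a helper could look like,
# assuming the `requests` library is available; the name `_request_json_sketch`
# and its error handling are assumptions, not the module's actual code.
import requests


def _request_json_sketch(url: str) -> Any:
    """Hypothetical helper: GET `url` and return the decoded JSON body."""
    resp = requests.get(url, timeout=10)
    # Surface HTTP errors so callers such as `get_step_uuid` fail loudly when
    # the Orchest API or the Jupyter server cannot be reached.
    resp.raise_for_status()
    return resp.json()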
def update_params(params: Dict[str, Any]) -> None:
    """Updates the parameters of the current step.

    Additionally, you can set new parameters by giving parameters that
    do not yet exist in the current parameters of the pipeline step.

    Internally the updating is done by calling the ``dict.update``
    method. This further explains the behavior of this method.

    Args:
        params: The parameters to update. Either updating their values
            or adding new parameter keys.
    """
    with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
        pipeline_definition = json.load(f)

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Parameters could not be identified.")

    step = pipeline.get_step_by_uuid(step_uuid)
    step.update_params(params)

    with open(Config.PIPELINE_DEFINITION_PATH, "w") as f:
        json.dump(pipeline.to_dict(), f)
def get_params() -> Dict[str, Any]:
    """Gets the parameters of the current step.

    Returns:
        The parameters of the current step.
    """
    with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
        pipeline_definition = json.load(f)

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Parameters could not be identified.")

    step = pipeline.get_step_by_uuid(step_uuid)
    params = step.get_params()

    return params
def get_params() -> Dict[str, Any]:
    """Gets the parameters of the current step.

    Returns:
        The parameters of the current step.
    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine from where to get data.")

    step = pipeline.get_step_by_uuid(step_uuid)
    params = step.get_params()

    return params
def update_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """Updates the parameters of the current step.

    Additionally, you can set new parameters by giving parameters that
    do not yet exist in the current parameters of the pipeline step.

    Internally the updating is done by calling the ``dict.update``
    method. This further explains the behavior of this method.

    Args:
        params: The parameters to update. Either updating their values
            or adding new parameter keys.

    Returns:
        The updated parameters mapping.
    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine from where to get data.")

    # TODO: This is inefficient, we could just use the `step_uuid` and
    #       update the params of the `pipeline_description` and write
    #       it back to the `pipeline.json`. However, I think it is good
    #       practice to use our own defined classes to do so.
    step = pipeline.get_step_by_uuid(step_uuid)
    curr_params = step.get_params()
    curr_params.update(params)

    with open(Config.PIPELINE_DESCRIPTION_PATH, "w") as f:
        json.dump(pipeline.to_dict(), f)

    return curr_params
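# Usage sketch for the parameter helpers above. Illustrative only: it assumes
# the functions run inside a pipeline step, so the pipeline definition file
# and the step UUID can be resolved.
def _example_tune_params() -> None:
    params = get_params()  # e.g. {"lr": 0.01}
    lr = params.get("lr", 0.001)
    # update_params merges via dict.update: existing keys are overwritten, new
    # keys are added, and the pipeline definition is written back to disk.
    update_params({"lr": lr * 10, "epochs": 5})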
def get_inputs(ignore_failure: bool = False, verbose: bool = False) -> List[Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps' data could not be retrieved. Example:
            ``[None, 'Hello World!']`` vs :exc:`OutputNotFoundError`
        verbose: If True print all the steps from which the current
            step has retrieved data.

    Returns:
        List of all the data in the specified order from the front-end.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Example:
        >>> # It does not matter how the data was output in steps 1
        >>> # and 2. It is resolved automatically by the get_inputs
        >>> # method.
        >>> data_step_1, data_step_2 = get_inputs()

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.
    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine from where to get data.")

    # TODO: maybe instead of a for loop we could first get the receive
    #       method and then do a batch receive. For example memory
    #       allows to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which the inputs are received in the
    # next step.
    data = []
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:
        parent_uuid = parent.properties["uuid"]
        get_output_method, args, kwargs = resolve(parent_uuid, consumer=step_uuid)

        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except OutputNotFoundError as e:
            if not ignore_failure:
                raise OutputNotFoundError(e)

            incoming_step_data = None

        if verbose:
            parent_title = parent.properties["title"]
            if incoming_step_data is None:
                print(f'Failed to retrieve input from step: "{parent_title}"')
            else:
                print(f'Retrieved input from step: "{parent_title}"')

        data.append(incoming_step_data)

    return data
def output_to_memory(
    data: Any, pickle_fallback: bool = True, disk_fallback: bool = True
) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        disk_fallback: If True, then outputting to disk is used when
            the `data` does not fit in memory. If False, then a
            :exc:`MemoryError` is thrown.

    Raises:
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist,
            which might be because the specified value was wrong or the
            store died.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_memory(data)

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output. Generally speaking, you
        therefore want to call the function only once.
    """
    # TODO: we might want to wrap this so we can throw a custom error,
    #       if the file cannot be found, i.e. FileNotFoundError.
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine where to output data to.")

    # Serialize the object and collect the serialization metadata.
    obj, serialization = serialize(data, pickle_fallback=pickle_fallback)

    try:
        client = _PlasmaConnector().client
    except OrchestNetworkError as e:
        if not disk_fallback:
            raise OrchestNetworkError(e)

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata = bytes(f"{Config.IDENTIFIER_SERIALIZATION};{serialization}", "utf-8")

    try:
        obj_id = _output_to_memory(obj, client, obj_id=obj_id, metadata=metadata)
    except MemoryError:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.")

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, serialization=serialization)

    return
def output_to_disk(
    data: Any, pickle_fallback: bool = True, serialization: Optional[str] = None
) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        pickle_fallback: This option is passed to :meth:`serialize`. If
            ``pyarrow`` cannot serialize the data, then it will fall
            back to using ``pickle``. This is helpful for custom data
            types.
        serialization: Serialization of the `data` in case it is
            already serialized. Currently supported values are:
            ``['arrow', 'arrowpickle']``.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = 'Data I would like to use in my next step'
        >>> output_to_disk(data)

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output. Generally speaking, you
        therefore want to call the function only once.
    """
    with open(Config.PIPELINE_DESCRIPTION_PATH, "r") as f:
        pipeline_description = json.load(f)

    pipeline = Pipeline.from_json(pipeline_description)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine where to output data to.")

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = serialize(data, pickle_fallback=pickle_fallback)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        current_time = datetime.utcnow()
        f.write(f'{current_time.isoformat(timespec="seconds")}, {serialization}')

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
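# End-to-end usage sketch for the (unnamed) transfer API above: a producing
# step outputs data and a consuming step retrieves it. Illustrative only; it
# assumes two steps that are connected in the pipeline UI.
def _example_producer_step() -> None:
    result = {"accuracy": 0.93, "n_samples": 1000}
    # Falls back to disk automatically if the data does not fit in memory or
    # the plasma store cannot be reached.
    output_to_memory(result, disk_fallback=True)


def _example_consumer_step() -> None:
    # Inputs arrive as a list, ordered by the parent steps as set in the UI.
    inputs = get_inputs(ignore_failure=False, verbose=True)
    result = inputs[0]
    print(result["accuracy"])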
def get_inputs(
    ignore_failure: bool = False, verbose: bool = False
) -> Dict[str, Any]:
    """Gets all data sent from incoming steps.

    Args:
        ignore_failure: If True then the returned result can have
            ``None`` values if the data of a step could not be
            retrieved. If False, then this function will fail if any of
            the incoming steps' data could not be retrieved. Example:
            ``[None, "Hello World!"]`` vs :exc:`OutputNotFoundError`
        verbose: If True print all the steps from which the current
            step has retrieved data.

    Returns:
        Dictionary with input data for this step. We differentiate
        between two cases:

        * Named data, which is data that was outputted with a `name` by
          any parent step. Named data can be retrieved through the
          dictionary by its name, e.g.
          ``data = get_inputs()["my_name"]``. Name collisions will
          raise an :exc:`InputNameCollisionError`.
        * Unnamed data, which is an ordered list containing all the
          data that was outputted without a name by the parent steps.
          Unnamed data can be retrieved by accessing the reserved
          ``"unnamed"`` key. The order of this list depends on the
          order of the parent steps of the node, which is visible
          through the GUI.

        Example::

            # It does not matter how the data was output by parent
            # steps. It is resolved automatically by the get_inputs
            # method.
            {
                "unnamed": ["Hello World!", (3, 4)],
                "named_1": "mystring",
                "named_2": [1, 2, 3]
            }

    Raises:
        InputNameCollisionError: Multiple steps have outputted data
            with the same name.
        OutputNotFoundError: If no output can be found of the given
            `step_uuid`. Either no output was generated or the
            in-memory object store died (and therefore lost all its
            data).
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine what inputs to get.

    Warning:
        Only call :meth:`get_inputs` once! When auto eviction is
        configured data might no longer be available. Either cache the
        data or maintain a copy yourself.
    """
    with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
        pipeline_definition = json.load(f)

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine from where to get data.")

    collisions_dict = defaultdict(list)
    get_output_methods = []

    # Check for collisions before retrieving any data.
    for parent in pipeline.get_step_by_uuid(step_uuid).parents:
        # For each parent get what function to use to retrieve its
        # output data and metadata related to said data.
        parent_uuid = parent.properties["uuid"]
        get_output_method, args, kwargs, metadata = _resolve(
            parent_uuid, consumer=step_uuid
        )

        # Maintain the output methods in order, but wait with calling
        # them so that we can first check for collisions.
        get_output_methods.append(
            (parent, get_output_method, args, kwargs, metadata)
        )

        if metadata["name"] != Config._RESERVED_UNNAMED_OUTPUTS_STR:
            collisions_dict[metadata["name"]].append(parent.properties["title"])

    # If there are collisions raise an error.
    collisions_dict = {k: v for k, v in collisions_dict.items() if len(v) > 1}
    if collisions_dict:
        msg = [
            f"\n{name}: {sorted(step_names)}"
            for name, step_names in collisions_dict.items()
        ]
        msg = "".join(msg)
        raise InputNameCollisionError(
            f"Name collisions between input data coming from different steps: {msg}"
        )

    # TODO: maybe instead of a for loop we could first get the receive
    #       method and then do a batch receive. For example memory
    #       allows to do get_buffers which operates in batch.
    # NOTE: the order in which the `parents` list is traversed is
    # indirectly set in the UI. The order is important since it
    # determines the order in which unnamed inputs are received in the
    # next step.
    data = {Config._RESERVED_UNNAMED_OUTPUTS_STR: []}  # type: Dict[str, Any]
    for parent, get_output_method, args, kwargs, metadata in get_output_methods:
        # Either raise an error on failure of getting output or
        # continue with other steps.
        try:
            incoming_step_data = get_output_method(*args, **kwargs)
        except OutputNotFoundError as e:
            if not ignore_failure:
                raise OutputNotFoundError(e)

            incoming_step_data = None

        if verbose:
            parent_title = parent.properties["title"]
            if incoming_step_data is None:
                print(f'Failed to retrieve input from step: "{parent_title}"')
            else:
                print(f'Retrieved input from step: "{parent_title}"')

        # Populate the return dictionary, where nameless data gets
        # appended to a list and named data becomes a (name, data)
        # pair.
        name = metadata["name"]
        if name == Config._RESERVED_UNNAMED_OUTPUTS_STR:
            data[Config._RESERVED_UNNAMED_OUTPUTS_STR].append(incoming_step_data)
        else:
            data[name] = incoming_step_data

    return data
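# Usage sketch for the named variant of get_inputs above. Illustrative only:
# it assumes one parent step outputted data under the name "features" while
# another parent outputted data without a name.
def _example_named_consumer_step() -> None:
    data = get_inputs()
    features = data["features"]  # named output of a parent step
    unnamed = data["unnamed"]  # ordered list of nameless outputs
    first_unnamed = unnamed[0] if unnamed else None
    print(type(features), type(first_unnamed))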
def output_to_memory(
    data: Any, name: Optional[str], disk_fallback: bool = True
) -> None:
    """Outputs data to memory.

    To manage outputting the data to memory for the user, this function
    uses metadata to add info to objects inside the plasma store.

    Args:
        data: Data to output.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered
            nameless. This affects the way the data can be later
            retrieved using :func:`get_inputs`.
        disk_fallback: If True, then outputting to disk is used when
            the `data` does not fit in memory. If False, then a
            :exc:`MemoryError` is thrown.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        MemoryError: If the `data` does not fit in memory and
            ``disk_fallback=False``.
        OrchestNetworkError: Could not connect to the
            ``Config.STORE_SOCKET_NAME``, because it does not exist,
            which might be because the specified value was wrong or the
            store died.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot set the correct ID to identify the data in
            the memory store.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_memory(data, name="my_data")

    Note:
        Calling :meth:`output_to_memory` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to call the function only
        once.
    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise DataInvalidNameError(e)

    # TODO: we might want to wrap this so we can throw a custom error,
    #       if the file cannot be found, i.e. FileNotFoundError.
    with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
        pipeline_definition = json.load(f)

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine where to output data to.")

    # Serialize the object and collect the serialization metadata.
    obj, serialization = _serialize(data)

    try:
        client = _PlasmaConnector().client
    except OrchestNetworkError as e:
        if not disk_fallback:
            raise OrchestNetworkError(e)

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)

    # Try to output to memory.
    obj_id = _convert_uuid_to_object_id(step_uuid)
    metadata = [
        str(Config.IDENTIFIER_SERIALIZATION),
        # The plasma store allows to get the creation timestamp, but
        # creating it this way makes the process more consistent with
        # the metadata we are writing when outputting to disk.
        # Moreover, it makes the code less dependent on the plasma
        # store API.
        datetime.utcnow().isoformat(timespec="seconds"),
        serialization.name,
        # Can't simply assign to name beforehand because name might be
        # passed to output_to_disk, which needs to check for name
        # validity itself since it's a public function.
        name if name is not None else Config._RESERVED_UNNAMED_OUTPUTS_STR,
    ]
    metadata = bytes(Config.__METADATA_SEPARATOR__.join(metadata), "utf-8")

    try:
        obj_id = _output_to_memory(obj, client, obj_id=obj_id, metadata=metadata)
    except MemoryError:
        if not disk_fallback:
            raise MemoryError("Data does not fit in memory.")

        # TODO: note that metadata is lost when falling back to disk.
        #       Therefore we will only support metadata added by the
        #       user, once disk also supports passing metadata.
        return output_to_disk(obj, name, serialization=serialization)

    return
def output_to_disk(
    data: Any, name: Optional[str], serialization: Optional[Serialization] = None
) -> None:
    """Outputs data to disk.

    To manage outputting the data to disk, this function has a side
    effect:

    * Writes to a HEAD file alongside the actual data file. This file
      serves as a protocol that returns the timestamp of the latest
      write to disk via this function alongside the used serialization.

    Args:
        data: Data to output to disk.
        name: Name of the output data. As a string, it becomes the name
            of the data, when ``None``, the data is considered
            nameless. This affects the way the data can be later
            retrieved using :func:`get_inputs`.
        serialization: Serialization of the `data` in case it is
            already serialized. For possible values see
            :class:`Serialization`.

    Raises:
        DataInvalidNameError: The name of the output data is invalid,
            e.g. because it is a reserved name (``"unnamed"``) or
            because it contains a reserved substring.
        StepUUIDResolveError: The step's UUID cannot be resolved and
            thus it cannot determine where to output data to.

    Example:
        >>> data = "Data I would like to use in my next step"
        >>> output_to_disk(data, name="my_data")

    Note:
        Calling :meth:`output_to_disk` multiple times within the same
        script will overwrite the output, even when using a different
        output ``name``. You therefore want to call the function only
        once.
    """
    try:
        _check_data_name_validity(name)
    except (ValueError, TypeError) as e:
        raise DataInvalidNameError(e)

    if name is None:
        name = Config._RESERVED_UNNAMED_OUTPUTS_STR

    with open(Config.PIPELINE_DEFINITION_PATH, "r") as f:
        pipeline_definition = json.load(f)

    pipeline = Pipeline.from_json(pipeline_definition)

    try:
        step_uuid = get_step_uuid(pipeline)
    except StepUUIDResolveError:
        raise StepUUIDResolveError("Failed to determine where to output data to.")

    # In case the data is not already serialized, then we need to
    # serialize it.
    if serialization is None:
        data, serialization = _serialize(data)

    # Recursively create any directories if they do not already exist.
    step_data_dir = Config.get_step_data_dir(step_uuid)
    os.makedirs(step_data_dir, exist_ok=True)

    # The HEAD file serves to resolve the transfer method.
    head_file = os.path.join(step_data_dir, "HEAD")
    with open(head_file, "w") as f:
        metadata = [
            datetime.utcnow().isoformat(timespec="seconds"),
            serialization.name,
            name,
        ]
        metadata = Config.__METADATA_SEPARATOR__.join(metadata)
        f.write(metadata)

    # Full path to write the actual data to.
    full_path = os.path.join(step_data_dir, step_uuid)

    return _output_to_disk(data, full_path, serialization=serialization)
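# Usage sketch for the named output functions above. Illustrative only; note
# that, as documented, multiple output calls within one step overwrite each
# other even when different names are used, so a step should output only once.
def _example_named_producer_step() -> None:
    metrics = {"accuracy": 0.93}
    # A downstream step retrieves this via get_inputs()["metrics"].
    output_to_memory(metrics, name="metrics", disk_fallback=True)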
def get_step_uuid(pipeline: Pipeline) -> str:
    """Gets the currently running script's step UUID.

    Args:
        pipeline: Pipeline object describing the pipeline and its steps.

    Returns:
        The UUID of the currently running step. Either through an
        active Jupyter kernel or as part of a partial run.

    Raises:
        StepUUIDResolveError: The step's UUID cannot be resolved.
    """
    # In case of partial runs, the step UUID can be obtained via the
    # environment.
    if "ORCHEST_STEP_UUID" in os.environ:
        return os.environ["ORCHEST_STEP_UUID"]

    # The KERNEL_ID environment variable is set by the Jupyter
    # Enterprise Gateway.
    kernel_id = os.environ.get("KERNEL_ID")
    if kernel_id is None:
        raise StepUUIDResolveError('Environment variable "KERNEL_ID" not present.')

    # Get JupyterLab sessions to resolve the step's UUID via the id of
    # the running kernel and the step's associated file path.
    # Orchest API --jupyter_server_ip/port--> Jupyter sessions
    #   --notebook path--> UUID.
    launches_url = (
        f"http://orchest-api/api/sessions/"
        f'{Config.PROJECT_UUID}/{pipeline.properties["uuid"]}'
    )
    launch_data = _request_json(launches_url)

    # NOTE: the `proxy_prefix` already includes the "/" at the start.
    jupyter_api_url = "http://{ip}:{port}{proxy_prefix}/api/sessions"
    jupyter_api_url = jupyter_api_url.format(
        ip=launch_data["jupyter_server_ip"],
        port=launch_data["notebook_server_info"]["port"],
        proxy_prefix=launch_data["notebook_server_info"]["base_url"],
    )
    jupyter_sessions = _request_json(jupyter_api_url)

    for session in jupyter_sessions:
        if session["kernel"]["id"] == kernel_id:
            notebook_path = session["notebook"]["path"]
            break
    else:
        raise StepUUIDResolveError(
            'Jupyter session data has no "kernel" with "id" equal to the '
            f'"KERNEL_ID" of this step: {kernel_id}.'
        )

    for step in pipeline.steps:
        # Compare basenames: one pipeline cannot have duplicate
        # notebook names, so this resolves the step unambiguously.
        if os.path.basename(step.properties["file_path"]) == os.path.basename(
            notebook_path
        ):
            # NOTE: the UUID cannot be cached here. If the notebook is
            # assigned to a different step, the environment variable
            # does not change and the notebook would wrongly think it
            # is a different step.
            return step.properties["uuid"]

    raise StepUUIDResolveError(f'No step with "notebook_path": {notebook_path}.')
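# Sketch of the partial-run fast path in get_step_uuid above: when the runner
# exports ORCHEST_STEP_UUID, no Orchest API or Jupyter calls are needed.
# Illustrative only; the UUID value below is made up.
def _example_partial_run_resolution(pipeline: Pipeline) -> str:
    os.environ["ORCHEST_STEP_UUID"] = "00000000-0000-0000-0000-000000000000"
    # Resolves immediately from the environment variable set above.
    return get_step_uuid(pipeline)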