def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
    """The method implementing sequential pipeline running.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    """
    nodes = pipeline.nodes
    load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

    for exec_index, node in enumerate(nodes):
        run_node(node, catalog)

        # decrement load counts and release any data sets we've finished with
        for data_set in node.inputs:
            load_counts[data_set] -= 1
            if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                catalog.release(data_set)
        for data_set in node.outputs:
            if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                catalog.release(data_set)

        self._logger.info(
            "Completed %d out of %d tasks", exec_index + 1, len(nodes)
        )
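# Illustration only (hypothetical dataset names, not real Kedro objects): the
# ``load_counts`` bookkeeping above counts how many nodes still need each
# dataset as an input, so a dataset can be released from the catalog once its
# count drops below one and it is not a free pipeline input or output.
from collections import Counter
from itertools import chain

node_input_lists = [["raw"], ["raw", "params"], ["features"]]  # made-up inputs
load_counts = Counter(chain.from_iterable(node_input_lists))
print(load_counts)  # Counter({'raw': 2, 'params': 1, 'features': 1})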
def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
    """The method implementing sequential pipeline running.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    """
    nodes = pipeline.nodes

    for exec_index, node in enumerate(nodes):
        run_node(node, catalog)
        self._logger.info(
            "Completed %d out of %d tasks", exec_index + 1, len(nodes)
        )
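# A minimal, hedged usage sketch of how the sequential ``_run`` above is
# normally reached: through the public ``SequentialRunner.run`` entry point.
# Dataset and node names are made up; the imports assume a Kedro version
# contemporary with the code above (where ``MemoryDataSet`` lives in ``kedro.io``).
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def total(numbers):
    return sum(numbers)


catalog = DataCatalog({"numbers": MemoryDataSet([1, 2, 3])})
pipeline = Pipeline([node(total, "numbers", "total")])
print(SequentialRunner().run(pipeline, catalog))  # {'total': 6}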
def _run_node_synchronization(
    node: Node, catalog: DataCatalog, is_async: bool = False, run_id: str = None
) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    A `KedroContext` is initialized in every subprocess because of the Windows
    (and latest OSX with Python 3.8) limitation. Windows has no "fork", so every
    subprocess is a brand new process created via "spawn", and a `KedroContext`
    needs to be created in every subprocess in order to make the `KedroContext`
    logging setup and hook manager work.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.
        is_async: If True, the node inputs and outputs are loaded and saved
            asynchronously with threads. Defaults to False.
        run_id: The id of the pipeline run.

    Returns:
        The node argument.

    """
    if multiprocessing.get_start_method() == "spawn":  # type: ignore
        # pylint: disable=import-outside-toplevel
        import kedro.framework.context.context as context  # pragma: no cover

        context.load_context(Path.cwd())  # pragma: no cover
        # The hard-coded current working directory prevents the parallel runner
        # from working in a notebook environment, but we will revisit this when
        # we work on accessing `project_path` from within the runner and data
        # in KedroContext.
        # See https://github.com/quantumblacklabs/private-kedro/issues/701.

    return run_node(node, catalog, is_async, run_id)
def _run_node_synchronization(  # pylint: disable=too-many-arguments
    node: Node,
    catalog: DataCatalog,
    is_async: bool = False,
    run_id: str = None,
    package_name: str = None,
    conf_logging: Dict[str, Any] = None,
) -> Node:
    """Run a single `Node` with inputs from and outputs to the `catalog`.

    A `KedroSession` instance is activated in every subprocess because of the
    Windows (and latest OSX with Python 3.8) limitation. Windows has no "fork",
    so every subprocess is a brand new process created via "spawn", hence the
    need to a) set up the logging, b) register the hooks, and c) activate a
    `KedroSession` in every subprocess.

    Args:
        node: The ``Node`` to run.
        catalog: A ``DataCatalog`` containing the node's inputs and outputs.
        is_async: If True, the node inputs and outputs are loaded and saved
            asynchronously with threads. Defaults to False.
        run_id: The id of the pipeline run.
        package_name: The name of the project Python package.
        conf_logging: A dictionary containing logging configuration.

    Returns:
        The node argument.

    """
    if multiprocessing.get_start_method() == "spawn" and package_name:  # type: ignore
        conf_logging = conf_logging or {}
        _bootstrap_subprocess(package_name, conf_logging)

    return run_node(node, catalog, is_async, run_id)
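# A self-contained sketch (plain multiprocessing, no Kedro) of why "spawn"
# forces the bootstrapping above: a spawned worker re-imports the module
# instead of inheriting the parent's memory, so any setup done only in the
# parent (logging, hooks, an active session) is missing in the child unless
# the child redoes it. Run as a script; prints False under "spawn".
import multiprocessing

SETUP_DONE = False  # stands in for logging / hook / session setup


def check_setup() -> bool:
    # Re-imported in the spawned child, so this sees the default False there.
    return SETUP_DONE


if __name__ == "__main__":
    SETUP_DONE = True  # parent-side setup, not visible to spawned workers
    ctx = multiprocessing.get_context("spawn")
    with ctx.Pool(1) as pool:
        print(pool.apply(check_setup))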
def _run(
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> None:
    """The method implementing sequential pipeline running.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.
        run_id: The id of the run.

    Raises:
        Exception: in case of any downstream node failure.

    """
    nodes = pipeline.nodes
    done_nodes = set()

    load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))

    for exec_index, node in enumerate(nodes):
        try:
            run_node(node, catalog, self._is_async, run_id)
            done_nodes.add(node)
        except Exception:
            self._suggest_resume_scenario(pipeline, done_nodes)
            raise

        # decrement load counts and release any data sets we've finished with
        for data_set in node.inputs:
            load_counts[data_set] -= 1
            if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                catalog.release(data_set)
        for data_set in node.outputs:
            if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                catalog.release(data_set)

        self._logger.info(
            "Completed %d out of %d tasks", exec_index + 1, len(nodes)
        )
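# A hedged sketch of the resume idea behind ``_suggest_resume_scenario`` above:
# after a downstream node fails, re-run only the sub-pipeline starting from the
# failed node once its inputs are available. Node and dataset names are made up,
# and the already-computed "cleaned" dataset is faked as a ``MemoryDataSet`` for
# brevity; in practice it would be a persisted catalog entry.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def clean(raw):
    return [x for x in raw if x is not None]


def report(cleaned):
    return len(cleaned)


pipeline = Pipeline(
    [
        node(clean, "raw", "cleaned", name="clean"),
        node(report, "cleaned", "n_rows", name="report"),
    ]
)
catalog = DataCatalog(
    {
        "raw": MemoryDataSet([1, None, 2]),
        "cleaned": MemoryDataSet([1, 2]),  # pretend "clean" already ran
    }
)
# Resume from the previously failed node, skipping "clean":
print(SequentialRunner().run(pipeline.from_nodes("report"), catalog))  # {'n_rows': 2}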