def test_from_sane_config_versioned(self, sane_config, dummy_dataframe):
    """Test load and save of versioned data sets from config"""
    sane_config["catalog"]["boats"]["versioned"] = True

    # Decompose `generate_timestamp` to keep `current_ts` reference.
    current_ts = datetime.now(tz=timezone.utc)
    fmt = (
        "{d.year:04d}-{d.month:02d}-{d.day:02d}T{d.hour:02d}"
        ".{d.minute:02d}.{d.second:02d}.{ms:03d}Z"
    )
    version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000)

    journal = Journal({"run_id": "fake-id", "project_path": "fake-path"})
    catalog = DataCatalog.from_config(
        **sane_config,
        load_versions={"boats": version},
        save_version=version,
        journal=journal,
    )
    assert catalog._journal == journal

    catalog.save("boats", dummy_dataframe)
    path = Path(sane_config["catalog"]["boats"]["filepath"])
    path = path / version / path.name
    assert path.is_file()

    reloaded_df = catalog.load("boats")
    assert_frame_equal(reloaded_df, dummy_dataframe)

    reloaded_df_version = catalog.load("boats", version=version)
    assert_frame_equal(reloaded_df_version, dummy_dataframe)

    # Verify that `VERSION_FORMAT` can help regenerate `current_ts`.
    actual_timestamp = datetime.strptime(
        catalog.datasets.boats.resolve_load_version(),  # pylint: disable=no-member
        VERSION_FORMAT,
    )
    expected_timestamp = current_ts.replace(
        microsecond=current_ts.microsecond // 1000 * 1000, tzinfo=None
    )
    assert actual_timestamp == expected_timestamp
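# The test above relies on two pytest fixtures it does not define. Below is a
# minimal, hypothetical sketch of what they might look like; the dataset type
# string, column names, and `credentials` key are assumptions, not taken from
# the real test suite. `DataCatalog.from_config(**sane_config, ...)` implies
# top-level keys matching `from_config`'s parameters (`catalog`, and
# optionally `credentials`).
import pandas as pd
import pytest


@pytest.fixture
def dummy_dataframe():
    # Illustrative data only; any frame round-trippable through the dataset works.
    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


@pytest.fixture
def sane_config(tmp_path):
    return {
        "catalog": {
            "boats": {
                "type": "pandas.CSVDataSet",  # assumed type; the string varies by Kedro version
                "filepath": str(tmp_path / "boats.csv"),
            }
        },
        "credentials": {},
    }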
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    to_outputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you
            want to run the pipeline with.
        node_names: An optional list of node names which should be used
            to filter the nodes of the ``Pipeline``. If specified, only
            the nodes with these names will be run.
        from_nodes: An optional list of node names which should be used
            as a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as
            an end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        to_outputs: An optional list of output datasets which should be
            used as an end point of the new ``Pipeline``.
        load_versions: An optional mapping of dataset names to specific
            version timestamps to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty or
            incorrect tags are provided.
        Exception: Any uncaught exception will be re-raised after being
            passed to ``on_pipeline_error``.

    Returns:
        Any node outputs that cannot be processed by the
        ``DataCatalog``. These are returned in a dictionary, where the
        keys are defined by the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    pipeline = self._get_pipeline(name=pipeline_name)
    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
        to_outputs=to_outputs,
    )

    save_version = self._get_save_version()
    run_id = self.run_id or save_version

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "to_outputs": to_outputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
        "extra_params": self._extra_params,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=save_version, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()

    hook_manager = get_hook_manager()
    hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_result = runner.run(filtered_pipeline, catalog, run_id)
    except Exception as exc:
        hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
            error=exc,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise exc

    hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result
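# The pluggy calls above dispatch to any registered hook implementation whose
# method and parameter names match the hook specs. A minimal sketch of such an
# implementation follows; the class name is illustrative, the import path may
# vary by Kedro version, and the parameter names are taken directly from the
# keyword arguments used in the `run` method above.
import logging

from kedro.framework.hooks import hook_impl  # assumed import path


class RunLoggingHooks:
    @hook_impl
    def before_pipeline_run(self, run_params, pipeline, catalog):
        logging.info("Run %s starting", run_params["run_id"])

    @hook_impl
    def on_pipeline_error(self, error, run_params, pipeline, catalog):
        logging.error("Run %s failed: %s", run_params["run_id"], error)

    @hook_impl
    def after_pipeline_run(self, run_params, run_result, pipeline, catalog):
        logging.info("Run %s finished", run_params["run_id"])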
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you
            want to run the pipeline with.
        node_names: An optional list of node names which should be used
            to filter the nodes of the ``Pipeline``. If specified, only
            the nodes with these names will be run.
        from_nodes: An optional list of node names which should be used
            as a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as
            an end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        load_versions: An optional mapping of dataset names to specific
            version timestamps to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty or
            incorrect tags are provided.

    Returns:
        Any node outputs that cannot be processed by the
        ``DataCatalog``. These are returned in a dictionary, where the
        keys are defined by the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    try:
        pipeline = self._get_pipeline(name=pipeline_name)
    except NotImplementedError:
        common_migration_message = (
            "`ProjectContext._get_pipeline(self, name)` method is expected. "
            "Please refer to the 'Modular Pipelines' section of the documentation."
        )
        if pipeline_name:
            raise KedroContextError(
                "The project is not fully migrated to use multiple pipelines. "
                + common_migration_message
            )

        warn(
            "You are using the deprecated pipeline construction mechanism. "
            + common_migration_message,
            DeprecationWarning,
        )
        pipeline = self.pipeline

    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    run_id = generate_timestamp()

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "kedro_version": self.project_version,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=run_id, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()
    return runner.run(filtered_pipeline, catalog)
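# A hypothetical invocation of the method above. `load_context` and its import
# path are version-dependent assumptions (`kedro.context` in the 0.15.x era);
# the project path, tag, and pipeline name are illustrative. Passing
# `pipeline_name` assumes the project implements `_get_pipeline`, i.e. it has
# been migrated to multiple pipelines; otherwise the method raises
# `KedroContextError` as shown above.
from kedro.context import load_context  # assumed version-specific import

context = load_context("path/to/project")
outputs = context.run(
    tags=["preprocessing"],
    load_versions={"boats": "2019-08-24T10.25.45.123Z"},  # format matches the test above
    pipeline_name="data_engineering",
)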
def run(  # pylint: disable=too-many-arguments,too-many-locals
    self,
    tags: Iterable[str] = None,
    runner: AbstractRunner = None,
    node_names: Iterable[str] = None,
    from_nodes: Iterable[str] = None,
    to_nodes: Iterable[str] = None,
    from_inputs: Iterable[str] = None,
    load_versions: Dict[str, str] = None,
    pipeline_name: str = None,
    only_missing: bool = False,
) -> Dict[str, Any]:
    """Runs the pipeline with a specified runner.

    Args:
        tags: An optional list of node tags which should be used to
            filter the nodes of the ``Pipeline``. If specified, only the
            nodes containing *any* of these tags will be run.
        runner: An optional parameter specifying the runner that you
            want to run the pipeline with.
        node_names: An optional list of node names which should be used
            to filter the nodes of the ``Pipeline``. If specified, only
            the nodes with these names will be run.
        from_nodes: An optional list of node names which should be used
            as a starting point of the new ``Pipeline``.
        to_nodes: An optional list of node names which should be used as
            an end point of the new ``Pipeline``.
        from_inputs: An optional list of input datasets which should be
            used as a starting point of the new ``Pipeline``.
        load_versions: An optional mapping of dataset names to specific
            version timestamps to load.
        pipeline_name: Name of the ``Pipeline`` to execute.
            Defaults to "__default__".
        only_missing: An option to run only missing nodes.

    Raises:
        KedroContextError: If the resulting ``Pipeline`` is empty or
            incorrect tags are provided.
        Exception: Any uncaught exception will be re-raised after being
            passed to ``on_pipeline_error``.

    Returns:
        Any node outputs that cannot be processed by the
        ``DataCatalog``. These are returned in a dictionary, where the
        keys are defined by the node outputs.
    """
    # Report project name
    logging.info("** Kedro project %s", self.project_path.name)

    try:
        pipeline = self._get_pipeline(name=pipeline_name)
    except NotImplementedError:
        common_migration_message = (
            "`ProjectContext._get_pipeline(self, name)` method is expected. "
            "Please refer to the 'Modular Pipelines' section of the documentation."
        )
        if pipeline_name:
            raise KedroContextError(
                "The project is not fully migrated to use multiple pipelines. "
                + common_migration_message
            )

        warn(
            "You are using the deprecated pipeline construction mechanism. "
            + common_migration_message,
            DeprecationWarning,
        )
        pipeline = self.pipeline

    filtered_pipeline = self._filter_pipeline(
        pipeline=pipeline,
        tags=tags,
        from_nodes=from_nodes,
        to_nodes=to_nodes,
        node_names=node_names,
        from_inputs=from_inputs,
    )

    if hasattr(self, "_save_pipeline_json"):
        self._save_pipeline_json(filtered_pipeline)

    save_version = self._get_save_version()
    run_id = self.run_id or save_version

    record_data = {
        "run_id": run_id,
        "project_path": str(self.project_path),
        "env": self.env,
        "kedro_version": self.project_version,
        "tags": tags,
        "from_nodes": from_nodes,
        "to_nodes": to_nodes,
        "node_names": node_names,
        "from_inputs": from_inputs,
        "load_versions": load_versions,
        "pipeline_name": pipeline_name,
        "extra_params": self._extra_params,
    }
    journal = Journal(record_data)

    catalog = self._get_catalog(
        save_version=save_version, journal=journal, load_versions=load_versions
    )

    # Run the runner
    runner = runner or SequentialRunner()

    self._hook_manager.hook.before_pipeline_run(  # pylint: disable=no-member
        run_params=record_data, pipeline=filtered_pipeline, catalog=catalog
    )

    try:
        run_method = runner.run_only_missing if only_missing else runner.run
        if run_method.__code__.co_argcount >= 4:  # if kedro.__version__ >= "0.16.0"
            run_result = run_method(filtered_pipeline, catalog, run_id)
        else:  # if kedro.__version__ <= "0.15.9"
            run_result = run_method(filtered_pipeline, catalog)
    except Exception as error:
        self._hook_manager.hook.on_pipeline_error(  # pylint: disable=no-member
            error=error,
            run_params=record_data,
            pipeline=filtered_pipeline,
            catalog=catalog,
        )
        raise error

    self._hook_manager.hook.after_pipeline_run(  # pylint: disable=no-member
        run_params=record_data,
        run_result=run_result,
        pipeline=filtered_pipeline,
        catalog=catalog,
    )
    return run_result