def run_only_missing(
    self, pipeline: Pipeline, catalog: DataCatalog
) -> Dict[str, Any]:
    """Run only the missing outputs from the ``Pipeline`` using the
    ``DataSet``s provided by ``catalog`` and save results back to the
    same objects.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.

    Raises:
        ValueError: Raised when ``Pipeline`` inputs cannot be satisfied.

    Returns:
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined by
        the node outputs.
    """
    registered = set(catalog.list())
    # Outputs the catalog knows nothing about, plus registered data sets
    # that have not been materialised yet — these are what must be built.
    unregistered_outputs = pipeline.outputs() - registered
    nonexistent = {name for name in registered if not catalog.exists(name)}
    targets = unregistered_outputs | nonexistent

    subpipeline = pipeline.only_nodes_with_outputs(
        *targets
    ) + pipeline.from_inputs(*targets)

    # We also need any memory data sets that feed into that,
    # including chains of memory data sets.
    memory_sets = pipeline.data_sets() - registered
    memory_producers = pipeline.only_nodes_with_outputs(*memory_sets)
    needed_memory_inputs = subpipeline.inputs() & memory_sets
    subpipeline += memory_producers.to_outputs(*needed_memory_inputs)

    return self.run(subpipeline, catalog)
def run(self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None) -> Dict[str, Any]:
    """
    Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog``.

    When ``self.only_missing`` is truthy, the pipeline is first pruned to
    the nodes whose outputs do not yet exist in ``catalog`` — together
    with every node downstream of those outputs — before delegating to
    the parent runner.

    Parameters
    ----------
    pipeline: Pipeline
        The ``Pipeline`` to run
    catalog: DataCatalog
        The ``DataCatalog`` from which to fetch data.
    run_id: str
        The id of the run. Defaults to ``None``.

    Returns
    -------
    dict
        Any node outputs that cannot be processed by the ``DataCatalog``.
        These are returned in a dictionary, where the keys are defined
        by the node outputs.
    """
    # If missing flag run missing_output pipeline and its child nodes
    if self.only_missing:
        # Registered-but-not-yet-materialised data sets, restricted to
        # those this pipeline actually produces or consumes.
        to_build = {
            ds for ds in catalog.list() if not catalog.exists(ds)
        }.intersection(pipeline.data_sets())
        pipeline = pipeline.only_nodes_with_outputs(
            *to_build
        ) + pipeline.from_inputs(*to_build)
    # Zero-argument super() (Python 3) replaces the legacy
    # super(DatalabRunner, self) spelling; behavior is identical.
    return super().run(pipeline, catalog, run_id)