Exemplo n.º 1
0
    def run_only_missing(
        self, pipeline: Pipeline, catalog: DataCatalog
    ) -> Dict[str, Any]:
        """Run the part of ``pipeline`` needed to produce missing outputs.

        A data set is treated as needing a (re)build when it is either an
        output of ``pipeline`` that is not listed in ``catalog``, or an entry
        listed in ``catalog`` for which ``catalog.exists`` returns a falsy
        value. The nodes producing those data sets, every node downstream of
        them, and the nodes producing the unregistered (in-memory) inputs
        those nodes consume are combined into a reduced pipeline, which is
        then handed to ``self.run``.

        Args:
            pipeline: The ``Pipeline`` to run.
            catalog: The ``DataCatalog`` from which to fetch data.
        Raises:
            ValueError: Raised when ``Pipeline`` inputs cannot be satisfied
                (as stated by the original documentation; nothing in this
                method raises it directly).

        Returns:
            Whatever ``self.run`` returns for the reduced pipeline. Per the
            original documentation: any node outputs that cannot be processed
            by the ``DataCatalog``, as a dictionary keyed by node output.

        """
        # Pipeline outputs that the catalog knows nothing about.
        unregistered_outputs = pipeline.outputs() - set(catalog.list())
        # Catalog entries that are registered but report as non-existent.
        absent_entries = {
            name for name in catalog.list() if not catalog.exists(name)
        }
        targets = unregistered_outputs | absent_entries

        # Nodes that create the targets, plus everything fed by the targets.
        producers = pipeline.only_nodes_with_outputs(*targets)
        dependants = pipeline.from_inputs(*targets)
        rerun_pipeline = producers + dependants

        # The selected nodes may read data sets that are not in the catalog;
        # pull in the nodes that produce those as well, following chains of
        # such unregistered data sets.
        unregistered_sets = pipeline.data_sets() - set(catalog.list())
        unregistered_producers = pipeline.only_nodes_with_outputs(
            *unregistered_sets
        )
        required_unregistered = rerun_pipeline.inputs() & unregistered_sets
        rerun_pipeline += unregistered_producers.to_outputs(
            *required_unregistered
        )

        return self.run(rerun_pipeline, catalog)
Exemplo n.º 2
0
    def run(
        self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
    ) -> Dict[str, Any]:
        """
        Run ``pipeline`` with the data sets of ``catalog``, optionally
        restricted to the nodes needed for missing data.

        When ``self.only_missing`` is truthy, the pipeline is first reduced
        to the nodes that output a missing data set plus every node reachable
        from those data sets. A data set is considered missing when it is
        listed in ``catalog``, ``catalog.exists`` returns a falsy value for
        it, and it belongs to ``pipeline.data_sets()``. The (possibly
        reduced) pipeline is then delegated to the parent class ``run``.

        Parameters
        ----------
        pipeline: Pipeline
            The ``Pipeline`` to run
        catalog: DataCatalog
            The ``DataCatalog`` from which to fetch data.
        run_id: str
            The id of the run. Passed through unchanged to the parent
            ``run``.

        Returns
        -------
        dict
            The value returned by the parent ``run``. Per the original
            documentation: any node outputs that cannot be processed by the
            ``DataCatalog``, as a dictionary keyed by node output.

        """
        if self.only_missing:
            # Catalog entries that are registered but report as non-existent.
            absent_entries = {
                name for name in catalog.list() if not catalog.exists(name)
            }
            # Only entries that this pipeline actually touches are relevant.
            targets = absent_entries.intersection(pipeline.data_sets())
            # NOTE(review): data sets used by the pipeline but not listed in
            # the catalog are not considered here, so nodes that only
            # produce such data sets are not re-added -- confirm intended.
            producers = pipeline.only_nodes_with_outputs(*targets)
            dependants = pipeline.from_inputs(*targets)
            pipeline = producers + dependants

        return super(DatalabRunner, self).run(pipeline, catalog, run_id)