Exemplo n.º 1
0
    def bundle_inputs(self):
        """
        Collect the bundles produced by this task's upstream dependencies.

        Each dependency returned by ``self.deps()`` contributes one bundle:
        external dependencies are looked up by uuid in the local data context,
        all others are read from their path cache entry.

        NOTE: Calls task.deps which calls task._requires which calls task.requires()

        Args:
            self (disdat.PipeTask):  The pipe task in question

        Returns:
            (dict(str:`disdat.api.Bundle`)):  {arg_name: bundle, ...}, keyed by each
            dependency's ``user_arg_name``.  A later dependency with the same
            name replaces an earlier one.
        """
        bundles_by_arg = {}

        for dep in self.deps():
            if not isinstance(dep, ExternalDepTask):
                # Regular upstream task: its bundle lives in the path cache.
                dep_bundle = PathCache.get_path_cache(dep).bundle
            else:
                # External dependency: fetch the bundle by uuid from the local context.
                dep_bundle = api.get(self.data_context.get_local_name(),
                                     None,
                                     uuid=dep.uuid)

            assert dep_bundle is not None
            bundles_by_arg[dep.user_arg_name] = dep_bundle

        return bundles_by_arg
Exemplo n.º 2
0
 def bundle_output(self):
     """
     Return the bundle being made or re-used by this task.

     NOTE: Currently un-used.  Consider removing.

     Returns:
         (dict): <user_arg_name>:<bundle>
     """
     pce = PathCache.get_path_cache(self)
     # The bundle is read through `.bundle`, the attribute every other
     # accessor of a path cache entry in this class uses.
     return {self.user_arg_name: pce.bundle}
Exemplo n.º 3
0
    def get_hframe_uuid(self):
        """ Look up the uuid recorded in this task's path cache entry.

        The task must already have a path cache entry; this is asserted.

        Returns:
            hframe_uuid (str): The unique identifier for this task's hyperframe
        """
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None
        return cache_entry.uuid
Exemplo n.º 4
0
    def get_output_dir(self):
        """
        Disdat Pipe API Function

        Return the path stored in this task's path cache entry, i.e. the
        directory of the task's bundle.  You may place files directly into
        this directory.  The task must already have a path cache entry; this
        is asserted.

        Returns:
            output_dir (str):  The bundle's output directory

        """
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None
        return cache_entry.path
Exemplo n.º 5
0
    def prepare_pipe_kwargs(self, for_run=False):
        """ Build the keyword arguments handed to the user's pipe_run function.

        Each upstream task produces a bundle; when ``for_run`` is set, the data
        of every upstream bundle is placed in the result under the upstream
        task's ``user_arg_name``.  Thus the user does not call self.inputs().
        If they did, they would get a list of output targets for the bundle.

        As a side effect (only when ``for_run`` is set) the tags and uuids of
        the input bundles are stored on ``self._input_tags`` and
        ``self._input_bundle_uuids``.

        Args:
            for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

        Returns:
            (dict): A dictionary with the arguments; empty when ``for_run`` is False.

        """
        pipe_kwargs = {}

        if not for_run:
            return pipe_kwargs

        # Reset the stored tags, in case this instance is run multiple times.
        self._input_tags = {}
        self._input_bundle_uuids = {}

        # Resolve the cache entry of every dependency up front, then walk only
        # the dependencies that actually have one.
        named_entries = [(dep.user_arg_name, PathCache.get_path_cache(dep))
                         for dep in self.deps()]

        for arg_name, entry in named_entries:
            if entry is None:
                continue

            bundle = api.get(self.data_context.get_local_name(),
                             None,
                             uuid=entry.uuid)
            assert bundle.is_presentable

            # Download data that is not local (the linked files are not present).
            # This is the default behavior when running in a container.
            if self.incremental_pull:
                bundle.pull(localize=True)

            # Two dependencies sharing a name: the later one wins, so warn.
            if entry.instance.user_arg_name in pipe_kwargs:
                _logger.warning(
                    'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                    .format(entry.instance.user_arg_name))

            self._input_tags[arg_name] = bundle.tags
            self._input_bundle_uuids[arg_name] = entry.uuid
            pipe_kwargs[arg_name] = bundle.data

        return pipe_kwargs
Exemplo n.º 6
0
    def create_output_file(self, filename):
        """
        Disdat Pipe API Function

        Turn one or more file basenames into targets located in this task's
        bundle directory (the path of its path cache entry), by delegating to
        ``self.filename_to_luigi_targets``.

        Args:
            filename (str, dict, list): A basename, dictionary of basenames, or list of basenames.

        Returns:
            (`luigi.LocalTarget`): Singleton, list, or dictionary of Luigi Target objects.
        """
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None
        return self.filename_to_luigi_targets(cache_entry.path, filename)
Exemplo n.º 7
0
    def create_remote_output_file(self, filename):
        """
        Disdat Pipe API Function

        Turn one or more file basenames into targets located in this task's
        remote (S3) output directory, as given by ``self.get_remote_output_dir()``.

        NOTE: Managed S3 paths are created only if a) remote is set (otherwise where would we put them?)
        and b) incremental_push flag is True  (if we don't push bundle metadata, then the locations may be lost).

        Args:
            filename (str, dict, list): A basename, dictionary of basenames, or list of basenames.

        Returns:
            (`luigi.contrib.s3.S3Target`): Singleton, list, or dictionary of Luigi Target objects.

        """
        # The task must already own a path cache entry before remote targets are made.
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None
        return self.filename_to_luigi_targets(self.get_remote_output_dir(),
                                              filename)
Exemplo n.º 8
0
    def get_remote_output_dir(self):
        """
        Disdat Pipe API Function

        Build the remote output directory for this task's bundle: the data
        context's remote object directory joined with the bundle uuid.  You
        may place files directly into this directory.

        Returns:
            output_dir (str):  The bundle's output directory on S3

        Raises:
            Exception: if the data context has no remote URL or
                ``incremental_push`` is not set.

        """
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None

        if not self.data_context.remote_ctxt_url or not self.incremental_push:
            raise Exception(
                'Managed S3 path creation needs a) remote context and b) incremental push to be set'
            )

        remote_root = self.data_context.get_remote_object_dir()
        return os.path.join(remote_root, cache_entry.uuid)
Exemplo n.º 9
0
    def add_bundle_meta_files(pipe_task):
        """
        Given a pipe or driver task, create the Luigi output target for the
        bundle's hyperframe metadata file.

        The task is used to look up its entry in the path cache; the entry's
        path and uuid determine where the hyperframe file lives.  The
        individual frame records have to be written out before hand.

        Args:
            pipe_task: The pipe task that will use these outputs

        Returns:
            (dict): ``{PipeBase.HFRAME: luigi.LocalTarget}`` pointing at the
            hyperframe file inside the bundle directory.

        Raises:
            Exception: if no path cache entry exists for ``pipe_task``.

        """
        cache_entry = PathCache.get_path_cache(pipe_task)

        if cache_entry is None:
            # This can happen when the pipe has been created with non-deterministic parameters
            _logger.error(
                "add_bundle_meta_files: could not find pce for task {}".format(
                    pipe_task.processing_id()))
            _logger.error(
                "It is possible one of your tasks is parameterized in a non-deterministic fashion."
            )
            raise Exception(
                "add_bundle_meta_files: Unable to find pce for task {}".format(
                    pipe_task.processing_id()))

        target_key = PipeBase.HFRAME
        hframe_file = os.path.join(cache_entry.path,
                                   HyperFrameRecord.make_filename(cache_entry.uuid))
        return {target_key: luigi.LocalTarget(hframe_file)}
Exemplo n.º 10
0
    def run(self):
        """
        Execute the user's pipe_run function and record the outcome in this task's bundle.

        1.) prepare the arguments from the upstream bundles
        2.) run and time the user's function, gathering its result
        3.) attach data, timing, dependencies, params, tags and code info to
            the bundle and close it
        4.) if incremental push is on and the bundle is not tagged transient,
            commit and push it

        If the user's function or any of the later bookkeeping raises, the
        bundle is abandoned and the exception is re-raised.

        Returns:
            None
        """
        run_kwargs = self.prepare_pipe_kwargs(for_run=True)
        cache_entry = PathCache.get_path_cache(self)
        assert cache_entry is not None

        # NOTE: If a user changes a task param in run(), and that param parameterizes a
        # dependency in requires(), then running requires() post run() will give different
        # tasks.  To be safe we record the inputs before run().
        upstream_bundles = self.bundle_inputs()

        try:
            started_at = time.time()  # P3 datetime.now().timestamp()
            user_result = self.pipe_run(**run_kwargs)
            finished_at = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            # If user's pipe fails for any reason, remove bundle dir and raise.
            # A failure to remove the bundle dir is only logged, so the
            # user's original exception is the one that propagates.
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                cache_entry.bundle.abandon()
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            # Add any output tags to the user tag dict
            if self.output_tags:
                self.user_tags.update(self.output_tags)

            # If this is the root_task, identify it as so in the tag dict
            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            # We have a pce, so we have a new bundle that we need to add info to and close.
            out_bundle = cache_entry.bundle
            out_bundle.add_data(user_result)
            out_bundle.add_timing(started_at, finished_at)
            out_bundle.add_dependencies(upstream_bundles.values(),
                                        upstream_bundles.keys())
            out_bundle.name = self.human_id()
            out_bundle.processing_name = self.processing_id()
            out_bundle.add_params(self._get_subcls_params())
            out_bundle.add_tags(self.user_tags)

            task_cls = self.__class__
            out_bundle.add_code_ref('{}.{}'.format(task_cls.__module__,
                                                   task_cls.__name__))

            # Version info comes from the directory holding the task's module.
            module_file = sys.modules[task_cls.__module__].__file__
            code_version = DisdatFS.get_pipe_version(os.path.dirname(module_file))
            out_bundle.add_git_info(code_version.url, code_version.hash,
                                    code_version.branch)

            out_bundle.close()  # Write out the bundle

            # Incrementally push the completed bundle
            if self.incremental_push and (BUNDLE_TAG_TRANSIENT
                                          not in out_bundle.tags):
                self.pfs.commit(None,
                                None,
                                uuid=out_bundle.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=cache_entry.uuid,
                              data_context=self.data_context)

        except Exception:
            # If we fail for any reason, remove bundle dir and raise
            cache_entry.bundle.abandon()
            raise

        return None