Example no. 1
    def make_managed_path(self, uuid=None):
        """
        Create a managed local directory for files created within this context.

        If ``uuid`` is not supplied, a fresh one is generated via
        ``DisdatFS.disdat_uuid()``.  When a remote context is bound
        (``self.remote_ctxt_url`` is set), a matching remote key including the
        UUID is returned as well; otherwise the remote path is None.

        TODO: Do we create the "directory" on s3?  At the moment we just return the path

        Args:
            uuid (str, optional): Use this UUID instead of generating a new one.

        Returns:
            tuple(str, str, str or None): (local path, uuid, remote path or None)

        Raises:
            Exception: If a directory for the UUID already exists (UUID collision).
        """
        # TODO: Hateful local import -- fix
        from disdat.fs import DisdatFS

        assert (self.is_valid())

        _provided_uuid = DisdatFS.disdat_uuid() if uuid is None else uuid

        # `local_dir` (not `dir`) to avoid shadowing the builtin.
        local_dir = os.path.join("file:///", self.get_object_dir(),
                                 _provided_uuid)
        if os.path.exists(local_dir):
            # Report the UUID actually used: `uuid` may be None when auto-generated,
            # which previously produced a useless "collision None" message.
            raise Exception('Caught UUID collision {}'.format(_provided_uuid))
        os.makedirs(local_dir)

        if self.remote_ctxt_url is not None:
            remote_dir = os.path.join(self.get_remote_object_dir(),
                                      _provided_uuid)
        else:
            remote_dir = None

        return local_dir, _provided_uuid, remote_dir
Example no. 2
    def add_bundle_meta_files(pipe_task):
        """
        Build the Luigi output target for a pipe (or driver) task's bundle
        meta files.

        Looks up the task's path cache entry (PCE) via DisdatFS, then creates
        a Luigi LocalTarget pointing at the hyperframe proto file for that
        bundle's UUID inside the cached output path.

        Args:
            pipe_task: The pipe task that will use these outputs.

        Returns:
            dict: {PipeBase.HFRAME: luigi.LocalTarget} for the hframe file.

        Raises:
            Exception: If no path cache entry exists for the task.
        """
        pce = DisdatFS.get_path_cache(pipe_task)

        if pce is None:
            # A missing PCE usually means a task parameter was produced
            # non-deterministically, so the cached path cannot be found again.
            _logger.error("add_bundle_meta_files: could not find pce for task {}".format(pipe_task.pipe_id()))
            _logger.error("It is possible one of your tasks is parameterized in a non-deterministic fashion.")
            raise Exception("add_bundle_meta_files: Unable to find pce for task {}".format(pipe_task.pipe_id()))

        hframe_filename = HyperFrameRecord.make_filename(pce.uuid)
        hframe_target = luigi.LocalTarget(os.path.join(pce.path, hframe_filename))

        return {PipeBase.HFRAME: hframe_target}
Example no. 3
    def _close(self):
        """ Write out this bundle as a hyperframe.

        Parse the data, set presentation, create lineage, and
        write to disk.

        This closes the bundle so it may not be re-used.

        Returns:
            self: This bundle, now marked closed.

        Raises:
            Exception: Any failure is re-raised after the bundle directory
                has been removed.
        """

        try:

            presentation, frames = PipeBase.parse_return_val(self.uuid, self.data, self.data_context)

            self.add_frames(frames)

            self.pb.presentation = presentation

            # TODO: we should let user decide which file under git or explicitly set hash
            # A disabled code path used to derive the code version from the
            # BundleWrapperTask module file via DisdatFS().get_pipe_version();
            # until that is resolved we record a placeholder version.
            cv = disdat.fs.CodeVersion(semver="0.1.0", hash="unknown", tstamp="unknown", branch="unknown",
                                       url="unknown", dirty="unknown")

            lr = LineageRecord(hframe_name=self._set_processing_name(), # <--- setting processing name
                               hframe_uuid=self.uuid,
                               code_repo=cv.url,
                               code_name='unknown',
                               code_semver=cv.semver,
                               code_hash=cv.hash,
                               code_branch=cv.branch,
                               code_method='unknown', # TODO: capture pkg.mod.class.method that creates bundle
                               depends_on=self.depends_on)

            self.add_lineage(lr)

            self.replace_tags(self.tags)

            self.data_context.write_hframe(self)

        except Exception:
            # If we fail for any reason, remove the bundle dir and re-raise.
            PipeBase.rm_bundle_dir(self.local_dir, self.uuid, []) # [] means no db-targets
            raise

        self.closed = True
        self.open = False

        return self
Example no. 4
    def prepare_pipe_kwargs(self, for_run=False):
        """ Each upstream task produces a bundle.  Prepare that bundle as input
        to the user's pipe_run function.

        Side effects (when for_run=True): resets and repopulates
        self._input_tags and self._input_bundle_uuids, keyed by each upstream
        task's user_arg_name.

        Args:
            for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

        Returns:
            (dict): A dictionary with the arguments.

        """

        kwargs = dict()

        # Place upstream task outputs into the kwargs.  Thus the user does not call
        # self.inputs().  If they did, they would get a list of output targets for the bundle
        # that isn't very helpful.
        if for_run:

            # Reset the stored tags, in case this instance is run multiple times.
            self._input_tags = {}
            self._input_bundle_uuids = {}

            # Pair each required upstream task's argument name with its path
            # cache entry (PCE); tasks with no PCE are filtered out below.
            upstream_tasks = [(t.user_arg_name, self.pfs.get_path_cache(t))
                              for t in self.requires()]
            for user_arg_name, pce in [
                    u for u in upstream_tasks if u[1] is not None
            ]:
                # Resolve the upstream bundle's hyperframe by its cached UUID.
                hfr = self.pfs.get_hframe_by_uuid(
                    pce.uuid, data_context=self.data_context)
                assert hfr.is_presentable()

                # Download any data that is not local (the linked files are not present).
                # This is the default behavior when running in a container.
                # The non-default is to download and localize ALL bundles in the context before we run.
                # That's in-efficient.   We only need meta-data to determine what to re-run.
                if self.incremental_pull:
                    DisdatFS()._localize_hfr(hfr, pce.uuid, self.data_context)

                # NOTE(review): this checks pce.instance.user_arg_name while the
                # kwargs key written below is the loop's user_arg_name -- confirm
                # these are always the same attribute of the same task.
                if pce.instance.user_arg_name in kwargs:
                    _logger.warning(
                        'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                        .format(pce.instance.user_arg_name))

                # Record input metadata for lineage, then present the
                # hyperframe as the value the user's pipe_run will receive.
                self._input_tags[user_arg_name] = hfr.tag_dict
                self._input_bundle_uuids[user_arg_name] = pce.uuid
                kwargs[user_arg_name] = self.data_context.present_hfr(hfr)

        return kwargs
Example no. 5
    def get_all_pipesline_output_bundles():
        """
        Collect the output bundle entry for every pipe attached to the driver task.

        Walks the DisdatFS path cache of [(pipe instance, path, rerun)] entries
        and indexes each entry by its output bundle name.

        Note: This does not include the driver's output bundle.

        :return: dict-like of {bundle_name: PipeCacheEntry}
        """
        all_bundles = defaultdict(PipeCacheEntry)

        # Only the entries matter here; the cache keys are unused.
        for cache_entry in DisdatFS.path_cache().values():
            all_bundles[cache_entry.instance.name_output_bundle()] = cache_entry

        return all_bundles
Example no. 6
 def pfs(self):
     """Construct and return a new DisdatFS handle."""
     fs_handle = DisdatFS()
     return fs_handle
Example no. 7
    def make_hframe(output_frames,
                    output_bundle_uuid,
                    depends_on,
                    human_name,
                    processing_name,
                    class_to_version,
                    start_ts=0,
                    stop_ts=0,
                    tags=None,
                    presentation=hyperframe_pb2.DEFAULT):
        """
        Create a HyperFrameRecord (HFR) for a completed bundle.

        An HFR contains a LineageRecord plus the UUIDs of its FrameRecords
        (FRs); each FR holds data or LinkRecords.  All frames must already
        have been written out; this only assembles the HFR metadata.

        Args:
            output_frames (:list:`FrameRecord`):  List of frames to be placed in bundle / hframe
            output_bundle_uuid:
            depends_on (:list:tuple):  must be the processing_name, uuid of the upstream pipes / base bundles
            human_name:
            processing_name:
            class_to_version: A python class whose file is under git control
            start_ts (float): timestamp of task start time
            stop_ts (float): timestamp of task stop time
            tags:
            presentation (enum):  how to present this hframe when we use it as input to a function -- default
                means it is an HF but not a "presentable" hyperframe.

        Returns:
            `HyperFrameRecord`
        """

        # Resolve the code version from the module that defines class_to_version;
        # this path is only taken when the pipe actually ran.
        module_name = class_to_version.__module__
        module_dir = os.path.dirname(sys.modules[module_name].__file__)
        code_version = DisdatFS().get_pipe_version(module_dir)

        lineage = LineageRecord(hframe_name=processing_name,
                                hframe_uuid=output_bundle_uuid,
                                code_repo=code_version.url,
                                code_name='unknown',
                                code_semver=code_version.semver,
                                code_hash=code_version.hash,
                                code_branch=code_version.branch,
                                code_method=module_name,
                                depends_on=depends_on,
                                start_ts=start_ts,
                                stop_ts=stop_ts)

        return HyperFrameRecord(owner=getpass.getuser(),
                                human_name=human_name,
                                processing_name=processing_name,
                                uuid=output_bundle_uuid,
                                frames=output_frames,
                                lin_obj=lineage,
                                tags=tags,
                                presentation=presentation)
Example no. 8
    def run(self):
        """
        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        On user-code failure or bundle-finalization failure, the in-progress
        bundle is abandoned (its directory removed) and the exception re-raised.

        Returns:
            None
        """
        kwargs = self.prepare_pipe_kwargs(for_run=True)
        pce = PathCache.get_path_cache(self)
        assert (pce is not None)
        """ NOTE: If a user changes a task param in run(), and that param parameterizes a dependency in requires(), 
        then running requires() post run() will give different tasks.  To be safe we record the inputs before run() 
        """
        cached_bundle_inputs = self.bundle_inputs()

        try:
            # Time the user's code so the bundle can record task duration.
            start = time.time()  # P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            """ If user's pipe fails for any reason, remove bundle dir and raise """
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                pce.bundle.abandon()
            except OSError as ose:
                # Best-effort cleanup: log if removing the bundle dir also fails,
                # but still re-raise the original user exception below.
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            # Add any output tags to the user tag dict
            if self.output_tags:
                self.user_tags.update(self.output_tags)

            # If this is the root_task, identify it as so in the tag dict
            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})
            """ if we have a pce, we have a new bundle that we need to add info to and close """
            pce.bundle.add_data(user_rtn_val)

            pce.bundle.add_timing(start, stop)

            # Record lineage: the upstream bundles captured before run() above.
            pce.bundle.add_dependencies(cached_bundle_inputs.values(),
                                        cached_bundle_inputs.keys())

            pce.bundle.name = self.human_id()

            pce.bundle.processing_name = self.processing_id()

            pce.bundle.add_params(self._get_subcls_params())

            pce.bundle.add_tags(self.user_tags)

            # Reference to the user's task class that produced this bundle.
            pce.bundle.add_code_ref('{}.{}'.format(self.__class__.__module__,
                                                   self.__class__.__name__))

            pipeline_path = os.path.dirname(
                sys.modules[self.__class__.__module__].__file__)
            # NOTE(review): called on the class here (DisdatFS.get_pipe_version)
            # but on an instance elsewhere (DisdatFS().get_pipe_version) -- confirm
            # this is a static/class method.
            cv = DisdatFS.get_pipe_version(pipeline_path)
            pce.bundle.add_git_info(cv.url, cv.hash, cv.branch)

            pce.bundle.close()  # Write out the bundle
            """ Incrementally push the completed bundle """
            if self.incremental_push and (BUNDLE_TAG_TRANSIENT
                                          not in pce.bundle.tags):
                self.pfs.commit(None,
                                None,
                                uuid=pce.bundle.uuid,
                                data_context=self.data_context)
                # NOTE(review): commit uses pce.bundle.uuid but push uses pce.uuid --
                # verify these are guaranteed to be the same UUID.
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception as error:
            """ If we fail for any reason, remove bundle dir and raise """
            pce.bundle.abandon()
            raise

        return None