예제 #1
0
파일: api.py 프로젝트: pombredanne/disdat
    def _close(self):
        """ Write out this bundle as a hyperframe.

        Parse the data, set presentation, create lineage, and
        write to disk.

        This closes the bundle so it may not be re-used.

        Returns:
            self: this bundle, with `closed` set to True and `open` set to False.

        Raises:
            Exception: any failure while building or writing the bundle is
                re-raised after `PipeBase.rm_bundle_dir` has been called for it.
        """

        try:
            presentation, frames = PipeBase.parse_return_val(self.uuid, self.data, self.data_context)

            self.add_frames(frames)

            self.pb.presentation = presentation

            # TODO: we should let user decide which file under git or explicitly set hash.
            # Until that exists a placeholder code version is recorded.  The intended
            # source is DisdatFS().get_pipe_version(<directory of the pipeline module>).
            cv = disdat.fs.CodeVersion(semver="0.1.0", hash="unknown", tstamp="unknown", branch="unknown",
                                       url="unknown", dirty="unknown")

            lr = LineageRecord(hframe_name=self._set_processing_name(),  # <--- setting processing name
                               hframe_uuid=self.uuid,
                               code_repo=cv.url,
                               code_name='unknown',
                               code_semver=cv.semver,
                               code_hash=cv.hash,
                               code_branch=cv.branch,
                               code_method='unknown',  # TODO: capture pkg.mod.class.method that creates bundle
                               depends_on=self.depends_on)

            self.add_lineage(lr)

            self.replace_tags(self.tags)

            self.data_context.write_hframe(self)

        except Exception:
            # If we fail for any reason, remove bundle dir and raise
            PipeBase.rm_bundle_dir(self.local_dir, self.uuid, [])  # [] means no db-targets
            raise

        # Only reached when the hyperframe was written successfully.
        self.closed = True
        self.open = False

        return self
예제 #2
0
파일: api.py 프로젝트: jonathanlunt/disdat
    def _close(self):
        """ Finalize this bundle and persist it as a hyperframe.

        Steps, in order: parse `self.data` into a presentation and frames,
        attach them, build and attach a lineage record, replace the tags
        with `self.tags`, and write the hyperframe through the data context.

        After a successful close the bundle is marked closed and may not be
        re-used.  If any step raises, the bundle directory is removed and the
        exception is re-raised.

        Returns:
            self: this bundle (flags `closed=True`, `open=False`).
        """

        try:
            presentation, frames = PipeBase.parse_return_val(self.uuid, self.data, self.data_context)
            self.add_frames(frames)
            self.pb.presentation = presentation

            code_version = get_pipe_version(BundleWrapperTask)

            # <--- setting processing name
            processing_name = self._set_processing_name()

            lineage = LineageRecord(hframe_name=processing_name,
                                    hframe_uuid=self.uuid,
                                    code_repo=code_version.url,
                                    code_name='unknown',
                                    code_semver=code_version.semver,
                                    code_hash=code_version.hash,
                                    code_branch=code_version.branch,
                                    depends_on=self.depends_on)
            self.add_lineage(lineage)

            self.replace_tags(self.tags)
            self.data_context.write_hframe(self)
        except Exception:
            # If we fail for any reason, remove bundle dir and raise
            PipeBase.rm_bundle_dir(self.local_dir, self.uuid, self.db_targets)
            raise

        self.closed = True
        self.open = False
        return self
예제 #3
0
파일: api.py 프로젝트: jonathanlunt/disdat
    def add_file(self, filename):
        """
        Create a target named 'filename' in the bundle's local directory and
        record it in `self.data`, so it becomes part of the output bundle.

        How the target is recorded depends on what `self.data` currently holds:
          * nothing (None)  -> `self.data` becomes the new target
          * a list          -> the new target is appended to it
          * a single target -> `self.data` becomes `[old_target, new_target]`

        Note: if you call `bundle.add_data()` it will overwrite any previous file adds.

        Arguments:
            filename (str,list,dict): filename to create in the bundle

        Returns:
            The value of `PipeBase.filename_to_luigi_targets` for 'filename'
            (documented upstream as `luigi.LocalTarget` or `luigi.contrib.s3.S3Target`).
        """
        assert self.open and not self.closed

        new_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)

        if self.data is None:
            # First thing added to this bundle.
            self.data = new_target
            return new_target

        if isinstance(self.data, list):
            self.data.append(new_target)
            return new_target

        # A single existing entry must be a file target before it is promoted to a list.
        assert isinstance(self.data, (luigi.LocalTarget, s3.S3Target))
        self.data = [self.data, new_target]
        return new_target
예제 #4
0
파일: run.py 프로젝트: wontonswaggie/disdat
    def output(self):
        """ Declare the driver's outputs by delegating to `PipeBase.add_bundle_meta_files`.

        Per the original author's note, the driver outputs only the bundle meta
        files; the actual driver bundle consists of these files plus the output
        bundles of all pipes in the pipeline.

        :return: the value of `PipeBase.add_bundle_meta_files(self)`.  Originally documented as
                 {PipeBase.BUNDLE_META: luigifileobj, PipeBase.BUNDLE_LINEAGE: luigifileobj}
                 -- NOTE(review): confirm against PipeBase.
        """
        meta_files = PipeBase.add_bundle_meta_files(self)
        return meta_files
예제 #5
0
    def output(self):
        """
        Declare this pipe's outputs by delegating to `PipeBase.add_bundle_meta_files`.

        Per the original author's note, this is the *only* output function for
        all pipes: it declares the creation of the one HyperFrameRecord pb and
        nothing else, and it has to be idempotent.

        Return:
            The value of `PipeBase.add_bundle_meta_files(self)`; originally
            documented as (list:str) -- NOTE(review): confirm against PipeBase.

        """
        declared_outputs = PipeBase.add_bundle_meta_files(self)
        return declared_outputs
예제 #6
0
    def make_file(self, filename):
        """ Build a file target named "filename" located in this bundle's local directory.

        Intended for data that lives in memory and needs to be written to a file,
        e.g., a parquet file.  This method does not touch `self.data`: to make the
        file part of the bundle you must a.) write your data into the returned
        target, and b.) add it via `bundle.add_data(bundle.make_file("my_file"))`

        The bundle must be open and not closed.

        Arguments:
            filename (str,list,dict): filename to create in the bundle

        Returns:
            The value of `PipeBase.filename_to_luigi_targets` for "filename"
            (documented upstream as `luigi.LocalTarget` or `luigi.s3.S3Target`).
        """
        assert self.open and not self.closed

        file_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)
        return file_target
예제 #7
0
파일: api.py 프로젝트: jonathanlunt/disdat
    def make_file(self, filename):
        """
        Build a target named 'filename' located in the output bundle directory.

        The target is only returned, not stored on the bundle: it won't be
        recorded as part of the bundle unless the path or this target is placed
        in the output.  I.e., `bundle.add_data(bundle.make_file("my_file"))`

        The bundle must be open and not closed.

        Arguments:
            filename (str,list,dict): filename to create in the bundle

        Returns:
            The value of `PipeBase.filename_to_luigi_targets` for 'filename'
            (documented upstream as `luigi.LocalTarget` or `luigi.s3.S3Target`).
        """
        assert self.open and not self.closed

        new_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)
        return new_target
예제 #8
0
파일: api.py 프로젝트: pombredanne/disdat
    def copy_in_file(self, existing_file):
        """ Copy the file 'existing_file' into the output bundle directory.

        Use this when a file already exists on disk and should become part of the
        bundle.  The copy keeps only the base name of 'existing_file' and is
        written through the target's `temporary_path()` context manager.

        This method does not touch `self.data`; record the result as part of the
        bundle with `bundle.add_data(bundle.copy_in_file("my_file"))`

        Args:
            existing_file (str): Path to an existing file

        Returns:
            The value of `PipeBase.filename_to_luigi_targets` for the copied file
            (documented upstream as `luigi.LocalTarget` or `luigi.s3.S3Target`).

        """
        assert self.open and not self.closed

        # Only the final path component is used as the name inside the bundle.
        name_in_bundle = os.path.basename(existing_file)
        bundle_target = PipeBase.filename_to_luigi_targets(self.local_dir, name_in_bundle)

        with bundle_target.temporary_path() as staging_path:
            shutil.copyfile(existing_file, staging_path)

        return bundle_target
예제 #9
0
    def run(self):
        """

        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        If the user's `pipe_run` or the subsequent bundle creation raises, the
        bundle directory is removed and the original exception is re-raised.  An
        OSError raised by that removal is logged rather than propagated.

        Returns:
            (`hyperframe.HyperFrame`): the object returned by `PipeBase.make_hframe`,
            after it has been written through `self.data_context`.

        """

        kwargs = self.prepare_pipe_kwargs(for_run=True)

        pce = self.pfs.get_path_cache(self)

        assert (pce is not None)

        # NOTE: If a user changes a task param in run(), and that param parameterizes a
        # dependency in requires(), then running requires() post run() will give different
        # tasks.  To be safe we record the inputs before run().
        cached_bundle_inputs = self.bundle_inputs()

        try:
            start = time.time()  # P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            # If user's pipe fails for any reason, remove bundle dir and raise
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            presentation, frames = PipeBase.parse_return_val(
                pce.uuid, user_rtn_val, self.data_context)

            hfr = PipeBase.make_hframe(frames,
                                       pce.uuid,
                                       cached_bundle_inputs,
                                       self.pipeline_id(),
                                       self.pipe_id(),
                                       self,
                                       start_ts=start,
                                       stop_ts=stop,
                                       tags={"presentable": "True"},
                                       presentation=presentation)

            # Add Luigi Task parameters -- Only add the class parameters.  These are Disdat special params.
            self.user_tags.update(self._get_subcls_params(self))

            if self.output_tags:
                self.user_tags.update(self.output_tags)

            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            if self.user_tags:
                hfr.replace_tags(self.user_tags)

            self.data_context.write_hframe(hfr)

            # Bundles carrying the transient tag are never pushed incrementally.
            transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

            if self.incremental_push and not transient:
                self.pfs.commit(None,
                                None,
                                uuid=pce.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception:
            # If we fail for any reason, remove bundle dir and raise.  The cleanup is
            # guarded like the handler above so that a failing removal does not
            # replace the exception that actually caused the failure.
            try:
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "Bundle creation encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        return hfr
예제 #10
0
    def run(self):
        """

        Call users run function.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        If the user's `pipe_run` or the subsequent bundle creation raises, the
        bundle directory is removed and the original exception is re-raised.  An
        OSError raised by that removal is logged rather than propagated.

        Returns:
            (`hyperframe.HyperFrame`): the object returned by `PipeBase.make_hframe`,
            after it has been written through `self.data_context`.

        """

        kwargs = self.prepare_pipe_kwargs(for_run=True)

        pce = self.pfs.get_path_cache(self)

        assert (pce is not None)

        # NOTE: If a user changes a task param in run(), and that param parameterizes a
        # dependency in requires(), then running requires() post run() will give different
        # tasks.  To be safe we record the inputs before run().
        cached_bundle_inputs = self.bundle_inputs()

        try:
            start = time.time()  # P3 datetime.now().timestamp()
            user_rtn_val = self.pipe_run(**kwargs)
            stop = time.time()  # P3 datetime.now().timestamp()
        except Exception as error:
            # If user's pipe fails for any reason, remove bundle dir and raise
            try:
                _logger.error(
                    "User pipe_run encountered exception: {}".format(error))
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "User pipe_run encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        try:
            presentation, frames = PipeBase.parse_return_val(
                pce.uuid, user_rtn_val, self.data_context)

            hfr = PipeBase.make_hframe(frames,
                                       pce.uuid,
                                       cached_bundle_inputs,
                                       self.pipeline_id(),
                                       self.pipe_id(),
                                       self,
                                       start_ts=start,
                                       stop_ts=stop,
                                       tags={"presentable": "True"},
                                       presentation=presentation)

            # Add any output tags to the user tag dict
            if self.output_tags:
                self.user_tags.update(self.output_tags)

            # If this is the root_task, identify it as so in the tag dict
            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            # Lastly add any parameters associated with this class as tags.
            # They are differentiated by a special prefix in the key
            self.user_tags.update(self._get_subcls_params())

            # Overwrite the hyperframe tags with the complete set of tags
            hfr.replace_tags(self.user_tags)

            self.data_context.write_hframe(hfr)

            # Bundles carrying the transient tag are never pushed incrementally.
            transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

            if self.incremental_push and not transient:
                self.pfs.commit(None,
                                None,
                                uuid=pce.uuid,
                                data_context=self.data_context)
                self.pfs.push(uuid=pce.uuid, data_context=self.data_context)

        except Exception:
            # If we fail for any reason, remove bundle dir and raise.  The cleanup is
            # guarded like the handler above so that a failing removal does not
            # replace the exception that actually caused the failure.
            try:
                PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
            except OSError as ose:
                _logger.error(
                    "Bundle creation encountered error, and error on remove bundle: {}"
                    .format(ose))
            raise

        return hfr
예제 #11
0
    def run(self):
        """

        Invoke the user's run function and persist what it returns.
        1.) prepare the arguments
        2.) run and gather user result
        3.) interpret and wrap in a HyperFrame

        If either the user's `pipe_run` or the bundle creation that follows
        raises, the bundle directory is removed and the exception is re-raised.

        Returns:
            (`hyperframe.HyperFrame`): the object returned by
            `PipeBase.parse_pipe_return_val`, after it has been written through
            `self.data_context`.

        """

        run_kwargs = self.prepare_pipe_kwargs(for_run=True)

        path_entry = self.pfs.get_path_cache(self)
        assert path_entry is not None

        try:
            user_result = self.pipe_run(**run_kwargs)
        except Exception:
            # If user's pipe fails for any reason, remove bundle dir and raise
            PipeBase.rm_bundle_dir(path_entry.path, path_entry.uuid, self.db_targets)
            raise

        try:
            bundle_frame = PipeBase.parse_pipe_return_val(path_entry.uuid,
                                                          user_result,
                                                          self.data_context,
                                                          self)

            # Add Luigi Task parameters -- Only add the class parameters.  These are Disdat special params.
            self.user_tags.update(self._get_subcls_params(self))

            if self.output_tags:
                self.user_tags.update(self.output_tags)

            if isinstance(self.calling_task, DriverTask):
                self.user_tags.update({'root_task': 'True'})

            # Only replace the frame's tags when there is at least one user tag.
            if self.user_tags:
                bundle_frame.replace_tags(self.user_tags)

            self.data_context.write_hframe(bundle_frame)

            # Bundles carrying the transient tag are never pushed incrementally.
            is_transient = bundle_frame.get_tag(BUNDLE_TAG_TRANSIENT) is not None

            if self.incremental_push and not is_transient:
                self.pfs.commit(None, None, uuid=path_entry.uuid, data_context=self.data_context)
                self.pfs.push(uuid=path_entry.uuid, data_context=self.data_context)

        except Exception:
            # If we fail for any reason, remove bundle dir and raise
            PipeBase.rm_bundle_dir(path_entry.path, path_entry.uuid, self.db_targets)
            raise

        return bundle_frame