def _close(self):
    """ Write out this bundle as a hyperframe.

    Parse the data, set presentation, create lineage, and write to disk.
    This closes the bundle so it may not be re-used.

    If any step fails, the bundle directory is removed and the original
    exception is re-raised.

    Returns:
        self: this bundle, with `closed` set to True and `open` set to False

    """
    try:
        presentation, frames = PipeBase.parse_return_val(self.uuid, self.data, self.data_context)
        self.add_frames(frames)
        self.pb.presentation = presentation

        # TODO: we should let user decide which file under git or explicitly set hash.
        # Until then the code version is a fixed placeholder.  The previously
        # disabled alternative looked up the version of the directory holding
        # BundleWrapperTask's module via DisdatFS().get_pipe_version(path).
        cv = disdat.fs.CodeVersion(semver="0.1.0",
                                   hash="unknown",
                                   tstamp="unknown",
                                   branch="unknown",
                                   url="unknown",
                                   dirty="unknown")

        lr = LineageRecord(hframe_name=self._set_processing_name(),  # <--- setting processing name
                           hframe_uuid=self.uuid,
                           code_repo=cv.url,
                           code_name='unknown',
                           code_semver=cv.semver,
                           code_hash=cv.hash,
                           code_branch=cv.branch,
                           code_method='unknown',  # TODO: capture pkg.mod.class.method that creates bundle
                           depends_on=self.depends_on)

        self.add_lineage(lr)
        self.replace_tags(self.tags)
        self.data_context.write_hframe(self)
    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(self.local_dir, self.uuid, [])  # [] means no db-targets
        raise

    # Only reached when the hyperframe was written successfully.
    self.closed = True
    self.open = False

    return self
def _close(self):
    """ Finalize this bundle by writing it out as a hyperframe.

    Steps, in order: parse `self.data` into a presentation and frames,
    attach them, build and attach a lineage record, replace the tags with
    `self.tags`, and write the hyperframe through the data context.

    If any step raises, the bundle directory is removed (passing
    `self.db_targets`) and the exception is re-raised.

    Returns:
        self: this bundle, with `closed` set to True and `open` set to False

    """
    try:
        presentation, frames = PipeBase.parse_return_val(self.uuid,
                                                         self.data,
                                                         self.data_context)
        self.add_frames(frames)
        self.pb.presentation = presentation

        code_version = get_pipe_version(BundleWrapperTask)

        # <--- setting processing name
        processing_name = self._set_processing_name()

        lineage = LineageRecord(hframe_name=processing_name,
                                hframe_uuid=self.uuid,
                                code_repo=code_version.url,
                                code_name='unknown',
                                code_semver=code_version.semver,
                                code_hash=code_version.hash,
                                code_branch=code_version.branch,
                                depends_on=self.depends_on)
        self.add_lineage(lineage)

        self.replace_tags(self.tags)

        self.data_context.write_hframe(self)
    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(self.local_dir, self.uuid, self.db_targets)
        raise

    self.closed = True
    self.open = False
    return self
def add_file(self, filename):
    """ Create a target named 'filename' in the bundle directory and record it
    as bundle data in one step.

    Handy when the bundle consists only of output files.  The target is
    accumulated in `self.data`: the first add stores the target itself, the
    second turns it into a two-element list, and later adds append to that list.

    Note: if you call `bundle.add_data()` it will overwrite any previous file adds.

    Arguments:
        filename (str,list,dict): filename to create in the bundle

    Returns:
        `luigi.LocalTarget` or `luigi.contrib.s3.S3Target`

    """
    assert (self.open and not self.closed)

    new_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)

    current = self.data

    # First file added: store the bare target.
    if current is None:
        self.data = new_target
        return new_target

    # Already accumulating several files: extend the list in place.
    if isinstance(current, list):
        current.append(new_target)
        return new_target

    # Exactly one earlier file: promote to a list of both targets.
    assert (isinstance(current, (luigi.LocalTarget, s3.S3Target)))
    self.data = [current, new_target]
    return new_target
def output(self):
    """ Declare the driver's outputs: only the bundle meta files.

    The actual driver bundle consists of these files plus the output
    bundles of all pipes in the pipeline.

    Delegates to `PipeBase.add_bundle_meta_files`.

    Returns:
        whatever `PipeBase.add_bundle_meta_files(self)` returns -- per the
        original notes, a dict keyed by PipeBase.BUNDLE_META and
        PipeBase.BUNDLE_LINEAGE with luigi file objects as values
        (TODO confirm against PipeBase)

    """
    meta_outputs = PipeBase.add_bundle_meta_files(self)
    return meta_outputs
def output(self):
    """ The single output declaration shared by all pipes.

    It declares the creation of the one HyperFrameRecord pb and nothing
    else.  Remember, this has to be idempotent.

    Delegates to `PipeBase.add_bundle_meta_files`.

    Returns:
        whatever `PipeBase.add_bundle_meta_files(self)` returns -- the
        original notes describe it as `(list:str)` (TODO confirm against
        PipeBase)

    """
    meta_outputs = PipeBase.add_bundle_meta_files(self)
    return meta_outputs
def make_file(self, filename):
    """ Build a file target named "filename" located in the bundle directory.

    Use this when the data is in memory and needs to be written to a file,
    e.g., to create a parquet file.  Two steps are required of the caller:
    a.) write the data into the returned target, and
    b.) register the target with the bundle, i.e.,
    `bundle.add_data(bundle.make_file("my_file"))`

    The bundle must be open and not closed.

    Arguments:
        filename (str,list,dict): filename to create in the bundle

    Returns:
        `luigi.LocalTarget` or `luigi.s3.S3Target`

    """
    assert (self.open and not self.closed)

    file_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)
    return file_target
def make_file(self, filename):
    """ Build a writable target named 'filename' inside the output bundle
    directory.

    Creating the target does not record it in the bundle.  It only becomes
    part of the bundle when the path or the target itself is placed in the
    output, i.e., `bundle.add_data(bundle.make_file("my_file"))`

    The bundle must be open and not closed.

    Arguments:
        filename (str,list,dict): filename to create in the bundle

    Returns:
        `luigi.LocalTarget` or `luigi.s3.S3Target`

    """
    assert (self.open and not self.closed)

    file_target = PipeBase.filename_to_luigi_targets(self.local_dir, filename)
    return file_target
def copy_in_file(self, existing_file):
    """ Copy the file 'existing_file' into the output bundle directory.

    Use this when a file already exists on disk and should become part of
    the bundle.  The copy keeps only the base name of the source path and
    is written through the target's `temporary_path()` context manager.

    The copy alone does not record the file in the bundle; register it with
    `bundle.add_data(bundle.copy_in_file("my_file"))`

    Args:
        existing_file (str): Path to an existing file

    Returns:
        `luigi.LocalTarget` or `luigi.s3.S3Target`

    """
    assert (self.open and not self.closed)

    bundle_target = PipeBase.filename_to_luigi_targets(
        self.local_dir, os.path.basename(existing_file))

    with bundle_target.temporary_path() as staging_path:
        shutil.copyfile(existing_file, staging_path)

    return bundle_target
def run(self):
    """ Execute the user's pipe_run and persist its result as a hyperframe.

    1.) prepare the arguments
    2.) run and gather user result (timed with wall-clock start / stop)
    3.) interpret and wrap in a HyperFrame, tag it, and write it
    4.) optionally commit and push when incremental push is enabled and the
        bundle is not tagged transient

    On failure in either phase the bundle directory is removed and the
    exception is re-raised.

    Returns:
        (`hyperframe.HyperFrame`):

    """
    run_kwargs = self.prepare_pipe_kwargs(for_run=True)

    pce = self.pfs.get_path_cache(self)
    assert (pce is not None)

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**run_kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as error:
        # If user's pipe fails for any reason, remove bundle dir and raise
        try:
            _logger.error("User pipe_run encountered exception: {}".format(error))
            PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        except OSError as ose:
            # A failed cleanup is only logged; the user's exception still propagates.
            _logger.error("User pipe_run encountered error, and error on remove bundle: {}".format(ose))
        raise

    try:
        presentation, frames = PipeBase.parse_return_val(pce.uuid, user_rtn_val, self.data_context)

        inputs = self.bundle_inputs()
        pipeline_id = self.pipeline_id()
        pipe_id = self.pipe_id()
        hfr = PipeBase.make_hframe(frames,
                                   pce.uuid,
                                   inputs,
                                   pipeline_id,
                                   pipe_id,
                                   self,
                                   start_ts=start,
                                   stop_ts=stop,
                                   tags={"presentable": "True"},
                                   presentation=presentation)

        # Add Luigi Task parameters -- Only add the class parameters.  These are Disdat special params.
        self.user_tags.update(self._get_subcls_params(self))

        # Output tags are merged after the class parameters.
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        if self.user_tags:
            hfr.replace_tags(self.user_tags)

        self.data_context.write_hframe(hfr)

        # A bundle carrying the transient tag is never pushed incrementally.
        transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

        if self.incremental_push and not transient:
            self.pfs.commit(None, None, uuid=pce.uuid, data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)
    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        raise

    return hfr
def run(self):
    """ Execute the user's pipe_run and persist its result as a hyperframe.

    1.) prepare the arguments and snapshot the bundle inputs
    2.) run and gather user result (timed with wall-clock start / stop)
    3.) interpret and wrap in a HyperFrame, tag it, and write it
    4.) optionally commit and push when incremental push is enabled and the
        bundle is not tagged transient

    On failure in either phase the bundle directory is removed and the
    exception is re-raised.

    Returns:
        (`hyperframe.HyperFrame`):

    """
    run_kwargs = self.prepare_pipe_kwargs(for_run=True)

    pce = self.pfs.get_path_cache(self)
    assert (pce is not None)

    # NOTE: If a user changes a task param in run(), and that param parameterizes a dependency in requires(),
    # then running requires() post run() will give different tasks.  To be safe we record the inputs before run()
    cached_bundle_inputs = self.bundle_inputs()

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**run_kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as error:
        # If user's pipe fails for any reason, remove bundle dir and raise
        try:
            _logger.error("User pipe_run encountered exception: {}".format(error))
            PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        except OSError as ose:
            # A failed cleanup is only logged; the user's exception still propagates.
            _logger.error("User pipe_run encountered error, and error on remove bundle: {}".format(ose))
        raise

    try:
        presentation, frames = PipeBase.parse_return_val(pce.uuid, user_rtn_val, self.data_context)

        pipeline_id = self.pipeline_id()
        pipe_id = self.pipe_id()
        hfr = PipeBase.make_hframe(frames,
                                   pce.uuid,
                                   cached_bundle_inputs,
                                   pipeline_id,
                                   pipe_id,
                                   self,
                                   start_ts=start,
                                   stop_ts=stop,
                                   tags={"presentable": "True"},
                                   presentation=presentation)

        # Add any output tags to the user tag dict
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        # If this is the root_task, identify it as so in the tag dict
        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        # Lastly add any parameters associated with this class as tags.
        # They are differentiated by a special prefix in the key
        self.user_tags.update(self._get_subcls_params())

        # Overwrite the hyperframe tags with the complete set of tags
        hfr.replace_tags(self.user_tags)

        self.data_context.write_hframe(hfr)

        # A bundle carrying the transient tag is never pushed incrementally.
        transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

        if self.incremental_push and not transient:
            self.pfs.commit(None, None, uuid=pce.uuid, data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)
    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        raise

    return hfr
def run(self):
    """ Execute the user's pipe_run and persist its result as a hyperframe.

    1.) prepare the arguments
    2.) run and gather user result
    3.) interpret and wrap in a HyperFrame, tag it, and write it
    4.) optionally commit and push when incremental push is enabled and the
        bundle is not tagged transient

    On failure in either phase the bundle directory is removed and the
    exception is re-raised.

    Returns:
        (`hyperframe.HyperFrame`):

    """
    run_kwargs = self.prepare_pipe_kwargs(for_run=True)

    pce = self.pfs.get_path_cache(self)
    assert (pce is not None)

    try:
        user_rtn_val = self.pipe_run(**run_kwargs)
    except Exception:
        # If user's pipe fails for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        raise

    try:
        hfr = PipeBase.parse_pipe_return_val(pce.uuid, user_rtn_val, self.data_context, self)

        # Add Luigi Task parameters -- Only add the class parameters.  These are Disdat special params.
        self.user_tags.update(self._get_subcls_params(self))

        # Output tags are merged after the class parameters.
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        if self.user_tags:
            hfr.replace_tags(self.user_tags)

        self.data_context.write_hframe(hfr)

        # A bundle carrying the transient tag is never pushed incrementally.
        transient = hfr.get_tag(BUNDLE_TAG_TRANSIENT) is not None

        if self.incremental_push and not transient:
            self.pfs.commit(None, None, uuid=pce.uuid, data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)
    except Exception:
        # If we fail for any reason, remove bundle dir and raise
        PipeBase.rm_bundle_dir(pce.path, pce.uuid, self.db_targets)
        raise

    return hfr