def make_managed_path(self, uuid=None):
    """ Create a managed path for files created within this context.
    When there is a bound remote context, we also return a key including
    the UUID for that path.

    TODO: Do we create the "directory" on s3?  At the moment we just return the path.

    Returns:
        (tuple): (local path (str), uuid (str), remote path (str) or None)
    """
    # TODO: Hateful local import -- fix
    from disdat.fs import DisdatFS

    assert self.is_valid()

    if uuid is None:
        _provided_uuid = DisdatFS.disdat_uuid()
    else:
        _provided_uuid = uuid

    dir = os.path.join("file:///", self.get_object_dir(), _provided_uuid)  # @ReservedAssignment
    if os.path.exists(dir):
        raise Exception('Caught UUID collision {}'.format(_provided_uuid))
    os.makedirs(dir)

    if self.remote_ctxt_url is not None:
        remote_dir = os.path.join(self.get_remote_object_dir(), _provided_uuid)
    else:
        remote_dir = None

    return dir, _provided_uuid, remote_dir
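# Illustrative usage sketch (not part of the original module): how a caller
# typically consumes make_managed_path().  Here `ctx` stands in for a bound
# data context instance; the output file name and the 'file://' stripping are
# assumptions made for the example.
def _example_write_into_managed_path(ctx):
    """ Write one file into a freshly created managed path (sketch). """
    local_dir, uuid, remote_key = ctx.make_managed_path()
    # The local dir is returned as a 'file:///' URL; strip the scheme to write.
    target = os.path.join(local_dir.replace('file://', ''), 'output.csv')
    with open(target, 'w') as f:
        f.write('a,b\n1,2\n')
    return target, uuid, remote_key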
def add_bundle_meta_files(pipe_task):
    """ Given a pipe or driver task, create the bundle meta output files and the
    Luigi output targets for them.

    Use the pipe_task (or driver task) to get the name of the bundle.
    Use the name of the bundle to look up the output path in the pipe cache in the
    PipeFS class object.

    Create an hframe.  The individual frame records have to be written out beforehand.

    Args:
        pipe_task: The pipe task that will use these outputs

    Returns:
        (dict): {PipeBase.HFRAME: luigi.LocalTarget} for the hyperframe proto file
    """
    pce = DisdatFS.get_path_cache(pipe_task)

    if pce is None:
        # This can happen when the pipe has been created with non-deterministic parameters
        _logger.error("add_bundle_meta_files: could not find pce for task {}".format(pipe_task.pipe_id()))
        _logger.error("It is possible one of your tasks is parameterized in a non-deterministic fashion.")
        raise Exception("add_bundle_meta_files: Unable to find pce for task {}".format(pipe_task.pipe_id()))

    hframe = {PipeBase.HFRAME: luigi.LocalTarget(os.path.join(pce.path,
                                                              HyperFrameRecord.make_filename(pce.uuid)))}
    return hframe
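# Illustrative sketch (not part of the original module): a task's Luigi
# output() can hand back the hyperframe target built by add_bundle_meta_files().
# Using the target directly as shown is an assumption for the example.
def _example_task_output(pipe_task):
    """ Return the Luigi target for a task's hyperframe proto file (sketch). """
    targets = add_bundle_meta_files(pipe_task)
    return targets[PipeBase.HFRAME]  # a luigi.LocalTarget under the task's pce.path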
def _close(self):
    """ Write out this bundle as a hyperframe.

    Parse the data, set presentation, create lineage, and write to disk.

    This closes the bundle so it may not be re-used.

    Returns:
        self
    """
    try:
        presentation, frames = PipeBase.parse_return_val(self.uuid, self.data, self.data_context)
        self.add_frames(frames)
        self.pb.presentation = presentation

        # TODO: we should let the user decide which file is under git or explicitly set the hash
        if False:
            pipeline_path = os.path.dirname(sys.modules[BundleWrapperTask.__module__].__file__)
            cv = DisdatFS().get_pipe_version(pipeline_path)
        else:
            cv = disdat.fs.CodeVersion(semver="0.1.0", hash="unknown", tstamp="unknown",
                                       branch="unknown", url="unknown", dirty="unknown")

        lr = LineageRecord(hframe_name=self._set_processing_name(),  # <--- sets the processing name
                           hframe_uuid=self.uuid,
                           code_repo=cv.url,
                           code_name='unknown',
                           code_semver=cv.semver,
                           code_hash=cv.hash,
                           code_branch=cv.branch,
                           code_method='unknown',  # TODO: capture pkg.mod.class.method that creates bundle
                           depends_on=self.depends_on)
        self.add_lineage(lr)
        self.replace_tags(self.tags)
        self.data_context.write_hframe(self)
    except Exception:
        # If we fail for any reason, remove the bundle dir and raise
        PipeBase.rm_bundle_dir(self.local_dir, self.uuid, [])  # [] means no db-targets
        raise

    self.closed = True
    self.open = False
    return self
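# Illustrative lifecycle sketch (not part of the original module): _close() is
# the final step of a bundle's open -> add data -> close lifecycle.  Calling
# _close() directly, as below, is an assumption made for the example; real
# callers would go through whatever public wrapper the Bundle class exposes.
def _example_bundle_lifecycle(bundle):
    """ Add data to an open bundle, then close it exactly once (sketch). """
    bundle.add_data([1, 2, 3])  # parsed into frames and a presentation by _close()
    closed = bundle._close()    # writes the hyperframe; the bundle may not be re-used
    assert closed.closed and not closed.open
    return closed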
def prepare_pipe_kwargs(self, for_run=False):
    """ Each upstream task produces a bundle.  Prepare each of those bundles as
    input to the user's pipe_run function.

    Args:
        for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

    Returns:
        (dict): A dictionary with the arguments.
    """
    kwargs = dict()

    # Place upstream task outputs into the kwargs.  Thus the user does not call
    # self.inputs().  If they did, they would get a list of output targets for the bundle,
    # which isn't very helpful.
    if for_run:

        # Reset the stored tags, in case this instance is run multiple times.
        self._input_tags = {}
        self._input_bundle_uuids = {}

        upstream_tasks = [(t.user_arg_name, self.pfs.get_path_cache(t)) for t in self.requires()]
        for user_arg_name, pce in [u for u in upstream_tasks if u[1] is not None]:

            hfr = self.pfs.get_hframe_by_uuid(pce.uuid, data_context=self.data_context)
            assert hfr.is_presentable()

            # Download any data that is not local (the linked files are not present).
            # This is the default behavior when running in a container.
            # The non-default is to download and localize ALL bundles in the context before we run.
            # That's inefficient; we only need metadata to determine what to re-run.
            if self.incremental_pull:
                DisdatFS()._localize_hfr(hfr, pce.uuid, self.data_context)

            if pce.instance.user_arg_name in kwargs:
                _logger.warning('Task human name {} reused when naming task dependencies: '
                                'Dependency hyperframe shadowed'.format(pce.instance.user_arg_name))

            self._input_tags[user_arg_name] = hfr.tag_dict
            self._input_bundle_uuids[user_arg_name] = pce.uuid
            kwargs[user_arg_name] = self.data_context.present_hfr(hfr)

    return kwargs
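# Illustrative sketch (not part of the original module): prepare_pipe_kwargs()
# delivers each upstream bundle under its user_arg_name, so a user's pipe_run
# receives upstream outputs as keyword arguments.  The PipeTask subclass and
# add_dependency() call below are a sketch of that contract, not verified API:
#
#   class Train(PipeTask):
#       def pipe_requires(self):
#           self.add_dependency('features', Featurize, params={})
#
#       def pipe_run(self, features=None):
#           # `features` is the presented form of Featurize's output bundle
#           return {'model': train_model(features)}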
def get_all_pipeline_output_bundles():
    """ Find all output bundles for the pipes attached to the driver task.

    The DisdatFS object has a cache of [(pipe instance, path, rerun)]

    Note: This does not include the driver's output bundle.

    :return: dict of {bundle_name: PipeCacheEntry}
    """
    all_bundles = defaultdict(PipeCacheEntry)

    pcache = DisdatFS.path_cache()

    for p_name, p_entry in pcache.items():  # @UnusedVariable
        all_bundles[p_entry.instance.name_output_bundle()] = p_entry

    return all_bundles
@property
def pfs(self):
    # A property: callers in this module use `self.pfs.get_path_cache(...)`
    # (attribute access), not `self.pfs().get_path_cache(...)`.
    return DisdatFS()
def make_hframe(output_frames, output_bundle_uuid, depends_on,
                human_name, processing_name, class_to_version,
                start_ts=0, stop_ts=0, tags=None,
                presentation=hyperframe_pb2.DEFAULT):
    """ Create a HyperFrameRecord (HFR).

    An HFR contains a LineageRecord and the UUIDs of FrameRecords (FRs);
    each FR contains data or LinkRecords.

    Use the pipe_task to look in the path cache for the output directory.
    Use the pipe_task outputs to find the named file for the final HF protobuf file.
    Write out all Frames, and at the very last moment, write out the HF protobuf.

    Args:
        output_frames (:list:`FrameRecord`): List of frames to be placed in the bundle / hframe
        output_bundle_uuid:
        depends_on (:list:tuple): must be the (processing_name, uuid) of the upstream pipes / base bundles
        human_name:
        processing_name:
        class_to_version: A python class whose file is under git control
        start_ts (float): timestamp of task start time
        stop_ts (float): timestamp of task stop time
        tags:
        presentation (enum): how to present this hframe when we use it as input to a function --
            defaults to hyperframe_pb2.DEFAULT, which means it will be an HF, but not a
            "presentable" hyperframe.

    Returns:
        `HyperFrameRecord`
    """
    # Grab the code version and path cache entry -- only called if we ran
    code_method = class_to_version.__module__
    pipeline_path = os.path.dirname(sys.modules[code_method].__file__)
    cv = DisdatFS().get_pipe_version(pipeline_path)

    lr = LineageRecord(hframe_name=processing_name,
                       hframe_uuid=output_bundle_uuid,
                       code_repo=cv.url,
                       code_name='unknown',
                       code_semver=cv.semver,
                       code_hash=cv.hash,
                       code_branch=cv.branch,
                       code_method=code_method,
                       depends_on=depends_on,
                       start_ts=start_ts,
                       stop_ts=stop_ts)

    hfr = HyperFrameRecord(owner=getpass.getuser(),
                           human_name=human_name,
                           processing_name=processing_name,
                           uuid=output_bundle_uuid,
                           frames=output_frames,
                           lin_obj=lr,
                           tags=tags,
                           presentation=presentation)

    return hfr
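# Illustrative sketch (not part of the original module): assembling an HFR from
# frames that have already been written out.  The argument values below (names,
# tags, the use of PipeBase as the versioned class) are assumptions made for
# the example.
def _example_make_hframe(frames, bundle_uuid, upstream):
    """ Wrap written FrameRecords into a HyperFrameRecord with lineage (sketch). """
    return make_hframe(output_frames=frames,
                       output_bundle_uuid=bundle_uuid,
                       depends_on=upstream,        # [(processing_name, uuid), ...]
                       human_name='examples.my_bundle',
                       processing_name='examples_my_bundle_0deadbeef',
                       class_to_version=PipeBase,  # any class whose file is under git control
                       tags={'phase': 'example'})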
def run(self):
    """ Call the user's pipe_run function.

    1.) prepare the arguments
    2.) run and gather the user result
    3.) interpret and wrap it in a HyperFrame

    Returns:
        None
    """
    kwargs = self.prepare_pipe_kwargs(for_run=True)
    pce = PathCache.get_path_cache(self)
    assert pce is not None

    # NOTE: If a user changes a task param in run(), and that param parameterizes a
    # dependency in requires(), then running requires() post-run() will give different
    # tasks.  To be safe we record the inputs before run().
    cached_bundle_inputs = self.bundle_inputs()

    try:
        start = time.time()  # P3 datetime.now().timestamp()
        user_rtn_val = self.pipe_run(**kwargs)
        stop = time.time()  # P3 datetime.now().timestamp()
    except Exception as error:
        # If the user's pipe fails for any reason, remove the bundle dir and raise
        try:
            _logger.error("User pipe_run encountered exception: {}".format(error))
            pce.bundle.abandon()
        except OSError as ose:
            _logger.error("User pipe_run encountered error, and error on remove bundle: {}".format(ose))
        raise

    try:
        # Add any output tags to the user tag dict
        if self.output_tags:
            self.user_tags.update(self.output_tags)

        # If this is the root task, identify it as such in the tag dict
        if isinstance(self.calling_task, DriverTask):
            self.user_tags.update({'root_task': 'True'})

        # If we have a pce, we have a new bundle that we need to add info to and close
        pce.bundle.add_data(user_rtn_val)
        pce.bundle.add_timing(start, stop)
        pce.bundle.add_dependencies(cached_bundle_inputs.values(), cached_bundle_inputs.keys())
        pce.bundle.name = self.human_id()
        pce.bundle.processing_name = self.processing_id()
        pce.bundle.add_params(self._get_subcls_params())
        pce.bundle.add_tags(self.user_tags)
        pce.bundle.add_code_ref('{}.{}'.format(self.__class__.__module__, self.__class__.__name__))

        pipeline_path = os.path.dirname(sys.modules[self.__class__.__module__].__file__)
        cv = DisdatFS.get_pipe_version(pipeline_path)
        pce.bundle.add_git_info(cv.url, cv.hash, cv.branch)

        pce.bundle.close()  # Write out the bundle

        # Incrementally push the completed bundle
        if self.incremental_push and (BUNDLE_TAG_TRANSIENT not in pce.bundle.tags):
            self.pfs.commit(None, None, uuid=pce.bundle.uuid, data_context=self.data_context)
            self.pfs.push(uuid=pce.uuid, data_context=self.data_context)
    except Exception:
        # If we fail for any reason, remove the bundle dir and raise
        pce.bundle.abandon()
        raise

    return None
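# Illustrative sketch (not part of the original module): run() skips the
# incremental commit/push when BUNDLE_TAG_TRANSIENT is among the bundle's tags,
# and output_tags are merged into user_tags above.  Setting output_tags directly,
# as below, is an assumption made for the example.
def _example_mark_transient(pipe_task):
    """ Tag a task's output so incremental_push will not push it (sketch). """
    pipe_task.output_tags = {BUNDLE_TAG_TRANSIENT: 'True'}  # merged into user_tags in run()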