def complete(self): """ If the task has any outputs, return true if all outputs exists. Otherwise, return whether or not the task has run or not """ if self.print_config: return True outputs = flatten(self.output()) inputs = flatten(self.input()) if self.dry_run: return False if self.restart: return False if len(outputs) == 0: # TODO: unclear if tasks without outputs should always run or never run warnings.warn("Task %r without outputs has no custom complete() method" % self) return False for output in outputs: if not output.exists(): return False # Local addition: if any dependency is newer, then run if any([os.stat(x.fn).st_mtime > os.stat(output.fn).st_mtime for x in inputs if x.exists()]): return False else: return True
def complete(self):
    from luigi.task import flatten
    for output in flatten(self.output()):
        if not output.exists():
            return False
    for dep in flatten(self.deps()):
        if not dep.complete():
            return False
    return True

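# A minimal sketch (not part of any project above) of what luigi.task.flatten
# actually does with the nested structures that requires()/input()/output()
# may return: dicts, nested lists, single values and None all collapse to a
# flat list. These expectations match the examples in luigi's own docstring.
from luigi.task import flatten

assert sorted(flatten({'a': 'foo', 'b': 'bar'})) == ['bar', 'foo']
assert sorted(flatten(['foo', ['bar', 'troll']])) == ['bar', 'foo', 'troll']
assert flatten('foo') == ['foo']
assert flatten(42) == [42]
assert flatten(None) == []
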
def _run_get_new_deps(self):
    try:
        t0 = time.time()
        task_gen = self.task.run()
    finally:
        self.task.trigger_event(
            Event.PROCESSING_TIME, self.task, time.time() - t0)

    if not isinstance(task_gen, types.GeneratorType):
        return None

    next_send = None
    while True:
        try:
            if next_send is None:
                requires = six.next(task_gen)
            else:
                requires = task_gen.send(next_send)
        except StopIteration:
            return None

        new_req = flatten(requires)
        new_deps = [(t.task_module, t.task_family, t.to_str_params())
                    for t in new_req]
        if all(t.complete() for t in new_req):
            next_send = getpaths(requires)
        else:
            return new_deps

def _run_get_new_deps(self):
    # set task callbacks before running
    for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
        setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr))

    task_gen = self.task.run()

    # reset task callbacks
    for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
        setattr(self.task, task_attr, None)

    if not isinstance(task_gen, types.GeneratorType):
        return None

    next_send = None
    while True:
        try:
            if next_send is None:
                requires = six.next(task_gen)
            else:
                requires = task_gen.send(next_send)
        except StopIteration:
            return None

        new_req = flatten(requires)
        if all(t.complete() for t in new_req):
            next_send = getpaths(requires)
        else:
            new_deps = [(t.task_module, t.task_family, t.to_str_params())
                        for t in new_req]
            return new_deps

def _run_get_new_deps(self):
    run_again = False
    try:
        task_gen = self.task.run(tracking_url_callback=self.tracking_url_callback)
    except TypeError as ex:
        if 'unexpected keyword argument' not in getattr(ex, 'message', ex.args[0]):
            raise
        run_again = True
    if run_again:
        task_gen = self.task.run()

    if not isinstance(task_gen, types.GeneratorType):
        return None

    next_send = None
    while True:
        try:
            if next_send is None:
                requires = six.next(task_gen)
            else:
                requires = task_gen.send(next_send)
        except StopIteration:
            return None

        new_req = flatten(requires)
        new_deps = [(t.task_module, t.task_family, t.to_str_params())
                    for t in new_req]
        if all(t.complete() for t in new_req):
            next_send = getpaths(requires)
        else:
            return new_deps

def _run_get_new_deps(self):
    self.task.set_tracking_url = self.tracking_url_callback
    self.task.set_status_message = self.status_message_callback

    task_gen = self.task.run()

    self.task.set_tracking_url = None
    self.task.set_status_message = None

    if not isinstance(task_gen, types.GeneratorType):
        return None

    next_send = None
    while True:
        try:
            if next_send is None:
                requires = six.next(task_gen)
            else:
                requires = task_gen.send(next_send)
        except StopIteration:
            return None

        new_req = flatten(requires)
        if all(t.complete() for t in new_req):
            next_send = getpaths(requires)
        else:
            new_deps = [(t.task_module, t.task_family, t.to_str_params())
                        for t in new_req]
            return new_deps

def generate_tasks_manual():
    """ Return a formatted listing of all tasks with their descriptions. """
    from siskin.sources import *
    from siskin.workflows import *

    output = StringIO.StringIO()
    # task_tuples = sorted(Register.get_reg().iteritems())
    task_names = Register.task_names()
    output.write(MAN_HEADER)
    output.write(' {0} tasks found\n\n'.format(len(task_names)))

    for name in task_names:
        klass = Register.get_task_cls(name)
        doc = klass.__doc__ or colors.red("@todo: docs")
        output.write('{0} {1}\n'.format(colors.green(name), doc))

        try:
            deps = flatten(klass().requires())
        except Exception:
            # TODO: tasks that have required arguments will fail here
            deps = []  # avoid a NameError in the write below
            formatted = colors.yellow("\tUnavailable since task has required parameters.")
        else:
            formatted = '\t{0}'.format(pprint.pformat(deps).replace('\n', '\n\t'))

        output.write(colors.magenta('\n\tDependencies ({0}):\n\n{1}\n\n'.format(len(deps), formatted)))

    return output.getvalue()

def args(self):
    """returns an array of args to pass to the job."""
    arglist = []
    for k, v in self.requires_hadoop().iteritems():
        arglist.append('--' + k)
        arglist.extend([t.output().path for t in flatten(v)])
    arglist.extend(['--output', self.output()])
    arglist.extend(self.job_args())
    return arglist

def get_test_data_path(f=None):
    path = Path(__file__).parent.absolute()
    if f:
        from luigi.task import flatten
        for x in flatten(f):
            path = path.joinpath(x)
    return str(path)

def complete(self):
    if not self.output().exists():
        return False

    deps = flatten(self.requires())
    logging.debug("Deps: %s", [d.complete() for d in deps])
    for dep in deps:
        if not dep.complete():
            logging.info("Dependencies are incomplete, rerunning task.")
            return False

    inputs = flatten(self.input())
    output_ts = os.path.getmtime(self.output().path)
    input_ts = max([os.path.getmtime(in_file.path) for in_file in inputs])
    if input_ts > output_ts:
        logging.info("Dependencies are newer, rerunning task.")
        return False

    return True

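# Hedged sketch, not taken from the project above: the same make-style
# "re-run when an input is newer than the output" check can be packaged as a
# reusable mix-in. RebuildIfStaleMixin is a hypothetical name; it assumes
# file-system targets exposing .path and .exists(), as luigi.LocalTarget does.
import os

from luigi.task import flatten


class RebuildIfStaleMixin(object):
    """Complete only if all outputs exist and none is older than any existing input."""

    def complete(self):
        outputs = flatten(self.output())
        inputs = flatten(self.input())
        if not outputs or not all(o.exists() for o in outputs):
            return False
        newest_input = max((os.path.getmtime(i.path) for i in inputs if i.exists()), default=0)
        oldest_output = min(os.path.getmtime(o.path) for o in outputs)
        return newest_input <= oldest_output
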
def convert_tasks_to_manifest_if_necessary(input_tasks):  # pylint: disable=invalid-name
    """
    Provide a manifest for the input paths if there are too many of them.

    The configuration section "manifest" can contain a "threshold" option which, when exceeded,
    causes this function to return a URLManifestTask instead of the original input_tasks.
    """
    all_input_tasks = task.flatten(input_tasks)
    targets = task.flatten(task.getpaths(all_input_tasks))
    threshold = configuration.get_config().getint(CONFIG_SECTION, 'threshold', -1)
    if threshold > 0 and len(targets) >= threshold:
        log.debug(
            'Using manifest since %d inputs are greater than or equal to the threshold %d',
            len(targets), threshold
        )
        return [URLManifestTask(urls=[target.path for target in targets])]
    else:
        log.debug(
            'Directly processing files since %d inputs are less than the threshold %d',
            len(targets), threshold
        )
        return all_input_tasks

def prepare_outputs(self, job):
    """
    Called before job is started

    If output is a `FileSystemTarget`, create parent directories so the hive command won't fail
    """
    outputs = flatten(job.output())
    for o in outputs:
        if isinstance(o, FileSystemTarget):
            parent_dir = os.path.dirname(o.path)
            if not o.fs.exists(parent_dir):
                logger.info("Creating parent directory %r" % (parent_dir,))
                try:
                    # there is a possible race condition
                    # which needs to be handled here
                    o.fs.mkdir(parent_dir)
                except FileAlreadyExists:
                    pass

def _run_get_new_deps(self):
    self.task.set_tracking_url = self.tracking_url_callback
    self.task.set_status_message = self.status_message_callback

    def deprecated_tracking_url_callback(*args, **kwargs):
        warnings.warn("tracking_url_callback in run() args is deprecated, use "
                      "set_tracking_url instead.", DeprecationWarning)
        self.tracking_url_callback(*args, **kwargs)

    run_again = False
    try:
        task_gen = self.task.run(tracking_url_callback=deprecated_tracking_url_callback)
    except TypeError as ex:
        if 'unexpected keyword argument' not in str(ex):
            raise
        run_again = True
    if run_again:
        task_gen = self.task.run()

    self.task.set_tracking_url = None
    self.task.set_status_message = None

    if not isinstance(task_gen, types.GeneratorType):
        return None

    next_send = None
    while True:
        try:
            if next_send is None:
                requires = six.next(task_gen)
            else:
                requires = task_gen.send(next_send)
        except StopIteration:
            return None

        new_req = flatten(requires)
        new_deps = [(t.task_module, t.task_family, t.to_str_params())
                    for t in new_req]
        if all(t.complete() for t in new_req):
            next_send = getpaths(requires)
        else:
            return new_deps

def walk_tree(task, target_task):
    task_name = task.__class__.__name__
    outputs = [o.path for o in luigi.task.flatten_output(task)]
    if task_name == target_task:
        return outputs

    children = flatten(task.requires())
    ret = []
    for c in children:
        files = walk_tree(c, target_task)
        # only propagate up output targets if the target task was encountered
        if files:
            ret.extend(files)
    if ret:
        ret.extend(outputs)
    return ret

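# Hypothetical usage of walk_tree() above: collect the output paths of the
# named upstream task plus every task between it and the workflow root. The
# Extract/Transform tasks are invented for illustration; luigi.Task,
# luigi.LocalTarget and luigi.task.flatten_output are real luigi APIs.
import luigi


class Extract(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/extract.csv')


class Transform(luigi.Task):
    def requires(self):
        return Extract()

    def output(self):
        return luigi.LocalTarget('/tmp/transform.csv')


if __name__ == '__main__':
    # expected: ['/tmp/extract.csv', '/tmp/transform.csv']
    print(walk_tree(Transform(), 'Extract'))
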
def print_tree(task, indent='', last=True):
    '''
    Return a string representation of the tasks, their statuses/parameters in a dependency tree format
    '''
    # dont bother printing out warnings about tasks with no output
    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore',
                                message='Task .* without outputs has no custom complete\\(\\) method')
        is_task_complete = task.complete()
    is_complete = (bcolors.OKGREEN + 'COMPLETE' if is_task_complete else bcolors.OKBLUE + 'PENDING') + bcolors.ENDC
    name = task.__class__.__name__
    params = task.to_str_params(only_significant=True)
    result = '\n' + indent
    if last:
        result += '└─--'
        indent += '   '
    else:
        result += '|--'
        indent += '|  '
    result += '[{0}-{1} ({2})]'.format(name, params, is_complete)
    children = flatten(task.requires())
    for index, child in enumerate(children):
        result += print_tree(child, indent, (index + 1) == len(children))
    return result

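# Hypothetical usage of the print_tree() helper above (this helper also ships
# with luigi as luigi.tools.deps_tree.print_tree). The Leaf/Root tasks below
# are invented for illustration only.
import luigi


class Leaf(luigi.ExternalTask):
    def output(self):
        return luigi.LocalTarget('/tmp/leaf.txt')


class Root(luigi.Task):
    def requires(self):
        return Leaf()

    def output(self):
        return luigi.LocalTarget('/tmp/root.txt')


if __name__ == '__main__':
    # prints the Root/Leaf tree with COMPLETE/PENDING markers
    print(print_tree(Root()))
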
def _init_local(self):
    def up(p):
        return os.path.dirname(os.path.normpath(p))

    unstale = lambda f: f if os.path.exists(f) else unstale(up(f))
    self.unstales = map(lambda x: unstale(x.path), flatten(self.output()))

    # Set up temp folder in shared directory (trim to max filename length)
    base_tmp_dir = self.shared_tmp_dir
    random_id = '%016x' % random.getrandbits(64)
    folder_name = _clean_task_id(self.task_id) + '-' + random_id
    self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
    max_filename_length = os.fstatvfs(0).f_namemax
    self.tmp_dir = self.tmp_dir[:max_filename_length]
    logger.info("Tmp dir: %s", self.tmp_dir)
    os.makedirs(self.tmp_dir)

    # Dump the code to be run into a pickle file
    logging.debug("Dumping pickled class")
    self._dump(self.tmp_dir)

    # Make sure that all the class's dependencies are tarred and available
    logging.debug("Tarballing dependencies")
    # Grab luigi and the module containing the code to be run
    packages = [luigi] + [__import__(self.__module__, None, None, 'dummy')]
    luigi.hadoop.create_packages_archive(packages, os.path.join(self.tmp_dir, "packages.tar"))

def _get_input_schema(self):
    '''Arbitrarily picks an object in input and reads the Avro schema from it.'''
    assert avro, 'avro module required'

    input_target = flatten(self.input())[0]
    input_fs = input_target.fs if hasattr(input_target, 'fs') else GCSClient()
    input_uri = self.source_uris()[0]
    if '*' in input_uri:
        file_uris = list(input_fs.list_wildcard(input_uri))
        if file_uris:
            input_uri = file_uris[0]
        else:
            raise RuntimeError('No match for ' + input_uri)

    schema = []
    exception_reading_schema = []

    def read_schema(fp):
        # fp contains the file part downloaded thus far. We rely on the DataFileReader
        # initializing itself fine as soon as the file header with the schema has been
        # downloaded, without requiring the remainder of the file...
        try:
            reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())
            schema[:] = [reader.datum_reader.writers_schema]
        except Exception as e:
            # Save but assume benign unless schema reading ultimately fails. The benign
            # exception in case of an insufficiently big downloaded file part seems to be:
            # TypeError('ord() expected a character, but string of length 0 found',).
            exception_reading_schema[:] = [e]
            return False
        return True

    input_fs.download(input_uri, 64 * 1024, read_schema).close()
    if not schema:
        raise exception_reading_schema[0]
    return schema[0]

def produce_output(self):
    with self.output().open('w') as o:
        o.write(repr([self.task_id] +
                     sorted([eval_contents(i) for i in flatten(self.input())])))

def get_workflow_alarm_puts(task):
    puts = get_task_alarm_puts(task)
    req = flatten(task.requires())
    for t in req:
        puts += get_workflow_alarm_puts(t)
    return puts

def on_success(self):
    pipeline_hash = utils.hash_pipeline(self)
    for o in flatten(self.output()):
        if isinstance(o, CommittedTarget):
            o.commit(task_family=self.task_family, pipeline_hash=pipeline_hash)

def run(self):
    for t in task.flatten(self.subtasks()):
        t.run()
    task_that_delegates.run(self)

def main():
    workflow = Arg25BandStatisticsWorkflow()

    workflow.x_min = 125
    workflow.x_max = 126

    workflow.y_min = -35
    workflow.y_max = -34

    workflow.acq_min = parse_date_min("1985")
    workflow.acq_max = parse_date_max("2014")

    workflow.epoch = 5

    workflow.seasons = Season
    # workflow.seasons = [Season.SPRING]

    workflow.satellites = [Satellite.LS5, Satellite.LS7]

    workflow.output_directory = "/Users/simon/tmp/cube/output/test/arg25_stats_tasks"
    workflow.output_directory = "/Users/simon/tmp/cube/output/test/arg25_stats_tasks/ARG25_125_126_-035_-034_1985_2014_SUMMER_AUTUMN_WINTER_SPRING"

    workflow.mask_pqa_apply = True
    workflow.mask_pqa_mask = [PqaMask.PQ_MASK_SATURATION, PqaMask.PQ_MASK_CONTIGUITY, PqaMask.PQ_MASK_CLOUD]

    # workflow.local_scheduler = None
    # workflow.workers = None

    workflow.dataset_type = [DatasetType.ARG25]
    workflow.bands = Ls57Arg25Bands

    workflow.x_chunk_size = 1000
    workflow.y_chunk_size = 1000

    workflow.statistics = [Statistic.PERCENTILE_25, Statistic.PERCENTILE_50, Statistic.PERCENTILE_75]

    from luigi.task import flatten

    tasks = flatten(workflow.create_tasks())
    print(tasks)

    for task in tasks:
        _log.info("task = %s", task)

        path = os.path.join(workflow.output_directory, task.output().path.replace("_STATS.tif", ""))
        os.makedirs(path)

        for output in flatten(task.output()):
            _log.info("output %s", output.path)
            # print output.path.replace("_STATS.tif", "")

        chunk_tasks = flatten(task.requires())

        for chunk_task in chunk_tasks:
            _log.info("chunk task %s", chunk_task)

            for output in flatten(chunk_task.output()):
                _log.info("output %s", output.path)
                # print "\t" + output.path.replace(".npy", "")
                os.makedirs(os.path.join(path, output.path.replace(".npy", "")))

def get_task_requires(task):
    return set(flatten(task.requires()))

def get_local_files(self, task):
    # recursively gets local files from each task's requires()
    r = flatten(task.output())
    for dependency in flatten(task.requires()):
        r += self.get_local_files(dependency)
    return r

def complete(self):
    return all(r.complete() for r in flatten(self.requires()))

def outputs(self) -> List[Target]:
    return flatten(self.output())

def source_uris(self):
    return [self._avro_uri(x) for x in flatten(self.input())]

def run(self):
    logger.info('[pid %s] Worker %s running %s', os.getpid(), self.worker_id, self.task.task_id)

    if self.random_seed:
        # Need to have different random seeds if running in separate processes
        random.seed((os.getpid(), time.time()))

    status = FAILED
    error_message = ''
    missing = []
    new_deps = []
    try:
        # Verify that all the tasks are fulfilled!
        missing = [dep.task_id for dep in self.task.deps() if not dep.complete()]
        if missing:
            deps = 'dependency' if len(missing) == 1 else 'dependencies'
            raise RuntimeError('Unfulfilled %s at run time: %s' % (deps, ', '.join(missing)))
        self.task.trigger_event(Event.START, self.task)
        t0 = time.time()
        status = None
        try:
            task_gen = self.task.run()
            if isinstance(task_gen, types.GeneratorType):  # new deps
                next_send = None
                while True:
                    try:
                        if next_send is None:
                            requires = six.next(task_gen)
                        else:
                            requires = task_gen.send(next_send)
                    except StopIteration:
                        break

                    new_req = flatten(requires)
                    status = (RUNNING if all(t.complete() for t in new_req)
                              else SUSPENDED)
                    new_deps = [(t.task_module, t.task_family, t.to_str_params())
                                for t in new_req]
                    if status == RUNNING:
                        self.result_queue.put(
                            (self.task.task_id, status, '', missing, new_deps))
                        next_send = getpaths(requires)
                        new_deps = []
                    else:
                        logger.info(
                            '[pid %s] Worker %s new requirements %s',
                            os.getpid(), self.worker_id, self.task.task_id)
                        return
        finally:
            if status != SUSPENDED:
                self.task.trigger_event(Event.PROCESSING_TIME, self.task,
                                        time.time() - t0)
        error_message = json.dumps(self.task.on_success())
        logger.info('[pid %s] Worker %s done %s', os.getpid(),
                    self.worker_id, self.task.task_id)
        self.task.trigger_event(Event.SUCCESS, self.task)
        status = DONE

    except KeyboardInterrupt:
        raise
    except BaseException as ex:
        status = FAILED
        logger.exception("[pid %s] Worker %s failed %s", os.getpid(), self.worker_id, self.task)
        error_message = notifications.wrap_traceback(self.task.on_failure(ex))
        self.task.trigger_event(Event.FAILURE, self.task, ex)
        subject = "Luigi: %s FAILED" % self.task
        notifications.send_error_email(subject, error_message)
    finally:
        self.result_queue.put(
            (self.task.task_id, status, error_message, missing, new_deps))

def complete(self):
    if all(req.complete() for req in flatten(self.requires())):
        logger.info('Batch info is unusable for %s.', self.experiment_id)
        return not self.is_batch_info_usable() or super().complete()
    else:
        return super().complete()

def requires(self):
    return flatten([b.requires() for b in self.builders])

def deps(self):
    # Overrides method in base class
    return task.flatten(self.requires()) + task.flatten(
        [t.deps() for t in task.flatten(self.subtasks())])

def dependencies(self) -> List[Task]:
    return flatten(self.requires())

def test_create_tasks():
    TILE_COUNTS = {
        1985: {Season.SUMMER: 34, Season.AUTUMN: 29, Season.WINTER: 36, Season.SPRING: 34},
        1990: {Season.SUMMER: 53, Season.AUTUMN: 65, Season.WINTER: 65, Season.SPRING: 57}
    }

    workflow = Arg25BandStatisticsWorkflow()

    workflow.x_min = workflow.x_max = TEST_X
    workflow.y_min = workflow.y_max = TEST_Y

    workflow.acq_min = parse_date_min("1985")
    workflow.acq_max = parse_date_max("1994")

    workflow.epoch = EpochParameter(5, 5)

    workflow.seasons = Season
    # workflow.seasons = [Season.SPRING]

    workflow.satellites = [Satellite.LS5, Satellite.LS7]

    workflow.output_directory = "/tmp"

    workflow.mask_pqa_apply = True
    workflow.mask_pqa_mask = [PqaMask.PQ_MASK_SATURATION, PqaMask.PQ_MASK_CONTIGUITY, PqaMask.PQ_MASK_CLOUD]

    # workflow.local_scheduler = None
    # workflow.workers = None

    workflow.dataset_type = DatasetType.ARG25
    workflow.bands = Ls57Arg25Bands

    workflow.x_chunk_size = 4000
    workflow.y_chunk_size = 4000

    workflow.statistics = [Statistic.PERCENTILE_25, Statistic.PERCENTILE_50, Statistic.PERCENTILE_75]

    from luigi.task import flatten

    tasks = flatten(workflow.create_tasks())

    assert len(tasks) == len(workflow.seasons) * len(TILE_COUNTS)

    for task in tasks:
        _log.info("task = %s", task)

        for output in flatten(task.output()):
            _log.info("output %s", output.path)

        chunk_tasks = flatten(task.requires())

        assert len(chunk_tasks) == len(Ls57Arg25Bands)

        for chunk_task in chunk_tasks:
            _log.info("chunk task %s", chunk_task)

            for output in flatten(chunk_task.output()):
                _log.info("output %s", output.path)

            tiles = list(chunk_task.get_tiles())
            _log.info("Found %d tiles", len(tiles))

            assert len(tiles) == TILE_COUNTS[chunk_task.acq_min.year][chunk_task.season]

            for tile in tiles:
                _log.info("\t%s", tile.end_datetime)
