def _targets_to_remove(self):
    outs = []
    # original data
    # TODO: check if data is local, we don't want to delete that
    download_task = DownloadExperiment(self.experiment_id, source='gemma')
    outs.extend(flatten_output(download_task))
    # any data resulting from trimming raw reads
    trim_task = TrimExperiment(self.experiment_id, source='gemma')
    outs.extend(flatten_output(trim_task))
    return outs
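# Illustrative sketch only: one hypothetical way the `_targets_to_remove` helper
# above could be consumed by a cleanup task. `CleanupExperiment` is an assumption,
# not part of the original snippet; it presumes `_targets_to_remove()` is defined
# on (or mixed into) the class.
import luigi

class CleanupExperiment(luigi.Task):
    experiment_id = luigi.Parameter()

    def run(self):
        for out in self._targets_to_remove():
            # FileSystemTarget subclasses expose remove(); guard against targets that don't
            if hasattr(out, 'remove') and out.exists():
                out.remove()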
def infer_bulk_complete_from_fs(task_cls, finite_datehours):
    """
    Efficiently determines missing datehours by filesystem listing.

    The current implementation works for the common case of a task writing
    output to a FileSystemTarget whose path is built using strftime with a
    format like '...%Y...%m...%d...%H...', without custom complete() or exists().

    (Eventually Luigi could have ranges of completion as first-class citizens.
    Then this listing business could be factored away/be provided for
    explicitly in the target API or some kind of a history server.)

    TODO support RangeDaily
    """
    filesystems_and_globs_by_location = _get_filesystems_and_globs(task_cls)
    paths_by_datehour = [[o.path for o in flatten_output(task_cls(d))] for d in finite_datehours]
    listing = set()
    for (f, g), p in zip(filesystems_and_globs_by_location, zip(*paths_by_datehour)):
        # transposed, so here we're iterating over logical outputs, not datehours
        listing |= _list_existing(f, g, p)

    # quickly learn everything that's missing
    missing_datehours = []
    for d, p in zip(finite_datehours, paths_by_datehour):
        if not set(p) <= listing:
            missing_datehours.append(d)

    return missing_datehours
def infer_bulk_complete_from_fs(datetimes, datetime_to_task, datetime_to_re):
    """
    Efficiently determines missing datetimes by filesystem listing.

    The current implementation works for the common case of a task writing
    output to a FileSystemTarget whose path is built using strftime with a
    format like '...%Y...%m...%d...%H...', without custom complete() or exists().

    (Eventually Luigi could have ranges of completion as first-class citizens.
    Then this listing business could be factored away/be provided for
    explicitly in the target API or some kind of a history server.)
    """
    filesystems_and_globs_by_location = _get_filesystems_and_globs(datetime_to_task, datetime_to_re)
    paths_by_datetime = [[o.path for o in flatten_output(datetime_to_task(d))] for d in datetimes]
    listing = set()
    for (f, g), p in zip(filesystems_and_globs_by_location, zip(*paths_by_datetime)):
        # transposed, so here we're iterating over logical outputs, not datetimes
        listing |= _list_existing(f, g, p)

    # quickly learn everything that's missing
    missing_datetimes = []
    for d, p in zip(datetimes, paths_by_datetime):
        if not set(p) <= listing:
            missing_datetimes.append(d)

    return missing_datetimes
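# Illustrative usage sketch for the datetime-based variant above. `LogProcessor`
# and its hourly path layout are hypothetical; the regex built by the last lambda
# follows the '(%04d).*(%02d).*(%02d).*(%02d)' capture-group convention used by the
# datehour variant further below.
import luigi
from datetime import datetime, timedelta

class LogProcessor(luigi.Task):
    hour = luigi.DateHourParameter()

    def output(self):
        # the path embeds %Y, %m, %d and %H, as the docstring requires
        return luigi.LocalTarget(self.hour.strftime('/data/logs/%Y/%m/%d/%H/processed.tsv'))

start = datetime(2016, 1, 2, 0)
hours = [start + timedelta(hours=h) for h in range(24)]
missing = infer_bulk_complete_from_fs(
    hours,
    lambda d: LogProcessor(hour=d),
    lambda d: d.strftime('(%Y).*(%m).*(%d).*(%H)'),  # one capture group per datetime component
)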
def _get_filesystems_and_globs(datetime_to_task, datetime_to_re):
    """
    Yields a (filesystem, glob) tuple for every output location of the task.

    The task can have one or several FileSystemTarget outputs.

    For convenience, the task can be a luigi.WrapperTask, in which case the
    outputs of all its dependencies are considered.
    """
    # probe some scattered datetimes unlikely to all occur in paths, other than
    # by being sincere datetime parameter's representations
    # TODO limit to [self.start, self.stop) so messages are less confusing? Done trivially it can kill correctness
    sample_datetimes = [datetime(y, m, d, h) for y in range(2000, 2050, 10)
                        for m in range(1, 4) for d in range(5, 8) for h in range(21, 24)]
    regexes = [re.compile(datetime_to_re(d)) for d in sample_datetimes]
    sample_tasks = [datetime_to_task(d) for d in sample_datetimes]
    sample_outputs = [flatten_output(t) for t in sample_tasks]

    for o, t in zip(sample_outputs, sample_tasks):
        if len(o) != len(sample_outputs[0]):
            raise NotImplementedError("Outputs must be consistent over time, sorry; "
                                      "was %r for %r and %r for %r" % (o, t, sample_outputs[0], sample_tasks[0]))
            # TODO fall back on requiring the last couple of days? to avoid astonishing blocking when changes like that are deployed
            # erm, actually it's not hard to test the entire hours_back..hours_forward and split into consistent subranges FIXME?
        for target in o:
            if not isinstance(target, FileSystemTarget):
                raise NotImplementedError("Output targets must be instances of FileSystemTarget; was %r for %r" % (target, t))

    for o in zip(*sample_outputs):  # transposed, so here we're iterating over logical outputs, not datetimes
        glob = _get_per_location_glob(sample_tasks, o, regexes)
        yield o[0].fs, glob
def _get_filesystems_and_globs(task_cls):
    """
    Yields a (filesystem, glob) tuple for every output location of task_cls.

    task_cls can have one or several FileSystemTarget outputs.

    For convenience, task_cls can be a wrapper task, in which case the outputs
    of all its dependencies are considered.
    """
    # probe some scattered datehours unlikely to all occur in paths, other than
    # by being sincere datehour parameter's representations
    # TODO limit to [self.start, self.stop) so messages are less confusing? Done trivially it can kill correctness
    sample_datehours = [datetime(y, m, d, h) for y in range(2000, 2050, 10)
                        for m in range(1, 4) for d in range(5, 8) for h in range(21, 24)]
    regexes = [re.compile('(%04d).*(%02d).*(%02d).*(%02d)' % (d.year, d.month, d.day, d.hour))
               for d in sample_datehours]
    sample_tasks = [task_cls(d) for d in sample_datehours]
    sample_outputs = [flatten_output(t) for t in sample_tasks]

    for o, t in zip(sample_outputs, sample_tasks):
        if len(o) != len(sample_outputs[0]):
            raise NotImplementedError("Outputs must be consistent over time, sorry; "
                                      "was %r for %r and %r for %r" % (o, t, sample_outputs[0], sample_tasks[0]))
            # TODO fall back on requiring the last couple of days? to avoid astonishing blocking when changes like that are deployed
            # erm, actually it's not hard to test the entire hours_back..hours_forward and split into consistent subranges FIXME?
        for target in o:
            if not isinstance(target, FileSystemTarget):
                raise NotImplementedError("Output targets must be instances of FileSystemTarget; was %r for %r" % (target, t))

    for o in zip(*sample_outputs):  # transposed, so here we're iterating over logical outputs, not datehours
        glob = _get_per_location_glob(sample_tasks, o, regexes)
        yield o[0].fs, glob
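# To make the probing step concrete: a small standalone illustration (not part of
# the snippet above) of how one sampled regex locates the datehour components in a
# matching output path.
import re
from datetime import datetime

d = datetime(2000, 1, 5, 21)  # the first sample datehour generated above
pattern = re.compile('(%04d).*(%02d).*(%02d).*(%02d)' % (d.year, d.month, d.day, d.hour))
path = d.strftime('/data/logs/%Y/%m/%d/%H/out.tsv')  # '/data/logs/2000/01/05/21/out.tsv'
print(pattern.search(path).groups())  # ('2000', '01', '05', '21')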
def remove_task_output(task):
    logger.info('Cleaning up %s...', repr(task))
    for out in flatten_output(task):
        if hasattr(out, 'remove') and out.exists():
            try:
                out.remove()
                logger.info('Removed %s.', repr(out))
            except Exception:
                # a bare except here would also swallow KeyboardInterrupt/SystemExit
                logger.exception('Failed to remove %s.', repr(out))
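# Hypothetical call site for remove_task_output(); `ReportTask` is illustrative only.
import luigi
from datetime import date

class ReportTask(luigi.Task):
    day = luigi.DateParameter()

    def output(self):
        return luigi.LocalTarget(self.day.strftime('/tmp/reports/%Y-%m-%d.csv'))

# e.g. discard a partial output so the task can be rerun from a clean slate
remove_task_output(ReportTask(day=date(2016, 1, 2)))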
def on_failure(self, err):
    logger.info('Removing task output of %s due to failure.', repr(self))
    for out in flatten_output(self):
        if out.exists() and hasattr(out, 'remove'):
            try:
                out.remove()
            except Exception:
                # a bare except here would also swallow KeyboardInterrupt/SystemExit
                logger.exception('Failed to remove output %s while cleaning up %s.', repr(out), repr(self))
    return super(RemoveTaskOutputOnFailureMixin, self).on_failure(err)
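# Illustrative use of the mixin above; `FragileExport` and its non-atomic write are
# assumptions, chosen so a partial output actually exists for on_failure() to remove.
import luigi

class FragileExport(RemoveTaskOutputOnFailureMixin, luigi.Task):
    def output(self):
        return [luigi.LocalTarget('/tmp/export_a.csv'), luigi.LocalTarget('/tmp/export_b.csv')]

    def run(self):
        with open(self.output()[0].path, 'w') as f:  # written non-atomically on purpose
            f.write('partial\n')
        raise RuntimeError('simulated failure')  # on_failure() then removes export_a.csv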
def missing_datehours(self, task_cls, finite_datehours):
    """
    Infers missing datehours by listing the filesystem of the task's output target(s).
    """
    filesystems_and_globs_by_location = self._get_filesystems_and_globs(task_cls)
    paths_by_datehour = [[o.path for o in flatten_output(task_cls(d))] for d in finite_datehours]
    listing = set()
    for (f, g), p in zip(filesystems_and_globs_by_location, zip(*paths_by_datehour)):
        # transposed, so here we're iterating over logical outputs, not datehours
        listing |= self._list_existing(f, g, p)

    # quickly learn everything that's missing
    missing_datehours = []
    for d, p in zip(finite_datehours, paths_by_datehour):
        if not set(p) <= listing:
            missing_datehours.append(d)

    return missing_datehours
def run(self):
    for out in flatten_output(self):
        out.makedirs()
    return super(CreateTaskOutputDirectoriesBeforeRunMixin, self).run()
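# Illustrative use of the directory-creating mixin; `HourlyReport` is hypothetical.
# Its run() cooperatively calls super() first, so the mixin's run() gets a chance to
# makedirs() each output before any files are written.
import luigi

class HourlyReport(CreateTaskOutputDirectoriesBeforeRunMixin, luigi.Task):
    hour = luigi.DateHourParameter()

    def output(self):
        return luigi.LocalTarget(self.hour.strftime('/data/reports/%Y/%m/%d/report_%H.txt'))

    def run(self):
        super(HourlyReport, self).run()  # mixin creates /data/reports/%Y/%m/%d first
        with open(self.output().path, 'w') as f:
            f.write('report body\n')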