Example #1
 def complete(self):
     """
     If the task has any outputs, return true if all outputs exists.
     Otherwise, return whether or not the task has run or not
     """
     if self.print_config:
         return True
     outputs = flatten(self.output())
     inputs = flatten(self.input())
     if self.dry_run:
         return False
     if self.restart:
         return False
     if len(outputs) == 0:
         # TODO: unclear if tasks without outputs should always run or never run
         warnings.warn("Task %r without outputs has no custom complete() method" % self)
         return False
     for output in outputs:
         if not output.exists():
             return False
         # Local addition: if any dependency is newer, then run
         if any(os.stat(x.fn).st_mtime > os.stat(output.fn).st_mtime
                for x in inputs if x.exists()):
             return False
     return True
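
Example #1 layers a make-style freshness check on top of luigi's default "all outputs exist" rule. Below is a minimal self-contained sketch of the same idea; the class name is hypothetical, and it assumes local filesystem targets that expose a .path attribute:

import os

import luigi
from luigi.task import flatten

class MtimeAwareTask(luigi.Task):
    """Rerun when any existing input is newer than the oldest output."""

    def complete(self):
        outputs = flatten(self.output())
        inputs = flatten(self.input())
        if not outputs or not all(o.exists() for o in outputs):
            return False
        oldest_output = min(os.path.getmtime(o.path) for o in outputs)
        # Incomplete (i.e. rerun) if any existing input is newer than an output.
        return not any(os.path.getmtime(i.path) > oldest_output
                       for i in inputs if i.exists())
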
Example #2
    def complete(self):
        from luigi.task import flatten

        for output in flatten(self.output()):
            if not output.exists():
                return False

        for dep in flatten(self.deps()):
            if not dep.complete():
                return False

        return True
Example #3
    def _run_get_new_deps(self):
        try:
            t0 = time.time()
            task_gen = self.task.run()
        finally:
            self.task.trigger_event(
                Event.PROCESSING_TIME, self.task, time.time() - t0)

        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            new_deps = [(t.task_module, t.task_family, t.to_str_params())
                        for t in new_req]
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                return new_deps
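
Example #3 (and the _run_get_new_deps variants that follow) drives luigi's dynamic-dependency protocol: a task's run() may be a generator that yields new requirements, and the yield expression resumes with the corresponding output targets once those requirements are complete. A minimal sketch of a task on the other side of that protocol (task and file names are hypothetical):

import luigi

class Fetch(luigi.Task):
    def output(self):
        return luigi.LocalTarget('data.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('hello\n')

class Process(luigi.Task):
    def output(self):
        return luigi.LocalTarget('result.txt')

    def run(self):
        # Yielding a task suspends run(); the worker schedules it and
        # resumes here with its output target once it is complete.
        fetched = yield Fetch()
        with fetched.open('r') as src, self.output().open('w') as dst:
            dst.write(src.read().upper())
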
Example #4
    def _run_get_new_deps(self):
        # set task callbacks before running
        for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
            setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr))

        task_gen = self.task.run()

        # reset task callbacks
        for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
            setattr(self.task, task_attr, None)

        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                new_deps = [(t.task_module, t.task_family, t.to_str_params())
                            for t in new_req]
                return new_deps
Example #5
    def _run_get_new_deps(self):
        run_again = False
        try:
            task_gen = self.task.run(tracking_url_callback=self.tracking_url_callback)
        except TypeError as ex:
            if 'unexpected keyword argument' not in getattr(ex, 'message', ex.args[0]):
                raise
            run_again = True
        if run_again:
            task_gen = self.task.run()
        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            new_deps = [(t.task_module, t.task_family, t.to_str_params())
                        for t in new_req]
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                return new_deps
Example #6
    def _run_get_new_deps(self):
        self.task.set_tracking_url = self.tracking_url_callback
        self.task.set_status_message = self.status_message_callback

        task_gen = self.task.run()

        self.task.set_tracking_url = None
        self.task.set_status_message = None

        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                new_deps = [(t.task_module, t.task_family, t.to_str_params())
                            for t in new_req]
                return new_deps
Example #7
def generate_tasks_manual():
    """ Return a formatted listing of all tasks with their descriptions. """
    from siskin.sources import *
    from siskin.workflows import *

    output = StringIO.StringIO()
    # task_tuples = sorted(Register.get_reg().iteritems())
    task_names = Register.task_names()
    output.write(MAN_HEADER)
    output.write('  {0} tasks found\n\n'.format(len(task_names)))

    for name in task_names:
        klass = Register.get_task_cls(name)
        doc = klass.__doc__ or colors.red("@todo: docs")
        output.write('{0} {1}\n'.format(colors.green(name), doc))

        try:
            deps = flatten(klass().requires())
        except Exception:
            # TODO: tasks that have required arguments will fail here
            deps = []
            formatted = colors.yellow("\tUnavailable since task has required parameters.")
        else:
            formatted = '\t{0}'.format(pprint.pformat(deps).replace('\n', '\n\t'))
        output.write(colors.magenta('\n\tDependencies ({0}):\n\n{1}\n\n'.format(len(deps), formatted)))

    return output.getvalue()
Example #8
 def args(self):
     """returns an array of args to pass to the job."""
     arglist = []
     for k, v in self.requires_hadoop().iteritems():
         arglist.append('--' + k)
         arglist.extend([t.output().path for t in flatten(v)])
     arglist.extend(['--output', self.output()])
     arglist.extend(self.job_args())
     return arglist
Example #9
def get_test_data_path(f=None):

    path = Path(__file__).parent.absolute()

    if f:
        from luigi.task import flatten

        for x in flatten(f):
            path = path.joinpath(x)

    return str(path)
Example #10
    def complete(self):
        if not self.output().exists():
            return False

        deps = flatten(self.requires())
        logging.debug("Deps: %s", [d.complete() for d in deps])
        for dep in deps:
            if not dep.complete():
                logging.info("Dependencies are incomplete, rerunning task.")
                return False

        inputs = flatten(self.input())
        output_ts = os.path.getmtime(self.output().path)
        input_ts = max([os.path.getmtime(in_file.path) for in_file in inputs])

        if input_ts > output_ts:
            logging.info("Dependencies are newer, rerunning task.")
            return False

        return True
Example #11
def convert_tasks_to_manifest_if_necessary(input_tasks):  # pylint: disable=invalid-name
    """
    Provide a manifest for the input paths if there are too many of them.

    The configuration section "manifest" can contain a "threshold" option which, when exceeded, causes this function
    to return a URLManifestTask instead of the original input_tasks.
    """
    all_input_tasks = task.flatten(input_tasks)
    targets = task.flatten(task.getpaths(all_input_tasks))
    threshold = configuration.get_config().getint(CONFIG_SECTION, 'threshold', -1)
    if threshold > 0 and len(targets) >= threshold:
        log.debug(
            'Using manifest since %d inputs are greater than or equal to the threshold %d', len(targets), threshold
        )
        return [URLManifestTask(urls=[target.path for target in targets])]
    else:
        log.debug(
            'Directly processing files since %d inputs are less than the threshold %d', len(targets), threshold
        )
        return all_input_tasks
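
The threshold is read from luigi's configuration, so the manifest path stays disabled until it is configured. A hypothetical client.cfg entry that would enable it (the docstring above names the section "manifest"; the value 500 is just an example):

[manifest]
threshold = 500
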
Example #12
    def prepare_outputs(self, job):
        """ Called before job is started

        If output is a `FileSystemTarget`, create parent directories so the hive command won't fail
        """
        outputs = flatten(job.output())
        for o in outputs:
            if isinstance(o, FileSystemTarget):
                parent_dir = os.path.dirname(o.path)
                if not o.fs.exists(parent_dir):
                    logger.info("Creating parent directory %r" % (parent_dir,))
                    try:
                        # there is a possible race condition
                        # which needs to be handled here
                        o.fs.mkdir(parent_dir)
                    except FileAlreadyExists:
                        pass
Example #13
    def _run_get_new_deps(self):
        self.task.set_tracking_url = self.tracking_url_callback
        self.task.set_status_message = self.status_message_callback

        def deprecated_tracking_url_callback(*args, **kwargs):
            warnings.warn("tracking_url_callback in run() args is deprecated, use "
                          "set_tracking_url instead.", DeprecationWarning)
            self.tracking_url_callback(*args, **kwargs)

        run_again = False
        try:
            task_gen = self.task.run(tracking_url_callback=deprecated_tracking_url_callback)
        except TypeError as ex:
            if 'unexpected keyword argument' not in str(ex):
                raise
            run_again = True
        if run_again:
            task_gen = self.task.run()

        self.task.set_tracking_url = None
        self.task.set_status_message = None

        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            new_deps = [(t.task_module, t.task_family, t.to_str_params())
                        for t in new_req]
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                return new_deps
Example #14
def walk_tree(task, target_task):
    task_name = task.__class__.__name__

    outputs = [o.path for o in luigi.task.flatten_output(task)]

    if task_name == target_task:
        return outputs

    children = flatten(task.requires())

    ret = []
    for c in children:
        files = walk_tree(c, target_task)
        # only propagate up output targets if the target task was encountered
        if files:
            ret.extend(files)

    if ret:
        ret.extend(outputs)
    return ret
Example #15
def print_tree(task, indent='', last=True):
    '''
    Return a string representation of the tasks, their statuses/parameters in a dependency tree format
    '''
    # don't bother printing out warnings about tasks with no output
    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore', message='Task .* without outputs has no custom complete\\(\\) method')
        is_task_complete = task.complete()
    is_complete = (bcolors.OKGREEN + 'COMPLETE' if is_task_complete else bcolors.OKBLUE + 'PENDING') + bcolors.ENDC
    name = task.__class__.__name__
    params = task.to_str_params(only_significant=True)
    result = '\n' + indent
    if last:
        result += '└─--'
        indent += '   '
    else:
        result += '|--'
        indent += '|  '
    result += '[{0}-{1} ({2})]'.format(name, params, is_complete)
    children = flatten(task.requires())
    for index, child in enumerate(children):
        result += print_tree(child, indent, (index+1) == len(children))
    return result
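
luigi ships an equivalent helper as luigi.tools.deps_tree.print_tree. A hypothetical invocation, assuming the function above and its bcolors/flatten helpers are in scope:

import luigi

class Leaf(luigi.Task):
    def complete(self):
        return True

class Root(luigi.Task):
    def requires(self):
        return [Leaf()]

print(print_tree(Root()))
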
Example #16
    def _init_local(self):
        def up(p):
            return os.path.dirname(os.path.normpath(p))

        def unstale(f):
            return f if os.path.exists(f) else unstale(up(f))

        self.unstales = [unstale(x.path) for x in flatten(self.output())]

        # Set up temp folder in shared directory (trim to max filename length)
        base_tmp_dir = self.shared_tmp_dir
        random_id = '%016x' % random.getrandbits(64)
        folder_name = _clean_task_id(self.task_id) + '-' + random_id
        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)
        max_filename_length = os.fstatvfs(0).f_namemax
        self.tmp_dir = self.tmp_dir[:max_filename_length]
        logger.info("Tmp dir: %s", self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Dump the code to be run into a pickle file
        logging.debug("Dumping pickled class")
        self._dump(self.tmp_dir)

        # Make sure that all the class's dependencies are tarred and available
        logging.debug("Tarballing dependencies")
        # Grab luigi and the module containing the code to be run
        packages = [luigi] + [__import__(self.__module__, None, None, 'dummy')]
        luigi.hadoop.create_packages_archive(packages, os.path.join(self.tmp_dir, "packages.tar"))
Example #17
    def _get_input_schema(self):
        '''Arbitrarily picks an object in input and reads the Avro schema from it.'''
        assert avro, 'avro module required'

        input_target = flatten(self.input())[0]
        input_fs = input_target.fs if hasattr(input_target, 'fs') else GCSClient()
        input_uri = self.source_uris()[0]
        if '*' in input_uri:
            file_uris = list(input_fs.list_wildcard(input_uri))
            if file_uris:
                input_uri = file_uris[0]
            else:
                raise RuntimeError('No match for ' + input_uri)

        schema = []
        exception_reading_schema = []

        def read_schema(fp):
            # fp contains the file part downloaded thus far. We rely on that the DataFileReader
            # initializes itself fine as soon as the file header with schema is downloaded, without
            # requiring the remainder of the file...
            try:
                reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())
                schema[:] = [reader.datum_reader.writers_schema]
            except Exception as e:
                # Save but assume benign unless schema reading ultimately fails. The benign
                # exception in case of insufficiently big downloaded file part seems to be:
                # TypeError('ord() expected a character, but string of length 0 found',).
                exception_reading_schema[:] = [e]
                return False
            return True

        input_fs.download(input_uri, 64 * 1024, read_schema).close()
        if not schema:
            raise exception_reading_schema[0]
        return schema[0]
Example #18
 def produce_output(self):
     with self.output().open('w') as o:
         o.write(repr([self.task_id] + sorted([eval_contents(i) for i in flatten(self.input())])))
Example #19
def get_workflow_alarm_puts(task):
    puts = get_task_alarm_puts(task)
    req = flatten(task.requires())
    for t in req:
        puts += get_workflow_alarm_puts(t)
    return puts
Example #20
 def on_success(self):
     pipeline_hash = utils.hash_pipeline(self)
     for o in flatten(self.output()):
         if isinstance(o, CommittedTarget):
             o.commit(task_family=self.task_family,
                      pipeline_hash=pipeline_hash)
Example #21
 def run(self):
     for t in task.flatten(self.subtasks()):
         t.run()
     task_that_delegates.run(self)
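
This run() appears to come from luigi's delegation support (luigi.util.delegates), where subtasks() returns work that the wrapper runs inline rather than registering with the scheduler. A minimal sketch with hypothetical task names:

import luigi
from luigi.util import delegates

class PartA(luigi.Task):
    def output(self):
        return luigi.LocalTarget('part_a.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('a\n')

@delegates
class Summary(luigi.Task):
    def subtasks(self):
        return PartA()

    def output(self):
        return luigi.LocalTarget('summary.txt')

    def run(self):
        # Run the delegated subtasks inline, then consume their outputs.
        for t in luigi.task.flatten(self.subtasks()):
            t.run()
        with self.output().open('w') as f:
            f.write('done\n')
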
Example #22
def main():

    workflow = Arg25BandStatisticsWorkflow()

    workflow.x_min = 125
    workflow.x_max = 126

    workflow.y_min = -35
    workflow.y_max = -34

    workflow.acq_min = parse_date_min("1985")
    workflow.acq_max = parse_date_max("2014")

    workflow.epoch = 5

    workflow.seasons = Season
    # workflow.seasons = [Season.SPRING]

    workflow.satellites = [Satellite.LS5, Satellite.LS7]

    workflow.output_directory = "/Users/simon/tmp/cube/output/test/arg25_stats_tasks"
    workflow.output_directory = "/Users/simon/tmp/cube/output/test/arg25_stats_tasks/ARG25_125_126_-035_-034_1985_2014_SUMMER_AUTUMN_WINTER_SPRING"

    workflow.mask_pqa_apply = True
    workflow.mask_pqa_mask = [PqaMask.PQ_MASK_SATURATION, PqaMask.PQ_MASK_CONTIGUITY, PqaMask.PQ_MASK_CLOUD]

    # workflow.local_scheduler = None
    # workflow.workers = None

    workflow.dataset_type = [DatasetType.ARG25]
    workflow.bands = Ls57Arg25Bands

    workflow.x_chunk_size = 1000
    workflow.y_chunk_size = 1000

    workflow.statistics = [Statistic.PERCENTILE_25, Statistic.PERCENTILE_50, Statistic.PERCENTILE_75]

    from luigi.task import flatten

    tasks = flatten(workflow.create_tasks())

    print(tasks)

    for task in tasks:
        _log.info("task = %s", task)

        path = os.path.join(workflow.output_directory, task.output().path.replace("_STATS.tif", ""))
        os.makedirs(path)

        for output in flatten(task.output()):
            _log.info("output %s", output.path)
            # print output.path.replace("_STATS.tif", "")

        chunk_tasks = flatten(task.requires())

        for chunk_task in chunk_tasks:
            _log.info("chunk task %s", chunk_task)

            for output in flatten(chunk_task.output()):
                _log.info("output %s", output.path)
                # print "\t" + output.path.replace(".npy", "")
                os.makedirs(os.path.join(path, output.path.replace(".npy", "")))
Example #23
def get_task_requires(task):
    return set(flatten(task.requires()))
Example #24
 def get_local_files(self, task):
     # recursively gets local files from each task's requires()
     r = flatten(task.output())
     for dependency in flatten(task.requires()):
         r += self.get_local_files(dependency)
     return r
Example #25
 def complete(self):
     return all(r.complete() for r in flatten(self.requires()))
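
This is the completion rule of luigi's built-in WrapperTask, so an aggregating task can usually just subclass it. A short sketch with hypothetical task names:

import luigi

class StepOne(luigi.Task):
    def output(self):
        return luigi.LocalTarget('one.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('1\n')

class Everything(luigi.WrapperTask):
    # Complete exactly when all requirements are complete, as in Example #25.
    def requires(self):
        return [StepOne()]
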
Example #26
 def outputs(self) -> List[Target]:
     return flatten(self.output())
Example #27
 def source_uris(self):
     return [self._avro_uri(x) for x in flatten(self.input())]
Example #28
    def run(self):
        logger.info('[pid %s] Worker %s running   %s', os.getpid(),
                    self.worker_id, self.task.task_id)

        if self.random_seed:
            # Need to have different random seeds if running in separate processes
            random.seed((os.getpid(), time.time()))

        status = FAILED
        error_message = ''
        missing = []
        new_deps = []
        try:
            # Verify that all the tasks are fulfilled!
            missing = [
                dep.task_id for dep in self.task.deps() if not dep.complete()
            ]
            if missing:
                deps = 'dependency' if len(missing) == 1 else 'dependencies'
                raise RuntimeError('Unfulfilled %s at run time: %s' %
                                   (deps, ', '.join(missing)))
            self.task.trigger_event(Event.START, self.task)
            t0 = time.time()
            status = None
            try:
                task_gen = self.task.run()
                if isinstance(task_gen, types.GeneratorType):  # new deps
                    next_send = None
                    while True:
                        try:
                            if next_send is None:
                                requires = six.next(task_gen)
                            else:
                                requires = task_gen.send(next_send)
                        except StopIteration:
                            break

                        new_req = flatten(requires)
                        status = (RUNNING if all(
                            t.complete() for t in new_req) else SUSPENDED)
                        new_deps = [(t.task_module, t.task_family,
                                     t.to_str_params()) for t in new_req]
                        if status == RUNNING:
                            self.result_queue.put((self.task.task_id, status,
                                                   '', missing, new_deps))
                            next_send = getpaths(requires)
                            new_deps = []
                        else:
                            logger.info(
                                '[pid %s] Worker %s new requirements      %s',
                                os.getpid(), self.worker_id, self.task.task_id)
                            return
            finally:
                if status != SUSPENDED:
                    self.task.trigger_event(Event.PROCESSING_TIME, self.task,
                                            time.time() - t0)
            error_message = json.dumps(self.task.on_success())
            logger.info('[pid %s] Worker %s done      %s', os.getpid(),
                        self.worker_id, self.task.task_id)
            self.task.trigger_event(Event.SUCCESS, self.task)
            status = DONE

        except KeyboardInterrupt:
            raise
        except BaseException as ex:
            status = FAILED
            logger.exception("[pid %s] Worker %s failed    %s", os.getpid(),
                             self.worker_id, self.task)
            error_message = notifications.wrap_traceback(
                self.task.on_failure(ex))
            self.task.trigger_event(Event.FAILURE, self.task, ex)
            subject = "Luigi: %s FAILED" % self.task
            notifications.send_error_email(subject, error_message)
        finally:
            self.result_queue.put(
                (self.task.task_id, status, error_message, missing, new_deps))
Example #29
 def complete(self):
     if all(req.complete() for req in flatten(self.requires())):
         if not self.is_batch_info_usable():
             logger.info('Batch info is unusable for %s.', self.experiment_id)
             return True
     return super().complete()
Example #30
 def requires(self):
     return flatten([b.requires() for b in self.builders])
Example #31
 def deps(self):
     # Overrides method in base class
     return task.flatten(self.requires()) + task.flatten([t.deps() for t in task.flatten(self.subtasks())])
Example #32
 def run(self):
     for t in task.flatten(self.subtasks()):
         t.run()
     task_that_delegates.run(self)
Example #33
 def deps(self):
     # Overrides method in base class
     return task.flatten(self.requires()) + task.flatten(
         [t.deps() for t in task.flatten(self.subtasks())])
Example #34
 def dependencies(self) -> List[Task]:
     return flatten(self.requires())
Example #35
 def source_uris(self):
     return [self._avro_uri(x) for x in flatten(self.input())]
Example #36
def test_create_tasks():
    
    TILE_COUNTS = {
        1985: {Season.SUMMER: 34, Season.AUTUMN: 29, Season.WINTER: 36, Season.SPRING: 34},
        1990: {Season.SUMMER: 53, Season.AUTUMN: 65, Season.WINTER: 65, Season.SPRING: 57}
    }

    workflow = Arg25BandStatisticsWorkflow()
    
    workflow.x_min = workflow.x_max = TEST_X
    workflow.y_min = workflow.y_max = TEST_Y

    workflow.acq_min = parse_date_min("1985")
    workflow.acq_max = parse_date_max("1994")

    workflow.epoch = EpochParameter(5, 5)

    workflow.seasons = Season
    # workflow.seasons = [Season.SPRING]

    workflow.satellites = [Satellite.LS5, Satellite.LS7]

    workflow.output_directory = "/tmp"

    workflow.mask_pqa_apply = True
    workflow.mask_pqa_mask = [PqaMask.PQ_MASK_SATURATION, PqaMask.PQ_MASK_CONTIGUITY, PqaMask.PQ_MASK_CLOUD]

    # workflow.local_scheduler = None
    # workflow.workers = None

    workflow.dataset_type = DatasetType.ARG25
    workflow.bands = Ls57Arg25Bands

    workflow.x_chunk_size = 4000
    workflow.y_chunk_size = 4000

    workflow.statistics = [Statistic.PERCENTILE_25, Statistic.PERCENTILE_50, Statistic.PERCENTILE_75]

    from luigi.task import flatten

    tasks = flatten(workflow.create_tasks())

    assert(len(tasks) == len(workflow.seasons) * len(TILE_COUNTS))

    for task in tasks:
        _log.info("task = %s", task)

        for output in flatten(task.output()):
            _log.info("output %s", output.path)

        chunk_tasks = flatten(task.requires())

        assert(len(chunk_tasks) == len(Ls57Arg25Bands))

        for chunk_task in chunk_tasks:
            _log.info("chunk task %s", chunk_task)

            for output in flatten(chunk_task.output()):
                _log.info("output %s", output.path)

            tiles = list(chunk_task.get_tiles())

            _log.info("Found %d tiles", len(tiles))

            assert (len(tiles) == TILE_COUNTS[chunk_task.acq_min.year][chunk_task.season])

            for tile in tiles:
                _log.info("\t%s", tile.end_datetime)
Example #37
def get_task_requires(task):
    return set(flatten(task.requires()))
Example #38
 def complete(self):
     return all(r.complete() for r in flatten(self.requires()))
Example #39
 def produce_output(self):
     with self.output().open('w') as o:
         o.write(
             repr([self.task_id] +
                  sorted([eval_contents(i)
                          for i in flatten(self.input())])))