def on_running(self):
    """
    Log memory consumption as the computation goes on; it only works
    when the environment variable OQ_NO_DISTRIBUTE is set, since it
    is intended for debugging purposes.
    """
    if no_distribute():
        logs.LOG.warn('PyMem: %d mb, PgMem: %d mb' % self.mem_peaks)
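
# Hedged sketch, not engine API: `no_distribute` reads the OQ_NO_DISTRIBUTE
# environment variable, so the memory logging above can be switched on from
# a driver script before the computation starts. The helper name below is
# illustrative only.

def _enable_memory_logging():
    import os
    os.environ['OQ_NO_DISTRIBUTE'] = '1'  # run tasks in-process, log memory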
def map_reduce(task, task_args, agg, acc):
    """
    Given a task and an iterable of positional arguments, apply the
    task function to the arguments in parallel and return an aggregate
    result depending on the initial value of the accumulator and on
    the aggregation function. To save memory, the order is not
    preserved and there is no list with the intermediate results:
    the accumulator is incremented as soon as a task result comes.

    NB: if the environment variable OQ_NO_DISTRIBUTE is set the
    tasks are run sequentially in the current process and then
    map_reduce(task, task_args, agg, acc) is the same as
    reduce(agg, itertools.starmap(task, task_args), acc).
    Users of map_reduce should be aware of the fact that when
    thousands of tasks are spawned and large arguments are passed
    or large results are returned they may incur memory issues:
    this is why the calculators limit the queue with the
    `concurrent_task` concept.

    :param task: a `celery` task callable
    :param task_args: an iterable over positional arguments
    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator
    """
    if no_distribute():
        for the_args in task_args:
            result, exctype = safely_call(task.task_func, the_args)
            if exctype:
                raise RuntimeError(result)
            acc = agg(acc, result)
    else:
        backend = current_app().backend
        unpik = 0
        job_id = task_args[0][0]
        taskname = task.__name__
        mon = LightMonitor("unpickling %s" % taskname, job_id, task)
        to_send = 0
        pickled_args = []
        for args in task_args:
            piks = pickle_sequence(args)
            pickled_args.append(piks)
            to_send += sum(len(p) for p in piks)
        logs.LOG.info("Sending %dM", to_send / ONE_MB)
        taskset = TaskSet(tasks=map(task.subtask, pickled_args))
        for task_id, result_dict in taskset.apply_async().iter_native():
            check_mem_usage()  # log a warning if too much memory is used
            result_pik = result_dict["result"]
            with mon:
                result, exctype = result_pik.unpickle()
            if exctype:
                raise RuntimeError(result)
            unpik += len(result_pik)
            acc = agg(acc, result)
            del backend._cache[task_id]  # work around a celery bug
        logs.LOG.info("Unpickled %dM of received data in %s seconds",
                      unpik / ONE_MB, mon.duration)
    return acc
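
# Hedged usage sketch, not engine code: `partial_sum` and `FakeTask` are
# illustrative stand-ins for a real celery task. FakeTask only mimics the
# `.task_func` attribute used by the no_distribute branch of map_reduce,
# so this example assumes OQ_NO_DISTRIBUTE is set; with a real task and no
# env variable the same call would go through the TaskSet branch.

def _map_reduce_usage_example():
    import operator

    def partial_sum(job_id, block):  # stand-in for a celery task body
        return sum(block)

    class FakeTask(object):  # minimal stand-in for a celery task
        task_func = staticmethod(partial_sum)
        __name__ = 'partial_sum'

    task_args = [(42, range(10)), (42, range(10, 20))]  # job_id comes first
    # equivalent to reduce(operator.add,
    #                      itertools.starmap(partial_sum, task_args), 0)
    return map_reduce(FakeTask(), task_args, agg=operator.add, acc=0)  # 190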
def _map_reduce(task_func, task_args, agg, acc):
    """
    Given a callable and an iterable of positional arguments, apply the
    callable to the arguments in parallel and return an aggregate
    result depending on the initial value of the accumulator and on
    the aggregation function. To save memory, the order is not
    preserved and there is no list with the intermediate results:
    the accumulator is incremented as soon as a task result comes.

    :param task_func: a `celery` task callable
    :param task_args: an iterable over positional arguments
    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator

    NB: if the environment variable OQ_NO_DISTRIBUTE is set the
    tasks are run sequentially in the current process and then
    map_reduce(task_func, task_args, agg, acc) is the same as
    reduce(agg, itertools.starmap(task_func, task_args), acc).
    Users of map_reduce should be aware of the fact that when
    thousands of tasks are spawned and large arguments are passed
    or large results are returned they may incur memory issues:
    this is why the calculators limit the queue with the
    `concurrent_task` concept.
    """
    if no_distribute():
        for the_args in task_args:
            acc = agg(acc, task_func(*the_args))
    else:
        taskset = TaskSet(tasks=map(task_func.subtask, task_args))
        for result in taskset.apply_async():
            if isinstance(result, Exception):
                # TODO: kill all the other tasks
                raise result
            acc = agg(acc, result)
    return acc
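
# Hedged sketch demonstrating the equivalence documented above: with
# OQ_NO_DISTRIBUTE set, _map_reduce(task_func, task_args, agg, acc)
# computes the same value as
# reduce(agg, itertools.starmap(task_func, task_args), acc).
# `add_squares` is an ordinary function used for illustration only.

def _sequential_equivalence_example():
    import functools
    import itertools
    import operator

    def add_squares(x, y):
        return x ** 2 + y ** 2

    task_args = [(1, 2), (3, 4), (5, 6)]
    # (1+4) + (9+16) + (25+36) == 91
    return functools.reduce(
        operator.add, itertools.starmap(add_squares, task_args), 0)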