async def func(work=work):
    while True:
        while (
            isinstance(work.input_queue, tuple)
            and all(not q for q in work.input_queue)
        ) or not work.input_queue:
            await trio.sleep(self.sleep_time)
            if self._has_to_stop:
                return
        t_start = time.time()
        log_memory_usage(
            f"{time.time() - self.t_start:.2f} s. Launch work "
            + work.name_no_space
            + " (?). mem usage"
        )
        work.func_or_cls(work.input_queue, work.output_queue)
        if self._has_to_stop:
            return
        await trio.sleep(self.sleep_time)
        logger.info(
            f"work {work.name_no_space} "
            f"done in {time.time() - t_start:.3f} s"
        )
        await trio.sleep(self.sleep_time)
def signal_handler(sig, frame):
    del sig, frame  # unused
    logger.info("Ctrl+C signal received...")
    self._has_to_stop = True
    self.nursery.cancel_scope.cancel()
    # raise KeyboardInterrupt so that the caller also sees the interruption
    raise KeyboardInterrupt
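# A minimal, self-contained sketch (illustrative names, not the real
# executor API) of how such a handler can be registered so that Ctrl+C
# (SIGINT) sets a stop flag before interrupting:
import signal


def make_handler(state):
    """Build a handler that records the stop request in `state`."""

    def handler(sig, frame):
        del sig, frame  # unused
        state["has_to_stop"] = True
        raise KeyboardInterrupt

    return handler


state = {"has_to_stop": False}
signal.signal(signal.SIGINT, make_handler(state))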
def fill_queue_paths(self, input_queue, output_queue):
    """Fill the first queue (paths)"""
    assert input_queue is None

    serie = self.serie
    if not serie:
        logger.warning("add 0 image. No image to process.")
        return

    names = serie.get_name_arrays()

    for name in names:
        path_im_output = self.path_dir_result / name
        path_im_input = str(self.path_dir_src / name)
        if self.how_saving == "complete":
            # skip images whose result file already exists
            if not path_im_output.exists():
                output_queue[name] = path_im_input
        else:
            output_queue[name] = path_im_input

    if not output_queue:
        if self.how_saving == "complete":
            logger.warning('topology in mode "complete" and work already done.')
        else:
            logger.warning("Nothing to do")
        return

    nb_names = len(names)
    logger.info(f"Add {nb_names} images to compute.")
    logger.info(f"First files to process: {names[:4]}")
    logger.debug(f"All files: {names}")
def _init_compute_log(self):
    log_memory_usage(time_as_str(2) + ": starting execution. mem usage")
    logger.info(f"  topology: {str_short(type(self.topology))}")
    logger.info(f"  executor: {str_short(type(self))}")
    logger.info(f"  nb_cpus_allowed = {nb_cores}")
    logger.info(f"  nb_max_workers = {self.nb_max_workers}")
    logger.info(f"  path_dir_result = {self.path_dir_result}")
def exec_one_shot_works(self):
    """Execute all "one shot" functions."""
    for work in self.topology.works:
        if work.kind is not None and "one shot" in work.kind:
            pretty = str_short(work.func_or_cls.__func__)
            logger.info(f'Running "one_shot" job "{work.name}" ({pretty})')
            work.func_or_cls(work.input_queue, work.output_queue)
def in_time_loop(self):
    t_tmp = time()
    # iterate over a copy since workers may be removed during the loop
    for worker in list(workers):
        if (
            isinstance(worker, self.cls_to_be_updated)
            and worker.fill_destination()
        ):
            workers.remove(worker)
    t_tmp = time() - t_tmp
    if t_tmp > 0.2:
        logger.info(
            "update list of workers with fill_destination "
            f"done in {t_tmp:.3f} s"
        )
    sleep(dt_update)
def signal_handler(sig, frame):
    del sig, frame  # unused
    logger.info("Ctrl+C signal received...")
    for worker in self.workers:
        worker.terminate()
    self._has_to_stop = True
    self.nursery.cancel_scope.cancel()
    # we need to raise the exception
    raise KeyboardInterrupt
async def async_run_work_cpu(self, work):
    """Execute the work on the first item (key, obj) of the input queue
    and put the result in work.output_queue.

    Parameters
    ----------

    work :

      A work from the topology. The key and the object of the first item
      of its input queue (a dictionary) are popped and processed.

    """
    self.nb_working_workers_cpu += 1
    try:
        key, obj = work.input_queue.pop_first_item()
    except KeyError:
        self.nb_working_workers_cpu -= 1
        return
    if work.check_exception(key, obj):
        self.nb_working_workers_cpu -= 1
        return
    t_start = time.time()
    log_memory_usage(
        f"{time.time() - self.t_start:.2f} s. Launch work "
        + work.name_no_space
        + f" ({key}). mem usage"
    )
    # pylint: disable=W0703
    try:
        # here we do something very bad from the async point of view:
        # we launch a potentially long blocking function:
        ret = work.func_or_cls(obj)
    except Exception as error:
        self.log_exception(error, work.name_no_space, key)
        if self.stop_if_error:
            raise
        ret = error
    else:
        logger.info(
            f"work {work.name_no_space} ({key}) "
            f"done in {time.time() - t_start:.3f} s"
        )
    if work.output_queue is not None:
        work.output_queue[key] = ret
    self.nb_working_workers_cpu -= 1
async def start_async_works(self):
    """Create a trio nursery and start all async functions."""
    async with trio.open_nursery() as self.nursery:
        for af in reversed(self.async_funcs.values()):
            self.nursery.start_soon(af)
        self.nursery.start_soon(self.update_has_to_stop)

    # the nursery has exited: all async functions are done
    logger.info("terminate the servers")
    for worker in self.workers:
        worker.terminate()
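# Self-contained sketch of the nursery pattern used above (illustrative,
# not fluidimage code): start_soon schedules each coroutine and the
# `async with` block exits only once every child task has finished or
# the cancel scope has been cancelled.
import trio


async def child(name, delay):
    await trio.sleep(delay)
    print(f"{name} done")


async def main():
    async with trio.open_nursery() as nursery:
        for name, delay in [("a", 0.2), ("b", 0.1)]:
            nursery.start_soon(child, name, delay)
    print("all children finished")


trio.run(main)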
async def async_run_work_cpu(self, work):
    """Destined to be started with ``trio.start_soon``.

    Execute the work on the first item (key, obj) of the input queue and
    put the result in work.output_queue.

    Parameters
    ----------

    work :

      A work from the topology.

    """
    self.nb_working_workers_cpu += 1
    try:
        key, obj = work.input_queue.pop_first_item()
    except KeyError:
        self.nb_working_workers_cpu -= 1
        return
    if work.check_exception(key, obj):
        self.nb_working_workers_cpu -= 1
        return
    t_start = time.time()
    log_memory_usage(
        f"{time.time() - self.t_start:.2f} s. Launch work "
        + work.name_no_space
        + f" ({key}). mem usage"
    )
    # pylint: disable=W0703
    try:
        # run the blocking function in a worker thread so that the event
        # loop is not blocked
        ret = await trio.run_sync_in_worker_thread(work.func_or_cls, obj)
    except Exception as error:
        self.log_exception(error, work.name_no_space, key)
        if self.stop_if_error:
            raise
        ret = error
    else:
        logger.info(
            f"work {work.name_no_space} ({key}) "
            f"done in {time.time() - t_start:.3f} s"
        )
    if work.output_queue is not None:
        work.output_queue[key] = ret
    self.nb_working_workers_cpu -= 1
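# `trio.run_sync_in_worker_thread` is the API of older trio releases; in
# current trio the equivalent call is `trio.to_thread.run_sync`. A small
# version-agnostic shim (a sketch, not part of the executor):
import trio

try:
    run_sync_in_thread = trio.to_thread.run_sync  # trio >= 0.12
except AttributeError:
    run_sync_in_thread = trio.run_sync_in_worker_thread  # older trio


async def offload(func, obj):
    # run the blocking function in a worker thread so the event loop
    # stays responsive
    return await run_sync_in_thread(func, obj)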
def wait_for_all_processes(self):
    """logging + wait for all processes to finish"""
    logger.info(
        f"logging files: {[log_path.name for log_path in self.log_paths]}"
    )
    # wait until the end of all processes
    self.topology.results = results_all = []
    for process in self.processes:
        results = process.connection.recv()
        if results is not None:
            results_all.extend(results)
    for process in self.processes:
        process.join()
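# Sketch of the send/recv convention assumed above (illustrative names):
# each worker process holds one end of a multiprocessing.Pipe and sends
# its list of results exactly once before exiting.
from multiprocessing import Pipe, Process


def worker(conn):
    results = ["result0", "result1"]  # placeholder for real computation
    conn.send(results)
    conn.close()


if __name__ == "__main__":
    parent_conn, child_conn = Pipe()
    process = Process(target=worker, args=(child_conn,))
    process.start()
    print(parent_conn.recv())  # blocks until the worker sends its results
    process.join()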
def run_process():
    # we do this complicated thing because there may be a strange bug
    # in which a launched process never actually starts

    def start_process_and_check(index_attempt):
        process = Process(
            target=exec_work_and_comm,
            args=(work.func_or_cls, obj, child_conn, event),
        )
        process.daemon = True
        process.start()
        # check whether the process has really started (possible bug!)
        if not event.wait(1):
            log_debug(
                f"problem: process {work.name_no_space} ({key}) "
                f"has not really started... (attempt {index_attempt})"
            )
            process.terminate()
            return False
        return process

    really_started = False
    for index_attempt in range(10):
        process = start_process_and_check(index_attempt)
        if process:
            really_started = True
            break

    if not really_started:
        raise Exception(
            f"A process {work.name_no_space} ({key}) "
            "has not started after 10 attempts"
        )

    # todo: use parent_conn.poll to implement a timeout
    # log_debug(f"waiting for result ({key})")
    result = parent_conn.recv()
    # log_debug(f"result ({key}) received")
    process.join(10 * self.sleep_time)
    if process.exitcode != 0:
        logger.info(f"process.exitcode: {process.exitcode}")
        process.terminate()
    return result
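# The start-and-check trick relies on a multiprocessing.Event handshake:
# the child sets the event as its very first action, so a parent that
# times out on event.wait() can conclude the child never ran. Stripped
# down to its essentials (illustrative names):
from multiprocessing import Event, Process


def child(event):
    event.set()  # first statement: prove that we actually started
    # ... real work would go here ...


if __name__ == "__main__":
    event = Event()
    process = Process(target=child, args=(event,))
    process.daemon = True
    process.start()
    if not event.wait(1):
        # no signal within 1 s: assume the child never started
        process.terminate()
    else:
        process.join()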
def fill_queue_paths(self, input_queue, output_queues):
    assert input_queue is None
    queue_paths = output_queues[0]
    queue_couples_of_names = output_queues[1]

    serie = self.serie
    if len(serie) == 0:
        logger.warning("add 0 image. No image to process.")
        return

    names = serie.get_name_arrays()

    for name in names:
        path_im_output = self.path_dir_result / name
        path_im_input = str(self.path_dir_src / name)
        if self.how_saving == "complete":
            if not path_im_output.exists():
                queue_paths[name] = path_im_input
        else:
            queue_paths[name] = path_im_input

    if len(names) == 0:
        if self.how_saving == "complete":
            logger.warning('topology in mode "complete" and work already done.')
        else:
            logger.warning("Nothing to do")
        return

    nb_names = len(names)
    logger.info(f"Add {nb_names} images to compute.")
    logger.info(f"First files to process: {names[:4]}")
    logger.debug(f"All files: {names}")

    series = self.series
    if not series:
        logger.warning("add 0 couple. No phase to correct.")
        return

    nb_series = len(series)
    logger.info(f"Add {nb_series} phases to correct.")

    for iserie, serie in enumerate(series):
        if iserie > 1:
            break
        logger.info(f"Files of serie {iserie}: {serie.get_name_arrays()}")

    # for the first corrected angle: corrected_angle = angle
    ind_serie, serie = next(series.items())
    name = serie.get_name_arrays()[0]
    queue_couples_of_names[ind_serie - 1] = (name, name)

    for ind_serie, serie in series.items():
        queue_couples_of_names[ind_serie] = serie.get_name_arrays()
def fill_couples_of_names_and_paths(self, input_queue, output_queues):
    """Fill the first two queues"""
    assert input_queue is None
    queue_couples_of_names = output_queues[0]
    queue_paths = output_queues[1]

    series = self.series
    if not series:
        logger.warning("add 0 couple. No PIV to compute.")
        return

    if self.how_saving == "complete":
        index_series = []
        for ind_serie, serie in self.series.items():
            name_piv = get_name_piv(serie, prefix="piv")
            if not (self.path_dir_result / name_piv).exists():
                index_series.append(ind_serie)

        if not index_series:
            logger.warning('topology in mode "complete" and work already done.')
            return

        series.set_index_series(index_series)

        if logger.isEnabledFor(DEBUG):
            logger.debug(repr([serie.get_name_arrays() for serie in series]))

    nb_series = len(series)
    logger.info(f"Add {nb_series} PIV fields to compute.")

    for iserie, serie in enumerate(series):
        if iserie > 1:
            break
        logger.info(f"Files of serie {iserie}: {serie.get_name_arrays()}")

    for ind_serie, serie in series.items():
        queue_couples_of_names[ind_serie] = serie.get_name_arrays()
        for name, path in serie.get_name_path_arrays():
            queue_paths[name] = path
def _run_works(self):
    while not all(len(queue) == 0 for queue in self.topology.queues):
        for work in self.works:
            # global functions
            if work.kind is not None and "global" in work.kind:
                if len(work.output_queue) > self.nb_items_queue_max:
                    continue
                work.func_or_cls(work.input_queue, work.output_queue)
            else:
                if not work.input_queue:
                    continue
                key, obj = work.input_queue.pop_first_item()
                if work.check_exception(key, obj):
                    continue
                t_start = time.time()
                log_memory_usage(
                    f"{time.time() - self.t_start:.2f} s. Launch work "
                    + work.name_no_space
                    + f" ({key}). mem usage"
                )
                # pylint: disable=W0703
                try:
                    ret = work.func_or_cls(obj)
                except Exception as error:
                    self.log_exception(error, work.name_no_space, key)
                    if self.stop_if_error:
                        raise
                    ret = error
                else:
                    logger.info(
                        f"work {work.name_no_space} ({key}) "
                        f"done in {time.time() - t_start:.3f} s"
                    )
                if work.output_queue is not None:
                    work.output_queue[key] = ret
def __init__(
    self, queues, path_output=None, logging_level="info", nb_max_workers=None
):
    if path_output is not None:
        if not os.path.exists(path_output):
            os.makedirs(path_output)
        self.path_output = path_output
        log = os.path.join(
            path_output,
            "log_" + time_as_str() + "_" + str(os.getpid()) + ".txt",
        )
        self._log_file = open(log, "w")

        stdout = sys.stdout
        if isinstance(stdout, MultiFile):
            stdout = sys.__stdout__

        stderr = sys.stderr
        if isinstance(stderr, MultiFile):
            stderr = sys.__stderr__

        sys.stdout = MultiFile([stdout, self._log_file])
        sys.stderr = MultiFile([stderr, self._log_file])

    if logging_level is not None:
        reset_logger()
        config_logging(logging_level, file=sys.stdout)

    if nb_max_workers is None:
        nb_max_workers = _nb_max_workers

    self.nb_max_workers_io = max(int(nb_max_workers * 0.8), 2)
    self.nb_max_launch = max(self.nb_max_workers_io, 1)

    if nb_max_workers < 1:
        raise ValueError("nb_max_workers < 1")

    logger.info(f"  nb_cpus_allowed = {nb_cores}")
    logger.info(f"  nb_max_workers = {nb_max_workers}")
    logger.info(f"  nb_max_workers_io = {self.nb_max_workers_io}")

    self.queues = queues
    self.nb_max_workers = nb_max_workers
    self.nb_cores = nb_cores
    self.nb_items_lim = max(2 * nb_max_workers, 2)

    self._has_to_stop = False

    if sys.platform != "win32":

        def handler_signals(signal_number, stack):
            del stack  # unused
            print(
                f"signal {signal_number} received: set _has_to_stop to True"
            )
            self._has_to_stop = True

        # 12 is SIGUSR2 on Linux (commonly sent by cluster schedulers)
        signal.signal(12, handler_signals)
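# `MultiFile` (imported from elsewhere in the real code) tees writes to
# several file-like objects so that everything printed also lands in the
# log file. A minimal sketch of the idea (not the actual implementation):
class MultiFile:
    def __init__(self, files):
        self._files = files

    def write(self, text):
        for file in self._files:
            file.write(text)

    def flush(self):
        for file in self._files:
            file.flush()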
def view(self, path, title=None, hide_crosshair=True):
    """ImageView, a high-level widget for displaying and analyzing 2D and
    3D data.

    ImageView provides:

    1. A zoomable region (ViewBox) for displaying the image
    2. A combination histogram and gradient editor (HistogramLUTItem) for
       controlling the visual appearance of the image
    3. A timeline for selecting the currently displayed frame (for 3D
       data only)
    4. Tools for very basic analysis of image data (see ROI and Norm
       buttons)

    """
    imv = pg.ImageView()
    win = self._win(title)
    self._add_gfx_item(win, imv)
    if not isinstance(path, str):
        data = []
        for p in path:
            logger.info(f"Viewing {p}")
            data.append(imread(p).transpose())
        data = np.array(data)
        imv.setImage(data, xvals=np.linspace(0, len(path), data.shape[0]))
    elif Path(path).is_dir():
        raise ValueError("Expected files, not a directory.")
    else:
        logger.info(f"Viewing {path}")
        try:
            data = imread(path).transpose()
        except AttributeError as error:
            raise ValueError(f"Is {path} an image?") from error
        imv.setImage(data)
    vb = imv.imageItem.getViewBox()
    self._add_crosshair(win, imv, vb, hide_lines=hide_crosshair)
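# Minimal pyqtgraph usage underlying this method (a sketch; requires a
# Qt binding such as PyQt5):
import numpy as np
import pyqtgraph as pg

app = pg.mkQApp()
imv = pg.ImageView()
imv.setImage(np.random.normal(size=(100, 100)))
imv.show()
app.exec_()  # exec() with PyQt6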
def init_series(self) -> None:
    """Initialize the SeriesOfArrays object `self.series` based on input
    parameters."""
    series = self.series
    if not series:
        logger.warning("encountered empty series. No images to preprocess.")
        return

    if self.how_saving == "complete":
        index_subsets = []
        for ind_subset, subset in self.series.items():
            names_serie = subset.get_name_arrays()
            name_preproc = get_name_preproc(
                subset,
                names_serie,
                ind_subset,
                series.nb_series,
                self.params.saving.format,
            )
            if not (self.path_dir_result / name_preproc).exists():
                index_subsets.append(ind_subset)
        series.set_index_series(index_subsets)

        if logger.isEnabledFor(DEBUG):
            logger.debug(repr([subset.get_name_arrays() for subset in series]))

    nb_subsets = len(series)
    if nb_subsets == 0:
        logger.warning('topology in mode "complete" and work already done.')
        return
    elif nb_subsets == 1:
        plural = ""
    else:
        plural = "s"

    logger.info(f"Add {nb_subsets} image serie{plural} to compute.")
def compute(self, sequential=None, has_to_exit=True):
    """Compute (run all works to be done).

    Parameters
    ----------

    sequential : None

      If bool(sequential) is True, the computations are run sequentially
      (useful for debugging).

    has_to_exit : True

      If bool(has_to_exit) is True and if the computation has to stop
      because of a signal 12 (cluster), a signal 99 is sent at exit.

    """
    if hasattr(self, "path_output"):
        logger.info("path results:\n" + str(self.path_output))
        if hasattr(self, "params"):
            tmp_path_params = str(
                self.path_output
                / ("params_" + time_as_str() + f"_{os.getpid()}")
            )
            if not os.path.exists(tmp_path_params + ".xml"):
                path_params = tmp_path_params + ".xml"
            else:
                i = 1
                while os.path.exists(tmp_path_params + "_" + str(i) + ".xml"):
                    i += 1
                path_params = tmp_path_params + "_" + str(i) + ".xml"
            self.params._save_as_xml(path_params)

    self.t_start = time()
    log_memory_usage(time_as_str(2) + ": starting execution. mem usage")

    self.nb_workers_cpu = 0
    self.nb_workers_io = 0
    workers = []

    class CheckWorksThread(threading.Thread):
        cls_to_be_updated = threading.Thread

        def __init__(self):
            self.has_to_stop = False
            super().__init__()
            self.exitcode = None
            self.daemon = True

        def in_time_loop(self):
            t_tmp = time()
            # iterate over a copy since workers may be removed in the loop
            for worker in list(workers):
                if (
                    isinstance(worker, self.cls_to_be_updated)
                    and worker.fill_destination()
                ):
                    workers.remove(worker)
            t_tmp = time() - t_tmp
            if t_tmp > 0.2:
                logger.info(
                    "update list of workers with fill_destination "
                    f"done in {t_tmp:.3f} s"
                )
            sleep(dt_update)

        def run(self):
            try:
                while not self.has_to_stop:
                    self.in_time_loop()
            except Exception as error:
                print("Exception in CheckWorksThread")
                self.exitcode = 1
                self.exception = error

    class CheckWorksProcess(CheckWorksThread):
        cls_to_be_updated = Process

        def in_time_loop(self):
            # weird bug subprocessing py3
            for worker in workers:
                if not worker.really_started:
                    # print('check if worker has really started.' +
                    #       worker.key)
                    try:
                        worker.really_started = (
                            worker.comm_started.get_nowait()
                        )
                    except queue.Empty:
                        pass
                    if (
                        not worker.really_started
                        and time() - worker.t_start > 10
                    ):
                        # bug! The worker does not work. We kill it! :-)
                        logger.error(
                            cstring(
                                "Mysterious bug multiprocessing: "
                                "a launched worker has not started. "
                                "We kill it! ({}, key: {}).".format(
                                    worker.work_name, worker.key
                                ),
                                color="FAIL",
                            )
                        )
                        # the case of this worker has been dealt with
                        worker.really_started = True
                        worker.terminate()
            super().in_time_loop()

    self.thread_check_works_t = CheckWorksThread()
    self.thread_check_works_t.start()

    self.thread_check_works_p = CheckWorksProcess()
    self.thread_check_works_p.start()

    while not self._has_to_stop and (
        any(not q.is_empty() for q in self.queues) or len(workers) > 0
    ):
        # debug
        # if logger.level == 10 and all(
        #     q.is_empty() for q in self.queues
        # ) and len(workers) == 1:
        #     for worker in workers:
        #         try:
        #             is_alive = worker.is_alive()
        #         except AttributeError:
        #             is_alive = None
        #         logger.debug(
        #             str((worker, worker.key, worker.exitcode, is_alive))
        #         )
        #         if time() - worker.t_start > 60:
        #             from fluiddyn import ipydebug
        #             ipydebug()

        self.nb_workers = len(workers)

        # slow down this loop...
        sleep(dt_small)

        if self.nb_workers_cpu >= nb_max_workers:
            logger.debug(
                cstring(
                    f"The workers are saturated: "
                    f"{self.nb_workers_cpu}, sleep {dt} s",
                    color="WARNING",
                )
            )
            sleep(dt)

        for q in self.queues:
            if not q.is_empty():
                logger.debug(q)
                logger.debug("check_and_act for work: " + repr(q.work))
                try:
                    new_workers = q.check_and_act(sequential=sequential)
                except OSError:
                    logger.error(
                        cstring(
                            "Memory full: to free some memory, no more "
                            "computing job will be launched while the last "
                            "(saving) waiting queue is not empty.",
                            color="FAIL",
                        )
                    )
                    log_memory_usage(color="FAIL", mode="error")
                    self._clear_save_queue(workers, sequential)
                    logger.info(
                        cstring(
                            "The last waiting queue has been emptied.",
                            color="FAIL",
                        )
                    )
                    log_memory_usage(color="FAIL", mode="info")
                    continue

                if new_workers is not None:
                    for worker in new_workers:
                        workers.append(worker)
                logger.debug("workers: " + repr(workers))

        if self.thread_check_works_t.exitcode:
            raise self.thread_check_works_t.exception

        if self.thread_check_works_p.exitcode:
            raise self.thread_check_works_p.exception

        if len(workers) != self.nb_workers:
            gc.collect()

    if self._has_to_stop:
        logger.info(
            cstring(
                "Will exit because of signal 12. "
                "Waiting for all workers to finish...",
                color="FAIL",
            )
        )
        self._clear_save_queue(workers, sequential)

    self.thread_check_works_t.has_to_stop = True
    self.thread_check_works_p.has_to_stop = True
    self.thread_check_works_t.join()
    self.thread_check_works_p.join()

    self.print_at_exit(time() - self.t_start)
    log_memory_usage(time_as_str(2) + ": end of `compute`. mem usage")

    if self._has_to_stop and has_to_exit:
        logger.info(cstring("Exit with signal 99.", color="FAIL"))
        exit(99)

    self._reset_std_as_default()
async def async_run_work_cpu(self, work):
    """Destined to be started with ``trio.start_soon``.

    Execute the work on the first item (key, obj) of the input queue and
    put the result in work.output_queue.

    Parameters
    ----------

    work :

      A work from the topology.

    """
    self.nb_working_workers_cpu += 1
    try:
        key, obj = work.input_queue.pop_first_item()
    except KeyError:
        self.nb_working_workers_cpu -= 1
        return
    if work.check_exception(key, obj):
        self.nb_working_workers_cpu -= 1
        return
    t_start = time.time()
    log_memory_usage(
        f"{time.time() - self.t_start:.2f} s. Launch work "
        + work.name_no_space
        + f" ({key}). mem usage"
    )

    def exec_work_and_comm(func, obj, child_conn, event):
        # log_debug(f"process ({key}) started")
        event.set()
        # pylint: disable=W0703
        try:
            result = func(obj)
        except Exception as error:
            result = error
        # log_debug(f"in process, send result ({key}): {result}")
        child_conn.send(result)

    parent_conn, child_conn = Pipe()
    event = Event()

    def run_process():
        # we do this complicated thing because there may be a strange bug
        # in which a launched process never actually starts

        def start_process_and_check(index_attempt):
            process = Process(
                target=exec_work_and_comm,
                args=(work.func_or_cls, obj, child_conn, event),
            )
            process.daemon = True
            process.start()
            # check whether the process has really started (possible bug!)
            if not event.wait(1):
                log_debug(
                    f"problem: process {work.name_no_space} ({key}) "
                    f"has not really started... (attempt {index_attempt})"
                )
                process.terminate()
                return False
            return process

        really_started = False
        for index_attempt in range(10):
            process = start_process_and_check(index_attempt)
            if process:
                really_started = True
                break

        if not really_started:
            raise Exception(
                f"A process {work.name_no_space} ({key}) "
                "has not started after 10 attempts"
            )

        # todo: use parent_conn.poll to implement a timeout
        # log_debug(f"waiting for result ({key})")
        result = parent_conn.recv()
        # log_debug(f"result ({key}) received")
        process.join(10 * self.sleep_time)
        if process.exitcode != 0:
            logger.info(f"process.exitcode: {process.exitcode}")
            process.terminate()
        return result

    ret = await trio.run_sync_in_worker_thread(run_process)
    if isinstance(ret, Exception):
        self.log_exception(ret, work.name_no_space, key)
        if self.stop_if_error:
            raise ret
    else:
        logger.info(
            f"work {work.name_no_space} ({key}) "
            f"done in {time.time() - t_start:.3f} s"
        )

    if work.output_queue is not None:
        work.output_queue[key] = ret
    self.nb_working_workers_cpu -= 1