def __init__(self, settings, summary_freq=4):
    """Build an orchestrator bound to the given simulator settings.

    Parameters
    ----------
    settings : Settings
        The settings of the simulator
    summary_freq : int
        Number of experiments between two consecutive summary messages
    """
    # Configuration handed in by the caller
    self.settings = settings

    # Result aggregation and experiment numbering
    self.results = ResultSet()
    self.seq = SequenceNumber()

    # Sliding window holding the durations of the 30 most recent experiments
    self.exp_durations = collections.deque(maxlen=30)

    # Outcome counters
    self.n_success = self.n_fail = 0

    self.summary_freq = summary_freq

    # Flag raised by stop()
    self._stop = False

    # A worker pool is only created when parallel execution is enabled
    run_in_parallel = self.settings.PARALLEL_EXECUTION
    if run_in_parallel:
        self.pool = mp.Pool(processes=settings.N_PROCESSES)
class Orchestrator(object):
    """Orchestrator.

    It is responsible for orchestrating the execution of all experiments and
    aggregating their results.
    """

    def __init__(self, settings, summary_freq=4):
        """Constructor

        Parameters
        ----------
        settings : Settings
            The settings of the simulator
        summary_freq : int
            Frequency (in number of experiments) at which summary messages
            are displayed. A value of 0 disables summary messages.
        """
        self.settings = settings
        self.results = ResultSet()
        self.seq = SequenceNumber()
        # Only the durations of the 30 most recent experiments are retained;
        # they are used to estimate the ETA in experiment_callback()
        self.exp_durations = collections.deque(maxlen=30)
        self.n_success = 0
        self.n_fail = 0
        self.summary_freq = summary_freq
        self._stop = False
        # Both values are recomputed by run(). They are initialised here so
        # that experiment_callback() never hits a missing attribute.
        self.n_exp = 0
        self.n_proc = 1
        if self.settings.PARALLEL_EXECUTION:
            self.pool = mp.Pool(settings.N_PROCESSES)

    def stop(self):
        """Stop the execution of the orchestrator

        Raises the stop flag observed by run() and, if parallel execution is
        enabled, terminates and joins the worker pool.
        """
        logger.info('Orchestrator is stopping')
        self._stop = True
        if self.settings.PARALLEL_EXECUTION:
            self.pool.terminate()
            self.pool.join()

    def run(self):
        """Run the orchestrator.

        This call is blocking, whether multiple processes are used or not.
        This method returns only after all experiments are executed, or
        after a stop has been requested via stop().
        """
        # Create queue of experiment configurations
        queue = collections.deque(self.settings.EXPERIMENT_QUEUE)
        # Calculate number of experiments and number of processes
        self.n_exp = len(queue) * self.settings.N_REPLICATIONS
        self.n_proc = self.settings.N_PROCESSES \
            if self.settings.PARALLEL_EXECUTION \
            else 1
        logger.info('Starting simulations: %d experiments, %d process(es)',
                    self.n_exp, self.n_proc)

        if self.settings.PARALLEL_EXECUTION:
            self._run_parallel(queue)
        else:
            self._run_sequential(queue)

        logger.info('END | Planned: %d, Completed: %d, Succeeded: %d, Failed: %d',
                    self.n_exp, self.n_fail + self.n_success,
                    self.n_success, self.n_fail)

    def _run_parallel(self, queue):
        """Schedule every experiment on the pool and wait for completion.

        Parameters
        ----------
        queue : collections.deque
            Experiment configurations still to be scheduled
        """
        # This job queue is used only to keep track of which jobs have
        # finished and which are still running. Currently this information
        # is used only to handle keyboard interrupts correctly
        job_queue = collections.deque()
        # Schedule experiments from the queue
        while queue:
            experiment = queue.popleft()
            for _ in range(self.settings.N_REPLICATIONS):
                job_queue.append(self.pool.apply_async(
                    run_scenario,
                    args=(self.settings, experiment,
                          self.seq.assign(), self.n_exp),
                    callback=self.experiment_callback))
        self.pool.close()
        # This solution is probably not optimal, but at least makes
        # KeyboardInterrupt work fine, which is crucial if launching the
        # simulation remotely via screen.
        # What happens here is that we keep waiting for possible
        # KeyboardInterrupts till the last process terminates successfully.
        # We may have to wait up to 5 seconds after the last process
        # terminates before exiting, which is really negligible
        try:
            while job_queue:
                job = job_queue.popleft()
                while not job.ready():
                    # Once stop() has terminated the pool, pending jobs never
                    # become ready: bail out instead of polling forever.
                    if self._stop:
                        return
                    time.sleep(5)
            # All jobs are done and the pool is closed: reap the workers
            self.pool.join()
        except KeyboardInterrupt:
            self.pool.terminate()
            self.pool.join()

    def _run_sequential(self, queue):
        """Execute every experiment one after the other in this process.

        Parameters
        ----------
        queue : collections.deque
            Experiment configurations still to be executed
        """
        while queue:
            experiment = queue.popleft()
            for _ in range(self.settings.N_REPLICATIONS):
                self.experiment_callback(run_scenario(self.settings,
                                                      experiment,
                                                      self.seq.assign(),
                                                      self.n_exp))
                # A stop was requested while the experiment was running:
                # do not start any further experiment.
                if self._stop:
                    return

    def experiment_callback(self, args):
        """Callback method called by run_scenario

        Updates the success/failure counters, stores the results and
        periodically logs a progress summary with an estimated time to
        completion.

        Parameters
        ----------
        args : tuple
            Tuple of arguments (params, results, duration), or a false value
            if the experiment failed
        """
        # If args is None, that means that an exception was raised during the
        # execution of the experiment. In such case, ignore it
        if not args:
            self.n_fail += 1
            return
        # Extract parameters
        params, results, duration = args
        self.n_success += 1
        # Store results
        self.results.add(params, results)
        self.exp_durations.append(duration)
        # A summary frequency of 0 disables summaries; without this guard the
        # modulo below would raise ZeroDivisionError
        if self.summary_freq and self.n_success % self.summary_freq == 0:
            # Number of experiments scheduled to be executed
            n_scheduled = self.n_exp - (self.n_fail + self.n_success)
            # Compute ETA
            n_cores = min(mp.cpu_count(), self.n_proc)
            mean_duration = sum(self.exp_durations) / len(self.exp_durations)
            eta = timestr(n_scheduled * mean_duration / n_cores, False)
            # Print summary
            logger.info('SUMMARY | Completed: %d, Failed: %d, Scheduled: %d, ETA: %s',
                        self.n_success, self.n_fail, n_scheduled, eta)