def init_workers(self, job_id):
    """Start the job on every enabled worker and build RPC proxies for them.

    The job's solution file is POSTed to ``/api/internal/job`` on each
    enabled worker in ``self.master.workers``. A worker that answers with
    HTTP 200 is recorded as active and the ``uri`` from its JSON reply is
    wrapped in an asynchronous Pyro4 proxy. Any other status code is logged
    as a warning and that worker is skipped.

    :param job_id: identifier of the job; used to locate the solution file
        and sent to the worker as the ``job_id`` form field.
    :return: ``(rpc_workers, active_workers)`` -- the asynchronous proxies
        and the worker objects they belong to, in matching order. If
        anything raises, the error is logged and ``([], [])`` is returned.
    """
    try:
        log.info('Starting workers...')
        solution_path = get_solution_path(self.job_home, job_id)
        active_workers = []
        workers_rpc_uris = []
        for worker in (w for w in self.master.workers if w.enabled):
            url = 'http://{}:{}/api/internal/job'.format(worker.ip,
                                                         worker.port)
            # Reopen the file for every worker so each upload starts at
            # offset 0, and close it as soon as the request has been sent.
            with open(solution_path, 'rb') as solution_file:
                response = requests.post(
                    url,
                    files={'solution': solution_file},
                    data={'job_id': job_id})
            if response.status_code == 200:
                workers_rpc_uris.append(response.json()['uri'])
                active_workers.append(worker)
            else:
                log.warning('Unable to run RPC on %s:%d.',
                            worker.ip, worker.port)
        log.debug('Obtained RPC urls: %s', workers_rpc_uris)

        # ``async`` is a reserved word since Python 3.7, so the attribute
        # cannot be spelled ``Pyro4.async`` there. Newer Pyro4 releases call
        # the same helper ``asyncproxy``; look it up by name so that either
        # spelling works.
        make_async = (getattr(Pyro4, 'asyncproxy', None)
                      or getattr(Pyro4, 'async'))
        rpc_workers = [make_async(Pyro4.Proxy(uri))
                       for uri in workers_rpc_uris]
        log.info('Started %d workers.', len(rpc_workers))
        return rpc_workers, active_workers
    except Exception:
        # Best effort: the caller treats an empty result as "no workers".
        # Log the traceback so the cause is not lost.
        # NOTE(review): workers that already accepted the job are not
        # reported to the caller on this path -- confirm that is intended.
        log.exception('Unable to start workers for job %s.', job_id)
        return [], []
def init_workers(self, job_id):
    """Start the job on every enabled worker and build RPC proxies for them.

    The job's solution file is POSTed to ``/api/internal/job`` on each
    enabled worker in ``self.master.workers``. A worker that answers with
    HTTP 200 is recorded as active and the ``uri`` from its JSON reply is
    wrapped in an asynchronous Pyro4 proxy. Any other status code is logged
    as a warning and that worker is skipped.

    :param job_id: identifier of the job; used to locate the solution file
        and sent to the worker as the ``job_id`` form field.
    :return: ``(rpc_workers, active_workers)`` -- the asynchronous proxies
        and the worker objects they belong to, in matching order. If
        anything raises, the error is logged and ``([], [])`` is returned.
    """
    try:
        log.info('Starting workers...')
        solution_path = get_solution_path(self.job_home, job_id)
        active_workers = []
        workers_rpc_uris = []
        for worker in (w for w in self.master.workers if w.enabled):
            url = 'http://{}:{}/api/internal/job'.format(worker.ip,
                                                         worker.port)
            # Reopen the file for every worker so each upload starts at
            # offset 0, and close it as soon as the request has been sent.
            with open(solution_path, 'rb') as solution_file:
                response = requests.post(
                    url,
                    files={'solution': solution_file},
                    data={'job_id': job_id})
            if response.status_code == 200:
                workers_rpc_uris.append(response.json()['uri'])
                active_workers.append(worker)
            else:
                log.warning('Unable to run RPC on %s:%d.',
                            worker.ip, worker.port)
        log.debug('Obtained RPC urls: %s', workers_rpc_uris)

        # ``async`` is a reserved word since Python 3.7, so the attribute
        # cannot be spelled ``Pyro4.async`` there. Newer Pyro4 releases call
        # the same helper ``asyncproxy``; look it up by name so that either
        # spelling works.
        make_async = (getattr(Pyro4, 'asyncproxy', None)
                      or getattr(Pyro4, 'async'))
        rpc_workers = [make_async(Pyro4.Proxy(uri))
                       for uri in workers_rpc_uris]
        log.info('Started %d workers.', len(rpc_workers))
        return rpc_workers, active_workers
    except Exception:
        # Best effort: the caller treats an empty result as "no workers".
        # Log the traceback so the cause is not lost.
        # NOTE(review): workers that already accepted the job are not
        # reported to the caller on this path -- confirm that is intended.
        log.exception('Unable to start workers for job %s.', job_id)
        return [], []
def run(self):
    """Process scheduled jobs forever, one at a time.

    Each iteration blocks on ``self.scheduled_jobs.get()``. A job that is
    already flagged as aborted is aborted immediately and skipped.
    Otherwise the job is started, workers are initialised, the solution
    module is loaded from disk and its ``Solver`` is run in a
    ``SolutionThread`` that is polled until it finishes. The job is then
    marked aborted, failed or finished according to the executor's state.

    Any exception raised while handling a job is logged and recorded on
    the job through ``end_job(True, message)``. The selected workers are
    always destroyed afterwards, and ``self.current_job`` and
    ``self.executor`` are reset to ``None``.
    """
    while True:
        self.current_job = self.scheduled_jobs.get()
        if self.current_job.aborted:
            self.current_job.abort_job()
            continue

        job_id = self.current_job.id
        log.info('Job %d enqueued.', job_id)
        # Stays empty unless init_workers succeeds, so the cleanup in
        # ``finally`` always has a list to work with.
        selected_workers = []
        try:
            self.current_job.start_job()
            rpc_workers, selected_workers = self.init_workers(job_id)
            if not rpc_workers:
                raise NoWorkersException()

            solution_module_path = get_solution_path(self.job_home, job_id)
            solution_module = imp.load_source(
                'solver_module_%d' % job_id, solution_module_path)
            log.info('Loaded solution from %s.', solution_module_path)

            solver = solution_module.Solver(
                rpc_workers,
                get_input_path(self.job_home, job_id),
                get_output_path(self.job_home, job_id))
            self.executor = SolutionThread(solver, job_id)
            self.executor.start()

            # Poll once per second; once the job is flagged as aborted,
            # keep asking the executor to terminate until it reports that
            # it has finished.
            # NOTE(review): the abort branch does not sleep, so terminate()
            # is re-invoked in a tight loop -- confirm this is intended.
            while not self.executor.is_finished():
                if self.current_job.aborted:
                    self.executor.terminate()
                else:
                    time.sleep(1)

            if self.executor.is_terminated():
                self.current_job.abort_job()
            elif self.executor.status == SolutionThread.FAILURE_FINISHED:
                self.current_job.end_job(True, self.executor.status_message)
            else:
                self.current_job.end_job()
        except Exception as e:
            # Keep the traceback in the log; the job itself only stores
            # the message.
            log.exception('Job %s failed.', job_id)
            self.current_job.end_job(True, str(e))
        finally:
            # Reset the per-job state even if tearing down the workers
            # raises.
            try:
                self.destroy_workers(selected_workers, job_id)
            finally:
                self.current_job = None
                self.executor = None
def run(self):
    """Process scheduled jobs forever, one at a time.

    Each iteration blocks on ``self.scheduled_jobs.get()``. A job that is
    already flagged as aborted is aborted immediately and skipped.
    Otherwise the job is started, workers are initialised, the solution
    module is loaded from disk and its ``Solver`` is run in a
    ``SolutionThread`` that is polled until it finishes. The job is then
    marked aborted, failed or finished according to the executor's state.

    Any exception raised while handling a job is logged and recorded on
    the job through ``end_job(True, message)``. The selected workers are
    always destroyed afterwards, and ``self.current_job`` and
    ``self.executor`` are reset to ``None``.
    """
    while True:
        self.current_job = self.scheduled_jobs.get()
        if self.current_job.aborted:
            self.current_job.abort_job()
            continue

        job_id = self.current_job.id
        log.info('Job %d enqueued.', job_id)
        # Stays empty unless init_workers succeeds, so the cleanup in
        # ``finally`` always has a list to work with.
        selected_workers = []
        try:
            self.current_job.start_job()
            rpc_workers, selected_workers = self.init_workers(job_id)
            if not rpc_workers:
                raise NoWorkersException()

            solution_module_path = get_solution_path(self.job_home, job_id)
            solution_module = imp.load_source(
                'solver_module_%d' % job_id, solution_module_path)
            log.info('Loaded solution from %s.', solution_module_path)

            solver = solution_module.Solver(
                rpc_workers,
                get_input_path(self.job_home, job_id),
                get_output_path(self.job_home, job_id))
            self.executor = SolutionThread(solver, job_id)
            self.executor.start()

            # Poll once per second; once the job is flagged as aborted,
            # keep asking the executor to terminate until it reports that
            # it has finished.
            # NOTE(review): the abort branch does not sleep, so terminate()
            # is re-invoked in a tight loop -- confirm this is intended.
            while not self.executor.is_finished():
                if self.current_job.aborted:
                    self.executor.terminate()
                else:
                    time.sleep(1)

            if self.executor.is_terminated():
                self.current_job.abort_job()
            elif self.executor.status == SolutionThread.FAILURE_FINISHED:
                self.current_job.end_job(True, self.executor.status_message)
            else:
                self.current_job.end_job()
        except Exception as e:
            # Keep the traceback in the log; the job itself only stores
            # the message.
            log.exception('Job %s failed.', job_id)
            self.current_job.end_job(True, str(e))
        finally:
            # Reset the per-job state even if tearing down the workers
            # raises.
            try:
                self.destroy_workers(selected_workers, job_id)
            finally:
                self.current_job = None
                self.executor = None