def work(self, *args, **kwargs):
    try:
        # FIXME: Disable this because it locks the database:
        with core.api_util.DatabaseContext.using_conn():
            for machine in models.provider_config.list_machines(self._provider.name):
                if self.should_stop():
                    return
                cost = models.provider_config.get_machine_provider_cost(self._provider.name,
                                                                        machine['machine_code'])
                cost = core.api_util.price_to_float(int(cost['cost_per_sec']) * 3600)
                prices = async_util.run_proc(SpotIndexUpdater.load_prices, self._provider.name,
                                             machine['machine_code'])
                if not prices:
                    log.error("No price history found for machine " + str(machine['machine_code']))
                    continue
                price_variance = compute_variance(prices)
                price_max = max(prices)
                index = max(0, ((price_max - abs(price_variance)) / price_max)
                               - (max(0, cost - price_max) / cost) ** 10)
                models.provider_config.set_spot_index(self._provider.name,
                                                      machine['machine_code'], index)
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        if self.should_stop():
            return
        conf = core.api_util.get_conf()
        currency_api_url = conf.get("currency", "currency_api_url")
        if type_util.is_array(currency_api_url):
            log.warning("This should not be a list: " + repr(currency_api_url))
            currency_api_url = "".join(currency_api_url)
        currency_api_token = conf.get("currency", "currency_api_token")
        aws_pricing_api = conf.get("general", "provider_pricing_api")
        # FIXME: Disable this because it locks the database:
        with core.api_util.DatabaseContext.using_conn():
            models.currencies.update_currency_exchange_rates(currency_api_url, currency_api_token)
            if self.should_stop():
                return
            for provider in core.api_util.get_all_providers():
                if self.should_stop():
                    return
                models.provider_config.update_provider_costs(aws_pricing_api, provider.name)
            models.provider_config.update_machine_prices()
    except StandardError as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        core.api_util.wait_for_redis()
        with core.api_util.RedisContext.using_pubsub_conn() as redis_conn:
            channel = core.api_util.RedisContext.get_channel("launcher")
            for message in redis_util.listen_pubsub(redis_conn, channel,
                                                    datetime.timedelta(seconds=1)):
                if self.should_stop():
                    return
                if message is None:
                    continue
                if not re.match(r"^.*_\d+$", message['data']):
                    log.warning(message['data'] + ' format is invalid')
                    continue
                task, jobid_str = message['data'].rsplit('_', 1)
                self._msg_queue.put({
                    'type': REDIS_MESSAGE,
                    "data": {'task': task, 'jobid': int(jobid_str)}
                })
    except StandardError as e:
        error_util.log_error(log, e)
def clean(self):
    if not self._shared_volume:
        return
    cmd = ['docker', 'volume', 'rm', self._shared_volume]
    try:
        subprocess.check_output(cmd, cwd=API_PATH, stderr=subprocess.PIPE)
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        if self._running_jobs.get_update_date() <= datetime.datetime.utcfromtimestamp(0):
            return
        running_jobs = []
        if self._last_date < self._running_jobs.get_update_date():
            self._last_date = self._running_jobs.get_update_date()
            running_jobs = self._running_jobs.get_list()
        for job_id in running_jobs:
            if self.should_stop():
                return
            self._date_by_job_id[int(job_id)] = self._last_date
        if GC_DEBUG_MODE:
            if datetime.datetime.utcnow() - self._last_date > datetime.timedelta(hours=1):
                log.warning("GC: Should not happen:\n" +
                            "\tlast_date: " + str(self._last_date) + "\n" +
                            "\tnow: " + str(datetime.datetime.utcnow()) + "\n" +
                            "\trunning jobs update date: " +
                            str(self._running_jobs.get_update_date()))
                return
        unfinished_jobs = models.jobs.list_unfinished_jobs()
        unfinished_job_ids = set([int(job['id']) for job in unfinished_jobs])
        for job_id in unfinished_job_ids:
            if self.should_stop():
                return
            if job_id not in self._date_by_job_id.keys():
                self._date_by_job_id[job_id] = datetime.datetime.utcnow()
        for job_id in unfinished_job_ids:
            if self.should_stop():
                return
            date = self._date_by_job_id[job_id]
            if datetime.datetime.utcnow() - date > datetime.timedelta(hours=2):
                if GC_DEBUG_MODE:
                    if job_id in self._deleted_jobs:
                        continue
                    self._deleted_jobs.add(job_id)
                    gc_debug("GC: Cleaning job " + str(job_id) +
                             " because no worker is working on it.\n" +
                             "Details: \n date:" + str(date) +
                             "\nlast_date:" + str(self._last_date) +
                             "\nrunning_jobs: " + repr(running_jobs) +
                             "\nnow: " + str(datetime.datetime.utcnow()))
                else:
                    log.warning("GC: Cleaning job " + str(job_id) +
                                " because no worker is working on it")
                    models.jobs.cancel_job(job_id)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    # Check whether files on the storage are still referenced; remove old unreferenced files
    try:
        a_week_ago = datetime.datetime.utcnow() - datetime.timedelta(days=7)
        files = async_util.run_proc(StorageCollector._list_files, self._storage.name)
        for filename in files:
            if self.should_stop():
                return
            if models.projects.file_exists(filename):
                continue
            try:
                creation_date = async_util.run_proc(StorageCollector._get_file_creation_date,
                                                    self._storage.name, filename)
            except core.storages.FileMissingError:
                continue  # The file has already been deleted
            if creation_date > a_week_ago:
                continue
            if GC_DEBUG_MODE:
                if filename in self._dirty_files:
                    continue
                self._dirty_files.add(filename)
                gc_debug("Removing old file " + filename + " on storage " + self._storage.name)
            else:
                log.warning("GC: Removing old file " + filename + " on storage " +
                            self._storage.name)
                async_util.run_proc(StorageCollector._delete_file, self._storage.name, filename)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)

    # Check for files referenced in the database but missing on the storage
    try:
        files = models.projects.list_files_on_storage(self._storage.name)
        for file_info in files:
            filename = file_info['filename']
            if async_util.run_proc(StorageCollector._file_exists, self._storage.name, filename):
                continue
            if GC_DEBUG_MODE:
                if filename in self._missing_files:
                    continue
                self._missing_files.add(filename)
                gc_debug("Missing file " + filename + " on storage " + self._storage.name)
            else:
                log.error("GC: Missing file " + filename + " on storage " + self._storage.name)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        if self.should_stop():
            return
        now = datetime.datetime.utcnow()
        last_charge_limit = now - datetime.timedelta(minutes=1)
        # FIXME: Disable this because it locks the database:
        with core.api_util.DatabaseContext.using_conn():
            models.projects.charge_all(last_charge_limit)
            models.meshes.charge_all(last_charge_limit)
            models.calc.charge_all(last_charge_limit)
    except StandardError as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        time.sleep(5)  # Don't start immediately
        evt_handler = SrcCodeEventHandler(self._msg_queue)
        observer = watchdog.observers.Observer()
        observer.schedule(evt_handler, os.path.join(API_PATH, "app"), recursive=True)
        observer.start()
        self.wait_for_stop()
        observer.stop()
        observer.join()
    except StandardError as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        to_delete = []
        for filename in os.listdir(self._uploaded_file_dir):
            if self.should_stop():
                return
            if filename in ("/", "", ".", ".."):
                continue
            file_path = os.path.abspath(os.path.join(self._uploaded_file_dir, filename))
            creation_date = datetime.datetime.utcfromtimestamp(os.path.getmtime(file_path))
            if creation_date < yesterday:
                to_delete.append(file_path)
        for file_path in to_delete:
            if self.should_stop():
                return
            if GC_DEBUG_MODE:
                try:
                    if file_path in self._dirty_files:
                        continue
                    self._dirty_files.add(file_path)
                    gc_debug("GC: Removing old temp file: " + str(file_path))
                except error_util.abort_errors:
                    raise
                except error_util.all_errors as e:
                    log.error("Unable to remove old uploaded dir " + repr(file_path))
                    error_util.log_error(log, e)
            else:
                if os.path.isdir(file_path):
                    try:
                        log.warning("GC: Removing old temp folder: " + str(file_path))
                        shutil.rmtree(file_path)
                    except error_util.abort_errors:
                        raise
                    except error_util.all_errors as e:
                        log.error("Unable to remove old uploaded dir " + repr(file_path))
                        error_util.log_error(log, e)
                else:
                    try:
                        log.warning("GC: Removing old temp file: " + str(file_path))
                        os.remove(file_path)
                    except error_util.abort_errors:
                        raise
                    except error_util.all_errors as e:
                        log.error("Unable to remove old uploaded file " + repr(file_path))
                        error_util.log_error(log, e)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    if self._running_jobs.get_update_date() <= datetime.datetime.utcfromtimestamp(0):
        return
    if self._running_workers.get_update_date() <= datetime.datetime.utcfromtimestamp(0):
        return
    try:
        artefacts = self._fetch_artefacts()
        if self.should_stop():
            return
        job_ids = [int(job_id) for job_id in self._running_jobs.get_list()]
        worker_ids = [w.worker_id for w in self._running_workers.get_list()]
        for artefact in artefacts:
            if self.should_stop():
                return
            try:
                if artefact.job_id is not None:
                    if artefact.job_id in job_ids:
                        continue
                elif artefact.worker_id is not None:
                    if artefact.worker_id in worker_ids:
                        continue
                else:
                    log.warning("Bad artefact: no job or worker specified: " + str(artefact))
                    continue
                if GC_DEBUG_MODE:
                    if artefact in self._dirty_artefacts:
                        continue
                    self._dirty_artefacts.add(artefact)
                    gc_debug("GC: Cleaning " + str(artefact) + " because no worker uses it anymore")
                else:
                    log.warning("GC: Cleaning " + str(artefact) +
                                " because no worker uses it anymore")
                    self._provider.delete_artefact(artefact)
            except StandardError as e:
                error_util.log_error(log, e)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def try_append_file_to_project(user_id, project_codename, file_path, filename=None, key=None):
    generated = append_file_to_project(user_id, project_codename, file_path, filename, key)
    try:
        yield generated
    except error_util.all_errors:
        with error_util.before_raising():
            try:
                remove_file_from_project(user_id, project_codename, generated['id'])
            except error_util.all_errors as e:
                log.error("Unable to remove file " + repr(generated))
                error_util.log_error(log, e)
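# Hedged usage sketch (not part of the original module): assuming try_append_file_to_project
# is exposed as a context manager (e.g. wrapped with contextlib.contextmanager), a caller can
# rely on it to roll the upload back when the body fails. The user id, project codename, file
# path and the attach_file() helper below are hypothetical examples.
#
#     with try_append_file_to_project(42, "my_project_uuid", "/tmp/mesh.zip") as new_file:
#         models.meshes.attach_file(42, "my_project_uuid", new_file['id'])  # hypothetical helper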
def run_pending_jobs(api_name, server_name, log_level, log_output):
    """
    Run all pending jobs.
    It is useful at script startup or when the redis server is down.

    :param api_name:        The name of the current API
    :type api_name:         str
    :param server_name:     The name of the current server
    :type server_name:      str
    :param log_level:       The level of log we want
    :type log_level:        int
    :param log_output:      Where we should output the logs. Should be "stdout", "stderr",
                            "syslog" or a file
    :type log_output:       str
    """
    task_list = models.jobs.list_tasks()
    for task in task_list:
        try:
            run_task(api_name, server_name, task['task'], task['job_id'], log_level, log_output)
        except StandardError as e:
            error_util.log_error(log, e)
def init_process(fork, api_name, job_id, toolchain):
    if fork:
        try:
            in_child, parent_pid, child_pid = proc_util.double_fork()
        except exceptions.OSError as e:
            log.error("Unable to run command, fork failed, cause:")
            error_util.log_error(log, e)
            return False
        if not in_child:
            return False  # Nothing to do in the parent, we continue the main loop
    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)
    try:
        setproctitle.setproctitle(api_name + " " + toolchain + " job " + str(job_id))
    except StandardError as e:
        log.warning(str(e))
    return True
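# Hedged usage sketch (assumption, not from the original source): a job runner entry point
# would typically call init_process() first, let the parent return to its main loop, and have
# only the (optionally forked) child continue with the toolchain. The argument values below
# are hypothetical.
#
#     if init_process(fork=True, api_name="my_api", job_id=42, toolchain=models.jobs.TASK_MESH):
#         run_toolchain("my_api", "worker01.example.com", 42, models.jobs.TASK_MESH)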
def handle_error(e):
    code = 500
    msg = str(e) if app.debug else "Internal Error"
    if isinstance(e, HTTPException):
        code = int(e.code)
        msg = e.name
        warning_msg = e.name
        if e.description and app.debug:
            msg = e.name + ": " + e.description
            if len(e.description) < 80:
                warning_msg += ": " + e.description
        if code != 404:
            log.warning(warning_msg)
            error_util.log_error(log, e)
    else:
        error_util.log_error(log, e)
    if code == 404 and request.path.strip("/").split("/")[0] in ("v1", "admin"):
        response = jsonify({"success": 0, "error_msgs": [msg], "data": None})
    else:
        response = "<html><body><h1>" + escape(msg) + "</h1></body></html>"
    return response, code
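# Hedged registration sketch (assumption): with Flask, a handler like handle_error is usually
# registered on the application object so it catches both HTTPException and unexpected errors.
# The exact registration used by this project is not shown here.
#
#     app.register_error_handler(Exception, handle_error)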
def run_server(api_name, server_name, redis_host="localhost", redis_port=6379, data_db=0,
               pubsub_db=1, auto_reload=False, log_level=logging.INFO, log_output="syslog",
               pid_file=None):
    """
    Main loop function. It listens to pending jobs and runs them.
    It burns the recurring prices for files saved on storages.
    It also runs a garbage collector to ensure killed jobs or faulty workers are cleaned.

    :param api_name:        The name of the current API
    :type api_name:         str
    :param server_name:     The fqdn of the current server
    :type server_name:      str
    :param data_db:         The redis database for data. Optional, default 0
    :type data_db:          int
    :param pubsub_db:       The redis database for pubsub events. Optional, default 1
    :type pubsub_db:        int
    :param redis_host:      The redis server to connect to. Optional, default "localhost"
    :type redis_host:       str
    :param redis_port:      The redis server port. Optional, default 6379
    :type redis_port:       int
    :param auto_reload:     Do we want the server to restart if the source code changed?
                            Optional, default False
    :type auto_reload:      bool
    :param log_level:       The level of log we want
    :type log_level:        int
    :param log_output:      Where we should output the logs. Should be "stdout", "stderr",
                            "syslog" or a file
    :type log_output:       str
    :param pid_file:        The pid file to create if any, None otherwise
    :type pid_file:         str|None
    """
    # Write the pid file
    if pid_file:
        pid_file = os.path.join("/var", "run", api_name, api_name + ".pid")
        with open(pid_file, "w") as fh:
            fh.write(str(os.getpid()) + "\n")

    log.info("Starting " + api_name + " server")
    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host, redis_port,
                                          data_db, pubsub_db)
    core.api_util.wait_for_postgres()
    queue = async_util.create_thread_queue()
    running_threads = []

    redis_thread = RedisReceiver(queue, api_name, server_name)
    running_threads.append(redis_thread)
    redis_thread.start()

    def reload_signal_handler(*args):
        queue.put({'type': SRC_CHANGED_MESSAGE, "data": None})

    signal.signal(signal.SIGUSR1, reload_signal_handler)

    burner_thread = PriceBurner()
    running_threads.append(burner_thread)
    burner_thread.start()

    price_updater_thread = PriceUpdater()
    running_threads.append(price_updater_thread)
    price_updater_thread.start()

    for provider in core.api_util.get_all_providers():
        if provider.type == "aws_spot":
            spot_thread = SpotIndexUpdater(provider)
            running_threads.append(spot_thread)
            spot_thread.start()

    if auto_reload:
        change_thread = SourceChangeNotifier(queue)
        running_threads.append(change_thread)
        change_thread.start()
        debug_util.register_for_debug()
    else:
        def ignore_signal(*args):
            log.info("Server not in debug mode, ignoring signal...")

        signal.signal(signal.SIGUSR2, ignore_signal)

    if GC_ENABLED:
        cmd = ["python", os.path.join(API_PATH, "app", "garbage_collector.py"),
               "--log-level", logging.getLevelName(log_level),
               "--log-output", log_output]
        if auto_reload:
            cmd.append("--debug")
        gc = subprocess.Popen(cmd)
    else:
        gc = None

    # Run all pending jobs we may have missed during a redis shutdown or a server.py shutdown
    run_pending_jobs(api_name, server_name, log_level, log_output)

    should_restart = False
    abort_exception = None
    while True:
        try:
            # Wait for events
            try:
                event = queue.get(block=True, timeout=60)
            except async_util.QueueEmpty:
                # No events during 1 min, perhaps redis is dead, so we check pending tasks
                run_pending_jobs(api_name, server_name, log_level, log_output)
                continue

            # Running
            msg_type = event['type']
            msg_data = event['data']
            if msg_type == REDIS_MESSAGE:
                run_task(api_name, server_name, msg_data['task'], msg_data['jobid'],
                         log_level, log_output)
            elif msg_type == SRC_CHANGED_MESSAGE:
                should_restart = True
                break
            else:
                log.error("Unknown message received: " + str(msg_type))
        except error_util.abort_errors as e:
            abort_exception = e
            break
        except StandardError as e:
            error_util.log_error(log, e)

    try:
        log.info("Exit confirmed, cleaning...")
        for thread in running_threads:
            thread.stop()
        if gc:
            proc_util.ensure_stop_proc(gc)
        for thread in running_threads:
            thread.join()
    finally:
        if pid_file:
            try:
                os.remove(pid_file)
            except StandardError:
                pass
    log.info("Everything is cleaned")
    if abort_exception:
        raise abort_exception
    elif should_restart:
        restart_server()
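# Hedged invocation sketch (assumption): a launcher script could start the main loop like
# this; the API name, fqdn and pid file path below are hypothetical examples.
#
#     run_server("my_api", socket.getfqdn(), redis_host="localhost", redis_port=6379,
#                auto_reload=False, log_level=logging.INFO, log_output="syslog",
#                pid_file="/var/run/my_api/my_api.pid")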
def work(self, *args, **kwargs):
    try:
        # Clean projects
        dirty_projects = models.projects.list_failed_and_dirty()
        if self.should_stop():
            return
        for project in dirty_projects:
            if self.should_stop():
                return
            if GC_DEBUG_MODE:
                if project['uid'] in self._dirty_projects:
                    continue
                self._dirty_projects.add(project['uid'])
                gc_debug("GC: Setting project as not analyzed " + str(project['uid']) +
                         " because no analysis succeeded")
            else:
                log.warning("GC: Setting project as not analyzed " + str(project['uid']) +
                            " because no analysis succeeded")
                models.projects.set_project_status(project['user_id'], project['uid'],
                                                   models.projects.PROJECT_STATUS_RAW)

        # Clean meshes
        if self.should_stop():
            return
        dirty_meshes = models.meshes.list_failed_and_dirty()
        if self.should_stop():
            return
        for mesh in dirty_meshes:
            if self.should_stop():
                return
            if GC_DEBUG_MODE:
                if mesh['id'] in self._dirty_meshes:
                    continue
                self._dirty_meshes.add(mesh['id'])
                gc_debug("GC: Setting mesh as failed " + str(mesh['id']) +
                         " because mesh job failed")
            else:
                log.warning("GC: Setting mesh as failed " + str(mesh['id']) +
                            " because mesh job failed")
                models.meshes.set_mesh_status(mesh['user_id'], mesh['project_uid_id'],
                                              mesh['name'], models.meshes.STATUS_KILLED)

        # Clean calculations
        if self.should_stop():
            return
        dirty_calculations = models.calc.list_failed_and_dirty()
        if self.should_stop():
            return
        for calc in dirty_calculations:
            if self.should_stop():
                return
            if GC_DEBUG_MODE:
                if calc['id'] in self._dirty_calcs:
                    continue
                self._dirty_calcs.add(calc['id'])
                gc_debug("GC: Setting calc as failed " + str(calc['id']) +
                         " because calc job failed")
            else:
                log.warning("GC: Setting calc as failed " + str(calc['id']) +
                            " because calc job failed")
                models.calc.set_calc_status(calc['user_id'], calc['project_uid_id'],
                                            calc['name'], models.calc.STATUS_KILLED)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def work(self, *args, **kwargs):
    try:
        now = datetime.datetime.utcnow()

        # Fetching data about running workers
        thread_list = []
        workers = async_util.run_proc(WorkerCollector._list_workers, self._provider.name)
        self._running_workers.set_list(workers, now)
        for worker in workers:
            if self.should_stop():
                return
            if worker.worker_id not in self._observers.keys():
                self._observers[worker.worker_id] = WorkerObserver(worker, self._provider,
                                                                   self._api_name,
                                                                   self._server_name,
                                                                   WORKER_PROCESS_LAUNCHER)
            thread = threading.Thread(target=WorkerCollector._update_worker,
                                      args=(self._observers[worker.worker_id], worker))
            thread.start()
            thread_list.append(thread)
        for thread in thread_list:
            if self.should_stop():
                return
            thread.join()

        # Listing active jobs, and grouping clusters
        job_list = []
        cluster_list = ClusterList()
        for worker in workers:
            if self.should_stop():
                return
            observer = self._observers[worker.worker_id]
            if observer.jobid and observer.status not in (Worker.Status.SHUTTING_DOWN,
                                                          Worker.Status.TERMINATED):
                job_list.append(observer.jobid)
            if observer.is_cluster_master():
                cluster_list.append_master(observer)
            elif observer.is_cluster_slave():
                cluster_list.append_slave(observer)
        self._running_jobs.set_list(job_list, now)

        for worker in workers:
            if self.should_stop():
                return
            observer = self._observers[worker.worker_id]
            sentence = judge_worker(observer, self._rules, cluster_list, now)
            if sentence is None:
                continue
            if sentence.penalty == Penalty.DEATH:
                msg = WorkerCollector.EMAIL_MSG % (observer, sentence.description,
                                                   "KILLING INSTANCE !!!", observer.description)
                if GC_DEBUG_MODE:
                    if worker.worker_id in self._dirty_kill_workers:
                        continue
                    self._dirty_kill_workers.add(worker.worker_id)
                    gc_debug(msg)
                else:
                    log.warning("killing instance %s: %s" % (observer, sentence.description))
                    core.api_util.send_admin_email("Watchdog KILL", msg)
                    async_util.run_proc(WorkerCollector._kill_worker, self._provider.name, worker)
                    observer.mark_as_killed()
            elif sentence.penalty == Penalty.PROBATION:
                msg = WorkerCollector.EMAIL_MSG % (observer, sentence.description, "",
                                                   observer.description)
                if GC_DEBUG_MODE:
                    if worker.worker_id in self._dirty_warning_workers:
                        continue
                    self._dirty_warning_workers.add(worker.worker_id)
                    gc_debug(msg)
                else:
                    log.warning("Strange behaviour for instance %s: %s" %
                                (observer, sentence.description))
                    core.api_util.send_admin_email("Watchdog warning", msg)
    except error_util.abort_errors:
        raise
    except error_util.all_errors as e:
        error_util.log_error(log, e)
def run_toolchain(api_name, server_name, job_id, toolchain):
    """
    Dequeue and run a toolchain for a given job, updating the job status along the way.
    This function is meant to run in a dedicated process: it exits the process once the
    toolchain is finished.

    :param api_name:        The name of the current API
    :type api_name:         str
    :param server_name:     The fqdn of the current server
    :type server_name:      str
    :param job_id:          The id of the job the task is related to
    :type job_id:           int
    :param toolchain:       The toolchain to launch
    :type toolchain:        str
    """
    # FIXME: Disable this because it locks the database:
    with core.api_util.DatabaseContext.using_conn():
        try:
            task = models.jobs.get_task_info(job_id)
            if not task:
                log.info("Job " + str(job_id) + " is already running, skipping")
                return
            models.jobs.dequeue_task(job_id)
            job = models.jobs.get_job(job_id)
            if not job:
                log.error("Unknown job " + str(job_id))
                return
            time.sleep(0.2)
            if int(job['status']) != models.jobs.JOB_STATUS_PENDING:
                log.info("Job " + str(job_id) + " already launched, skipping")
                return
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_LAUNCHING)
            with core.api_util.RedisContext.using_data_conn() as r:
                r.set(api_name + ":" + server_name + ":job-" + str(job_id) + "-pid",
                      int(os.getpid()))
            if toolchain != task['task']:
                log.error("Bad task toolchain")
                return
            if toolchain == models.jobs.TASK_UPLOAD_AND_ANALYSE:
                commands.upload_and_analyze.run(api_name, server_name, job_id, **task["params"])
            elif toolchain == models.jobs.TASK_UPLOAD_AND_LINK:
                commands.upload_and_link.run(api_name, server_name, job_id, **task["params"])
            elif toolchain == models.jobs.TASK_MESH:
                commands.mesh.run(api_name, server_name, job_id, **task["params"])
            elif toolchain == models.jobs.TASK_CALC:
                commands.calc.run(api_name, server_name, job_id, **task["params"])
            elif toolchain == models.jobs.TASK_RESTART_CALC:
                commands.restart_calc.run(api_name, server_name, job_id, **task["params"])
            else:
                log.error("Task not implemented: " + str(toolchain))
                return
            log.info("Command successfully finished")
        except core.api_util.abort_errors:
            log.warning("Operation canceled")
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_CANCELED)
        except core.api_util.ToolchainError as e:
            log.error(str(e))
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
        except error_util.all_errors as e:
            error_util.log_error(log, e)
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
        else:
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_FINISHED)
        finally:
            with core.api_util.RedisContext.using_data_conn() as redis_conn:
                redis_conn.delete(api_name + ":" + server_name + ":job-" + str(job_id) + "-pid")
            sys.exit(0)
def run_task(api_name, server_name, task_order, job_id, log_level, log_output):
    """
    Execute a task, which could be cancelling a running toolchain or launching a specific
    toolchain. It launches a separate subprocess and returns before the task is completed.

    :param api_name:        The name of the current API
    :type api_name:         str
    :param server_name:     The fqdn of the current server
    :type server_name:      str
    :param task_order:      The task to do (launch or cancel)
    :type task_order:       int
    :param job_id:          The id of the job the task is related to
    :type job_id:           int
    :param log_level:       The level of log we want
    :type log_level:        int
    :param log_output:      Where we should output the logs. Should be "stdout", "stderr",
                            "syslog" or a file
    :type log_output:       str
    """
    if task_order == models.jobs.TASK_CANCEL:
        models.jobs.dequeue_task(job_id)
        proc = multiprocessing.Process(target=cancel_job, args=(api_name, server_name, job_id))
        proc.daemon = True
        proc.start()
    elif task_order in [models.jobs.TASK_UPLOAD_AND_ANALYSE, models.jobs.TASK_UPLOAD_AND_LINK,
                        models.jobs.TASK_MESH, models.jobs.TASK_CALC,
                        models.jobs.TASK_RESTART_CALC]:
        try:
            # The task will be dequeued by the subprocess to get the task parameters
            subprocess.check_call(["python", os.path.join(API_PATH, "app", "run_job.py"),
                                   "--fork",
                                   "--log-level", logging.getLevelName(log_level),
                                   "--log-output", log_output,
                                   "--redis-host", core.api_util.RedisContext.get_host(),
                                   "--redis-port", str(core.api_util.RedisContext.get_port()),
                                   "--redis-data-db", str(core.api_util.RedisContext.get_data_db()),
                                   "--redis-pubsub-db", str(core.api_util.RedisContext.get_pubsub_db()),
                                   str(job_id), str(task_order)])
        except core.api_util.abort_errors:
            models.jobs.dequeue_task(job_id)
            log.warning("Operation canceled")
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_CANCELED)
        except core.api_util.ToolchainError as e:
            models.jobs.dequeue_task(job_id)
            log.error(str(e))
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
        except error_util.all_errors as e:
            models.jobs.dequeue_task(job_id)
            error_util.log_error(log, e)
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
    else:
        models.jobs.dequeue_task(job_id)
        log.error("Task not implemented: " + str(task_order))
        return
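# Hedged usage sketch (assumption): run_task() is what the redis listener and run_pending_jobs()
# call for each queued task; a direct call would look like this, with a hypothetical server name
# and job id.
#
#     run_task("my_api", "worker01.example.com", models.jobs.TASK_CALC, 1337,
#              logging.INFO, "syslog")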
def fetch_progress(conn, user_id, project_codename, calc_name, calc_id, storage, tmp_folder):
    """
    Check if a progress file has been created on the main worker and save it if it exists.

    :param conn:                The ssh connection to the main worker
    :type conn:                 ssh.SshConnection
    :param user_id:             The id of the job owner
    :type user_id:              int
    :param project_codename:    The project uuid
    :type project_codename:     str
    :param calc_name:           The name of the calculation
    :type calc_name:            str
    :param calc_id:             The id of the calculation
    :type calc_id:              int
    :param storage:             The storage of the project
    :type storage:              core.ssh.Storage
    :param tmp_folder:          A temporary folder to use
    :type tmp_folder:           str
    :return:                    True on success, False if no file is found or a failure happens
    :rtype:                     bool
    """
    status_file_name = project_codename + "_calc_" + calc_name + "_status.zip"
    status_file = cmd_util.ResultFile(project_codename, status_file_name)
    old_status_file = None
    try:
        calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS", "PROJECTS_CFD",
                                  project_codename, "CALC")
        if not conn.folder_exists(calc_dir):
            log.debug("calc folder " + calc_dir + " doesn't exist yet, skipping...")
            return True
        _, out, _ = conn.run(["find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"])
        out = out.strip()
        if not out or "\n" in out:
            # No results or more than one result
            log.warning("Unable to get the calculation output folder")
            return False
        calc_dir = out.rstrip("/")
        zipper_command = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS", "APPLI", "TMP",
                                        "CFD_CALC_ZIP_STATUS.py")
        old_status_file = models.calc.get_calc_status_file(user_id, project_codename, calc_id)
        status_file_path = util.path_join(api_util.WORKER_OUTPUT_PATH, status_file_name)
        conn.run(["python", zipper_command, "-i", calc_dir, "-o", status_file_path])
        worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                 IS_TOOLCHAIN_SECURED)
        if not status_file.exists(worker_out_storage):
            log.warning("Unable to get calculation status file: file not found")
            return False
        status_file.save_on_storage(worker_out_storage, storage, tmp_folder)
        file_id = status_file.save_in_database(user_id)
        models.calc.save_status_file(user_id, project_codename, calc_id, file_id)
    except error_util.all_errors as e:
        with error_util.saved_stack() as error_stack:
            status_file.delete_from_distant(storage)
            if error_util.is_abort(e):
                error_stack.reraise()
            else:
                error_util.log_error(log, e)
                return False
    if old_status_file:
        models.projects.remove_file_from_project(user_id, project_codename,
                                                 old_status_file['id'])
    return True
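# Hedged usage sketch (assumption, with hypothetical objects): inside a calculation toolchain,
# fetch_progress() would be called periodically with an open ssh connection to the main worker
# and the project storage, e.g.:
#
#     if not fetch_progress(conn, user_id, project_codename, "calc_01", calc_id,
#                           project_storage, "/tmp/fetch_progress"):
#         log.warning("No progress file fetched for calc " + str(calc_id))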
def run_garbage_collector(api_name, server_name, redis_host="localhost", redis_port=6379,
                          data_db=0, pubsub_db=1):
    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)
    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host, redis_port,
                                          data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    allowed_providers = json.loads(conf.get("general", "allowed_providers"))
    providers = []
    for provider_name in allowed_providers:
        providers.append(api_util.get_provider(provider_name))
    allowed_storages = json.loads(conf.get("general", "allowed_storages"))
    storages = []
    for storage_name in allowed_storages:
        storages.append(api_util.get_storage(storage_name))

    running_jobs = RunningJobs()
    thread_list = []
    for provider in providers:
        running_workers = RunningWorkers()
        worker_collector = WorkerCollector(provider, api_name, server_name, running_jobs,
                                           running_workers)
        worker_collector.start()
        thread_list.append(worker_collector)
        provider_artefact_collector = ProviderArtefactCollector(provider, running_jobs,
                                                                running_workers)
        provider_artefact_collector.start()
        thread_list.append(provider_artefact_collector)
    job_collector = JobCollector(running_jobs)
    job_collector.start()
    thread_list.append(job_collector)
    model_collector = ModelCollector()
    model_collector.start()
    thread_list.append(model_collector)

    # FIXME: Disabled for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        while True:
            time.sleep(0.1)
            for proc in thread_list:
                if not proc.is_alive():
                    proc.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if error_util.is_abort(e):
                log.info("Signal received, exiting...")
            else:
                error_util.log_error(log, e)
            log.info("Garbage collection cleaning...")
            stop_and_join(thread_list)
            log.info("Garbage collection is cleaned")
    log.info("Garbage collection cleaning...")
    stop_and_join(thread_list)
    log.info("Garbage collection is cleaned")
def using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                  debug_keep_instances_alive=False):
    machine_cost = models.provider_config.get_machine_provider_cost(provider.name, machine)
    if not machine_cost:
        raise RuntimeError("Unable to get the cost for provider " + str(provider.name))
    instance_price = api_util.price_to_float(machine_cost["cost_per_sec"]) * 3600  # In $/h, for aws spots
    nbr_machines = int(nbr_machines)
    alive_thread = None
    if nbr_machines == 1:
        workers = []
        try:
            log.info("Launching worker on provider " + str(provider.name))
            workers = provider.create_workers(int(nbr_machines), machine=machine,
                                              spot_price=instance_price)
            log.info("worker created")
            main_worker = workers[0]
            if main_worker.specific_cost:
                models.jobs.set_job_specific_cost(job_id, provider.name, machine,
                                                  main_worker.specific_cost,
                                                  machine_cost["currency"],
                                                  machine_cost["sec_granularity"],
                                                  machine_cost["min_sec_granularity"])
            # Tag instance
            provider.tag_workers(workers, {'Name': api_name + "_worker/job_" + str(job_id),
                                           "type": "worker"})
            if not debug_keep_instances_alive:
                debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
            tags = copy.copy(tags)
            tags['debug'] = "true" if debug_keep_instances_alive else "false"
            provider.tag_workers(workers, tags)

            # Connect to the worker
            ip = main_worker.public_ip if main_worker.public_ip else main_worker.private_ip
            log.info("Waiting for worker ssh connection to " + str(ip) + " ...")
            conn = ssh.SshConnection(ip, "aziugo", provider.get_key_path())
            conn.wait_for_connection()
            log.info("Connection with worker established")
            alive_thread = KeepAliveWorkerThread(conn)
            alive_thread.start()
            yield RunningWorkers(provider, workers, conn, debug_keep_instances_alive)
        finally:
            if alive_thread:
                alive_thread.stop()
                alive_thread.join()
            if workers and provider:
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    log.debug("Worker cleaning is disabled for debug purpose")
                else:
                    log.info("Stopping workers...")
                    cleanup_failed = False
                    try:
                        provider.terminate_workers(workers)
                    except error_util.abort_errors:
                        with error_util.before_raising():
                            try:
                                provider.terminate_workers(workers)
                                log.info("Workers stopped")
                            except error_util.abort_errors:
                                log.warning("Worker cleanup aborted.")
                                msg = "Workers of job " + str(job_id) + \
                                      " are not killed. Please kill them manually"
                                log.error(msg)
                                api_util.send_admin_email("Worker cleanup aborted.", msg)
                    except error_util.all_errors as e:
                        cleanup_failed = True
                        msg = "Workers of job " + str(job_id) + \
                              " are not killed. Please kill them manually"
                        log.error(msg)
                        error_util.log_error(log, e)
                        api_util.send_admin_email("Worker cleanup aborted.", msg)
                    if not cleanup_failed:
                        log.info("Workers stopped")
    else:
        machine_info = models.provider_config.get_machine(provider.name, machine)
        if not machine_info:
            raise RuntimeError("Unable to get the description of machine " + str(machine))
        nbr_cores = int(machine_info['nbr_cores'])
        cluster_tags = copy.copy(tags)
        if not debug_keep_instances_alive:
            debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
        cluster_tags.update({
            "debug": "true" if debug_keep_instances_alive else "false",
            '%master%_Name': api_name + "_worker/job_" + str(job_id),
            '%master%_type': "cluster master",
            '%slave%_Name': api_name + "_worker/job_" + str(job_id) + " slave %slave_index%",
            '%slave%_type': "cluster slave",
        })
        log.info("Launching worker on provider " + str(provider.name))
        with core.cluster.Cluster(provider, "aziugo", nbr_cores, str(job_id), machine=machine,
                                  spot_price=instance_price, tags=cluster_tags,
                                  debug_no_terminate=debug_keep_instances_alive) as cluster:
            try:
                log.info("Main worker launched, with id " + str(cluster.master_id))
                log.info("Launching " + str(nbr_machines - 1) + " slave workers...")
                cluster.add_slaves(nbr_machines - 1)
                log.info("Slave workers launched")

                # Connect to the worker
                log.info("Waiting for worker ssh connection to " + str(cluster.ip) + " ...")
                conn = ssh.SshConnection(cluster.ip, "aziugo", provider.get_key_path())
                conn.wait_for_connection()
                log.info("Connection with worker established")
                alive_thread = KeepAliveClusterThread(cluster)
                alive_thread.start()
                yield RunningWorkers(provider, cluster.workers, conn, debug_keep_instances_alive)
            finally:
                if alive_thread:
                    alive_thread.stop()
                    alive_thread.join()
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    cluster.disable_clean()
                if not debug_keep_instances_alive:
                    log.info("Stopping workers...")
        if not debug_keep_instances_alive:
            log.info("Workers stopped")
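# Hedged usage sketch (assumption): using_workers is written as a generator, presumably wrapped
# as a context manager; a toolchain would then allocate and release its workers like this.
# The machine code, tag values and job id below are hypothetical.
#
#     with using_workers("my_api", provider, 1337, "c4.2xlarge", 4,
#                        {"project": "my_project_uuid"}) as running:
#         running.conn.run(["echo", "workers are ready"])  # assumes RunningWorkers exposes .conn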