def run(api_name, server_name, job_id, project_file, anal_file, storage, client_login, client_ip, api_version):
    """
    Run the job: flag it as running, call `link` on the project files and keep the project status in sync

    The project status is set to ANALYSING before `link` is called and to ANALYSED once it has returned.
    If anything fails in between, the status is set back to RAW inside `error_util.before_raising()`.
    Both local files are removed (when they still exist) when the function exits, whatever the outcome.

    :param api_name:            The name of the api (not used by this function)
    :type api_name:             str
    :param server_name:         The name of the server, ex: apidev.zephycloud.com (not used by this function)
    :type server_name:          str
    :param job_id:              The id of the job to run
    :type job_id:               int
    :param project_file:        The raw project file to save
    :type project_file:         str
    :param anal_file:           The analysed project file
    :type anal_file:            str
    :param storage:             The name of the storage where the project will be located (not used here)
    :type storage:              str
    :param client_login:        The login of the job owner (not used by this function)
    :type client_login:         str
    :param client_ip:           The client ip address of the http request that started this job (not used here)
    :type client_ip:            str
    :param api_version:         The version of the http api used to launch this job (not used by this function)
    :type api_version:          str
    :raise api_util.ToolchainError: If the job or its project can't be found
    """
    try:
        # Configure better logging name
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Loading required information from database
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)

        project = models.projects.get_project(job['user_id'], job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " + str(job['project_uid']))

        def mark_project(status):
            # Single place where the project status is written
            models.projects.set_project_status(project["user_id"], job["project_uid"], status)

        try:
            mark_project(models.projects.PROJECT_STATUS_ANALYSING)
            link(job, project, project_file, anal_file)
            mark_project(models.projects.PROJECT_STATUS_ANALYSED)
        except error_util.all_errors:
            with error_util.before_raising():
                mark_project(models.projects.PROJECT_STATUS_RAW)
    finally:
        # The local copies are never kept, success or not
        for local_path in (project_file, anal_file):
            if os.path.exists(local_path):
                os.remove(local_path)
def __enter__(self):
    """
    Send the logging context to the worker, start the runner script and send it the task parameters

    Steps, all done through `self.conn`:
      * upload `log_info.json` into the worker input folder and make it world-readable
      * start `python <api_util.WORKER_RUNNER_PATH>` asynchronously
      * upload `task_params.json` into the worker input folder and make it world-readable

    The started process is stored in `self._proc` only once everything has been sent.
    On failure the process (if already started) is stopped or killed inside
    `error_util.before_raising()`.

    :return:    This object, so it can be bound by the `with` statement
    """
    runner = None
    try:
        # Logging context read by the worker side
        log_info_file = util.path_join(api_util.WORKER_INPUT_PATH, "log_info.json")
        log_info_json = json.dumps({
            "jobid": self._job_id,
            "job_type": self._command,
            "api_name": self._api_name,
            "server_name": self._server_name,
            "instance": self._running_worker.worker_ids[0],
            "provider": self._running_worker.provider_name
        })
        with file_util.temp_file(log_info_json) as local_copy:
            self.conn.send_file(local_copy, api_util.WORKER_INPUT_PATH + "/log_info.json")
        self.conn.run(["chmod", "a+r", log_info_file])

        runner = self.conn.run_async(["python", api_util.WORKER_RUNNER_PATH])

        # Send the task parameter to the worker
        task_params_file = os.path.join(api_util.WORKER_INPUT_PATH, "task_params.json")
        task_params_json = json.dumps({
            "jobid": self._job_id,
            "project_uid": self._project_uid,
            "toolchain": self._command,
            "params": json.dumps(self._params),
            "shutdown": "0" if self._running_worker.is_debug else "1"
        })
        with file_util.temp_file(task_params_json) as local_copy:
            self.conn.send_file(local_copy, task_params_file)
        self.conn.run(["chmod", "a+r", task_params_file])

        self._proc = runner
    except error_util.abort_errors:
        with error_util.before_raising():
            log.info("Signal received, stopping process")
            if runner:
                proc_util.ensure_stop_proc(runner, 2)
    except error_util.all_errors:
        with error_util.before_raising():
            # Best effort: a failure while killing must not hide the original error
            try:
                if runner:
                    proc_util.ensure_kill_proc(runner)
            except error_util.abort_errors:
                pass
            except error_util.all_errors as e:
                logging.getLogger("aziugo").exception(e)
    return self
def init(self):
    """
    Create the placement group and the security group of this cluster

    Both are created through `self._cloud` using `self.get_name()` as name and stored on the instance.
    If a creation fails, `self.clean()` is called inside `error_util.before_raising()`.
    """
    try:
        placement_group = self._cloud.create_placement_group(self.get_name())
        self._placement_group = placement_group

        security_group = self._cloud.create_security_group_for_cluster(self.get_name())
        self._security_group = security_group
    except error_util.all_errors:
        with error_util.before_raising():
            self.clean()
def init(self):
    """
    Create the docker volume shared by this cluster

    The volume name is derived from `self._name`: lowercased, every run of characters outside
    `a-z`, `0-9` and `_` replaced by `_`, then consecutive `_` squeezed into one.
    The volume is created with `docker volume create`, run from `API_PATH`.
    If anything fails, `self.clean()` is called inside `error_util.before_raising()`.
    """
    try:
        lowered = self._name.lower()
        underscored = re.sub("[^a-z0-9_]+", "_", lowered)
        self._shared_volume = re.sub("_+", "_", underscored)

        subprocess.check_output(['docker', 'volume', 'create', self._shared_volume],
                                cwd=API_PATH, stderr=subprocess.PIPE)
    except error_util.all_errors:
        with error_util.before_raising():
            self.clean()
def try_append_file_to_project(user_id, project_codename, file_path, filename=None, key=None):
    """
    Append a file to a project, yield it, and remove it again if the caller's code fails

    Generator with a single `yield`; presumably wrapped by `contextlib.contextmanager`
    outside this view (TODO confirm).

    :param user_id:             Passed through to `append_file_to_project`
    :param project_codename:    Passed through to `append_file_to_project`
    :param file_path:           Passed through to `append_file_to_project`
    :param filename:            Passed through to `append_file_to_project`. Optional, default None
    :param key:                 Passed through to `append_file_to_project`. Optional, default None
    :return:                    Yields whatever `append_file_to_project` returned (its 'id' entry is
                                used for the rollback)
    """
    file_info = append_file_to_project(user_id, project_codename, file_path, filename, key)

    def rollback():
        # A failing rollback is only logged, so the caller's original error stays the one raised
        try:
            remove_file_from_project(user_id, project_codename, file_info['id'])
        except error_util.all_errors as e:
            log.error("Unable to remove file " + repr(file_info))
            error_util.log_error(log, e)

    try:
        yield file_info
    except error_util.all_errors:
        with error_util.before_raising():
            rollback()
def uploading_file(self, local_src, dest_filename):
    """
    Upload a file, hand control back to the caller, and delete the uploaded file if the caller fails

    Generator with a single `yield`; presumably wrapped by `contextlib.contextmanager`
    outside this view (TODO confirm).

    :param local_src:       The local path of the file to send to this storage
    :type local_src:        str
    :param dest_filename:   The name of the file on the cloud storage
    :type dest_filename:    str
    :return:                Yields the result of `self.upload_file`: the url of the file on the
                            cloud storage, if any
    :rtype:                 str|None
    """
    uploaded_url = self.upload_file(local_src, dest_filename)
    try:
        yield uploaded_url
    except error_util.all_errors:
        # Undo the upload, the error itself is handled by before_raising
        with error_util.before_raising():
            self.delete_file(dest_filename)
def run_garbage_collector(api_name, server_name, redis_host="localhost", redis_port=6379, data_db=0, pubsub_db=1):
    """
    Start every collector thread and watch them until one of them dies or a signal is received

    SIGTERM and SIGINT are both routed to `raise_keyboard_interrupt`. For each allowed provider a
    `WorkerCollector` and a `ProviderArtefactCollector` are started; a single `JobCollector` and a single
    `ModelCollector` are started afterwards. The main loop then polls the threads every 0.1 second and
    calls `reraise()` on any thread which is no longer alive.

    :param api_name:        The name of the api
    :type api_name:         str
    :param server_name:     The name of the server
    :type server_name:      str
    :param redis_host:      The redis host. Optional, default "localhost"
    :type redis_host:       str
    :param redis_port:      The redis port. Optional, default 6379
    :type redis_port:       int
    :param data_db:         Passed to `RedisContext.set_params`. Optional, default 0
    :type data_db:          int
    :param pubsub_db:       Passed to `RedisContext.set_params`. Optional, default 1
    :type pubsub_db:        int
    """
    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, raise_keyboard_interrupt)

    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host, redis_port, data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    providers = [api_util.get_provider(provider_name)
                 for provider_name in json.loads(conf.get("general", "allowed_providers"))]
    # Only referenced by the disabled storage collectors below
    storages = [api_util.get_storage(storage_name)
                for storage_name in json.loads(conf.get("general", "allowed_storages"))]

    running_jobs = RunningJobs()
    thread_list = []

    def launch(collector):
        # Start first, register after: only started threads are ever stopped and joined
        collector.start()
        thread_list.append(collector)

    def shutdown():
        log.info("Garbage collection cleaning...")
        stop_and_join(thread_list)
        log.info("Garbage collection is cleaned")

    for provider in providers:
        running_workers = RunningWorkers()
        launch(WorkerCollector(provider, api_name, server_name, running_jobs, running_workers))
        launch(ProviderArtefactCollector(provider, running_jobs, running_workers))

    launch(JobCollector(running_jobs))
    launch(ModelCollector())

    # FIXME: Disable for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        while True:
            time.sleep(0.1)
            for collector in thread_list:
                if collector.is_alive():
                    continue
                collector.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if error_util.is_abort(e):
                log.info("Signal received, exiting...")
            else:
                error_util.log_error(log, e)
            # NOTE(review): nesting rebuilt from a whitespace-collapsed source; this cleanup is
            # assumed to run for both branches above - confirm against the original file
            shutdown()
    shutdown()
def run(api_name, server_name, job_id, project_codename, mesh_name, calc_id, calc_param_file, provider_name,
        machine, nbr_machines, split_results, client_login, client_ip, api_version):
    """
    Run a calculation job: check that the job, project, mesh and calculation exist, then call `calculate`

    The calculation is bound to the job and flagged RUNNING before `calculate` is called. Afterwards it
    is flagged COMPUTED, unless its status in database became STOPPED in the meantime. On an abort error
    it is flagged CANCELED, on any other error KILLED. The parameter file is removed (when it still
    exists) when the function exits.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job_id:              The id of the job to run
    :type job_id:               int
    :param project_codename:    The uid of the project
    :type project_codename:     str
    :param mesh_name:           The name of the mesh
    :type mesh_name:            str
    :param calc_id:             The id of the calculation to launch
    :type calc_id:              int
    :param calc_param_file:     The name of the param file
    :type calc_param_file:      str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param split_results:       Should the result file be split ?
    :type split_results:        bool
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request that started this job
    :type client_ip:            str
    :param api_version:         The version of the http api used to launch this job
    :type api_version:          str
    :raise api_util.ToolchainError: If the job, project, mesh or calculation can't be found
    """
    try:
        # Configure better logging name
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Loading required information from database
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)

        project = models.projects.get_project(job['user_id'], job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " + str(job['project_uid']))
        user_id = project["user_id"]

        mesh = models.meshes.get_mesh(user_id, project_codename, mesh_name)
        if not mesh:
            raise api_util.ToolchainError("Unknown mesh " + str(mesh_name) + " in project " +
                                          str(job['project_uid']))

        calculation = models.calc.get_calc(user_id, project['uid'], calc_id)
        if not calculation:
            raise api_util.ToolchainError("Unknown calculation " + str(calc_id))
        calc_name = calculation['name']

        def set_status(status):
            # Single place where the calculation status is written
            models.calc.set_calc_status(user_id, project_codename, calc_name, status)

        try:
            models.calc.set_job(user_id, project_codename, calc_name, job_id)
            set_status(models.calc.STATUS_RUNNING)
            calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file, provider_name,
                      machine, nbr_machines, split_results, client_login, client_ip, api_version)

            # The status may have been changed in database while the computation was running
            refreshed = models.calc.get_calc(user_id, project['uid'], calc_id)
            if refreshed['status'] != models.calc.STATUS_STOPPED:
                set_status(models.calc.STATUS_COMPUTED)
        # NOTE(review): other code uses `error_util.abort_errors`; confirm `api_util` exposes it too
        except api_util.abort_errors:
            with error_util.before_raising():
                set_status(models.calc.STATUS_CANCELED)
        except error_util.all_errors:
            with error_util.before_raising():
                set_status(models.calc.STATUS_KILLED)
    finally:
        if os.path.exists(calc_param_file):
            os.remove(calc_param_file)
def calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file, provider_name, machine,
              nbr_machines, split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results, then
    save the results

    While the task is running, the loop (one iteration per second):
      * charges the user again each time the paid period (`end_time`) is over, and stops when no credit
        is left
      * fetches the progress when more than `STATUS_FETCHING_DELAY` seconds elapsed since the last fetch
      * asks the worker to stop the computation once the calculation status in database is STOPPED
      * checks the worker still answers to ping

    `calc_param_file` is removed from the local disk once it has been uploaded to the worker.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job:                 The job information
    :type job:                  dict[str, any]
    :param project:             The main project
    :type project:              dict[str, any]
    :param mesh:                The mesh used for this calculation
    :type mesh:                 dict[str, any]
    :param calculation:         The calculation to launch
    :type calculation:          dict[str, any]
    :param calc_param_file:     The main job parameter file
    :type calc_param_file:      str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param split_results:       Should the result file be split ?
    :type split_results:        bool
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request that started this job
    :type client_ip:            str
    :param api_version:         The version of the http api used to launch this job
    :type api_version:          str
    :raise api_util.NoMoreCredits:  If the user has no credit left
    :raise api_util.ToolchainError: If the calculation or the worker disappears, or a result file is missing
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])

    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id, "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename, result_name + "_workfiles.zip")
    # The two extra files only exist when results are split; every later use is guarded the same way
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename, result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename, result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename, calculation['name'], calc_param_file)
    # NOTE(review): block nesting below was rebuilt from a whitespace-collapsed source; statements and
    # their order are unchanged, but confirm the indentation levels against the original file
    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc", workers,
                                      [split_results]) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "project_file.zip", tmp_folder, key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage, "anal.zip",
                                           tmp_folder, key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage, "mesh.zip",
                                           tmp_folder, file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file, "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                # Epoch start, so the very first loop iteration fetches the progress
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id,
                                                                      "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break

                    # total_seconds() and not `.seconds`: the latter is only the sub-day remainder of the
                    # delta, which is meaningless against the epoch-initialised timestamp above
                    elapsed = datetime.datetime.utcnow() - last_fetched_progress_time
                    if elapsed.total_seconds() > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename, calculation['name'],
                                       calculation['id'], storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    # Ask the worker to stop only once, as soon as the user stopped the calculation
                    if not is_stopped:
                        calculation = models.calc.get_calc(user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " + str(calc_id) + " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                    # Checking if the machine is still here
                    if not conn.ping():
                        models.jobs.save_job_text(job_id, "Worker instance disappeared")
                        raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                         IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(iterations_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                # The work files are optional, unlike the result files
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                else:
                    log.warning("No internal files found on server")

                fetch_progress(conn, user_id, project_codename, calculation['name'], calculation['id'],
                               storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

            # Charge again if required
            if datetime.datetime.utcnow() > end_time:
                models.users.charge_user_computing(project["user_id"], job_id, "Cloud computation cost")

        # Uploading file on cloud storage
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(user_id) if internal_file.saved else None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            models.calc.save_result_files(user_id, project_codename, calculation['name'], result_file.file_id,
                                          iterations_file.file_id, reduce_file.file_id, internal_file_id)
        else:
            models.calc.save_result_files(user_id, project_codename, calculation['name'], result_file.file_id,
                                          None, None, internal_file_id)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
def using_workers(api_name, provider, job_id, machine, nbr_machines, tags, debug_keep_instances_alive=False):
    """
    Launch one worker (or a cluster when several machines are asked), yield it, and clean everything after

    Generator with a single `yield` per branch; presumably wrapped by `contextlib.contextmanager`
    outside this view (TODO confirm).

    With one machine, the worker is created, tagged and terminated by this function.
    With several machines, a `core.cluster.Cluster` is used as a context manager instead.
    In both cases the workers are left untouched when `debug_keep_instances_alive` is true or when
    `models.jobs.is_shutdown_disabled(job_id)` says so.

    NOTE(review): block nesting was rebuilt from a whitespace-collapsed source; statements and their
    order are unchanged, but confirm the indentation levels against the original file.

    :param api_name:                    The name of the api, used in the 'Name' tag of the instances
    :type api_name:                     str
    :param provider:                    The provider where the workers should be launched
    :param job_id:                      The id of the job using the workers
    :type job_id:                       int
    :param machine:                     The type of machine to launch
    :type machine:                      str
    :param nbr_machines:                The number of machines to run
    :type nbr_machines:                 int
    :param tags:                        The tags to set on the workers. This dict is copied, not modified
    :type tags:                         dict[str, any]
    :param debug_keep_instances_alive:  Do not terminate the workers at the end. Optional, default False
    :type debug_keep_instances_alive:   bool
    :return:                            Yields a `RunningWorkers` built from the provider, the workers, the
                                        ssh connection and the final value of debug_keep_instances_alive
    :raise RuntimeError:                If the machine cost or the machine description can't be found
    """
    machine_cost = models.provider_config.get_machine_provider_cost(provider.name, machine)
    if not machine_cost:
        raise RuntimeError("Unable to get the cost for provider " + str(provider.name))
    instance_price = api_util.price_to_float(machine_cost["cost_per_sec"]) * 3600  # In $/h, for aws spots
    nbr_machines = int(nbr_machines)
    alive_thread = None

    if nbr_machines == 1:
        # Stays empty until the creation succeeded, so the cleanup knows whether there is anything to stop
        workers = []
        try:
            log.info("Launching worker on provider " + str(provider.name))
            workers = provider.create_workers(int(nbr_machines), machine=machine, spot_price=instance_price)
            log.info("worker created")
            main_worker = workers[0]
            if main_worker.specific_cost:
                models.jobs.set_job_specific_cost(job_id, provider.name, machine, main_worker.specific_cost,
                                                  machine_cost["currency"], machine_cost["sec_granularity"],
                                                  machine_cost["min_sec_granularity"])

            # Tag instance
            provider.tag_workers(workers, {'Name': api_name + "_worker/job_" + str(job_id), "type": "worker"})
            # The shutdown can also be disabled per job, from the database
            if not debug_keep_instances_alive:
                debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
            # Work on a copy so the caller's dict is left as is
            tags = copy.copy(tags)
            tags['debug'] = "true" if debug_keep_instances_alive else "false"
            provider.tag_workers(workers, tags)

            # Connect to the worker
            ip = main_worker.public_ip if main_worker.public_ip else main_worker.private_ip
            log.info("Waiting for worker ssh connection to " + str(ip) + " ...")
            conn = ssh.SshConnection(ip, "aziugo", provider.get_key_path())
            conn.wait_for_connection()
            log.info("Connection with worker established")
            alive_thread = KeepAliveWorkerThread(conn)
            alive_thread.start()
            yield RunningWorkers(provider, workers, conn, debug_keep_instances_alive)
        finally:
            if alive_thread:
                alive_thread.stop()
                alive_thread.join()
            if workers and provider:
                # Check again: the flag may have been set in database while the job was running.
                # A failure here is only logged, so the cleanup still happens
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    log.debug("Worker cleaning is disabled for debug purpose")
                else:
                    log.info("Stopping workers...")
                    cleanup_failed = False
                    try:
                        provider.terminate_workers(workers)
                    except error_util.abort_errors:
                        # Interrupted while terminating: try once more inside before_raising
                        with error_util.before_raising():
                            try:
                                provider.terminate_workers(workers)
                                log.info("Workers stopped")
                            except error_util.abort_errors:
                                # Interrupted twice: give up and ask an admin to clean manually
                                log.warning("Worker cleaned aborted.")
                                msg = "Workers of job "+str(job_id)+" are not killed. Please kill them manually"
                                log.error(msg)
                                api_util.send_admin_email("Worker cleaned aborted.", msg)
                    except error_util.all_errors as e:
                        # Any other failure is reported to the admins and not raised
                        cleanup_failed = True
                        msg = "Workers of job " + str(job_id) + " are not killed. Please kill them manually"
                        log.error(msg)
                        error_util.log_error(log, e)
                        api_util.send_admin_email("Worker cleaned aborted.", msg)
                    if not cleanup_failed:
                        log.info("Workers stopped")
    else:
        machine_info = models.provider_config.get_machine(provider.name, machine)
        if not machine_info:
            raise RuntimeError("Unable to get the description of machine " + str(machine))
        nbr_cores = int(machine_info['nbr_cores'])
        # Work on a copy so the caller's dict is left as is
        cluster_tags = copy.copy(tags)
        if not debug_keep_instances_alive:
            debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
        # Keys carrying %master% / %slave% placeholders are handed to the Cluster as is
        cluster_tags.update({
            "debug": "true" if debug_keep_instances_alive else "false",
            '%master%_Name': api_name + "_worker/job_" + str(job_id),
            '%master%_type': "cluster master",
            '%slave%_Name': api_name + "_worker/job_" + str(job_id) + " slave %slave_index%",
            '%slave%_type': "cluster slave",
        })

        log.info("Launching worker on provider " + str(provider.name))
        with core.cluster.Cluster(provider, "aziugo", nbr_cores, str(job_id), machine=machine,
                                  spot_price=instance_price, tags=cluster_tags,
                                  debug_no_terminate=debug_keep_instances_alive) as cluster:
            try:
                log.info("Main worker launched, with id " + str(cluster.master_id))
                log.info("Launching " + str(nbr_machines - 1) + " slave workers...")
                cluster.add_slaves(nbr_machines - 1)
                log.info("Slave workers launched")

                # Connect to the worker
                log.info("Waiting for worker ssh connection to "+str(cluster.ip)+" ...")
                conn = ssh.SshConnection(cluster.ip, "aziugo", provider.get_key_path())
                conn.wait_for_connection()
                log.info("Connection with worker established")
                alive_thread = KeepAliveClusterThread(cluster)
                alive_thread.start()
                yield RunningWorkers(provider, cluster.workers, conn, debug_keep_instances_alive)
            finally:
                if alive_thread:
                    alive_thread.stop()
                    alive_thread.join()
                # Check again: the flag may have been set in database while the job was running
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    cluster.disable_clean()
                if not debug_keep_instances_alive:
                    log.info("Stopping workers...")
        # Logged once the `with` block above has been left
        if not debug_keep_instances_alive:
            log.info("Workers stopped")
def analyse(api_name, server_name, job, project, storage_name, project_file, provider_name, machine, nbr_machines,
            client_login, client_ip, api_version):
    """
    Launch the machine(s), send the project file, start the worker script, wait for the result and save it

    The raw project file is first attached to the project, then sent to the worker and removed from
    the local disk. While the task is running, the loop (one iteration every 5 seconds) charges the user
    again each time the paid period (`end_time`) is over and checks the worker still answers to ping.

    NOTE(review): block nesting was rebuilt from a whitespace-collapsed source; statements and their
    order are unchanged, but confirm the indentation levels against the original file.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job:                 The job information
    :type job:                  dict[str, any]
    :param project:             The main project
    :type project:              dict[str, any]
    :param storage_name:        The name of the storage where the project will be located
    :type storage_name:         str
    :param project_file:        The raw project file to analyse
    :type project_file:         str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request that started this job
    :type client_ip:            str
    :param api_version:         The version of the http api used to launch this job
    :type api_version:          str
    :raise api_util.NoMoreCredits:  If the user has no credit left
    :raise api_util.ToolchainError: If the worker disappears or the result file is missing
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = job["project_uid"] + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")

    # No 'debug' entry here: `using_workers` sets tags['debug'] on its own copy of the tags
    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Uploading file on cloud storage
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(user_id, job["project_uid"], project_file,
                                           "project_" + job["project_uid"] + ".zip",
                                           key=models.projects.PROJECT_FILE_RAW, overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, job["project_uid"], "anal", workers) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                conn.send_file(project_file, util.path_join(api_util.WORKER_INPUT_PATH, "project_file.zip"))
                # The local copy is removed as soon as it has been sent to the worker
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id,
                                                                      "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    time.sleep(5)

                    # Checking if the machine is still here
                    if not conn.ping():
                        models.jobs.save_job_text(job_id, "Worker instance disappeared")
                        raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                         IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                # The worker log is optional: its absence is only a warning
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                # The result file is mandatory
                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

            # Charge again if required
            if datetime.datetime.utcnow() > end_time:
                models.users.charge_user_computing(project["user_id"], job_id, "Cloud computation cost")

        # Uploading file on cloud storage
        analyzed_file.save_in_database(user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            # Optionally remove the file already copied to the storage before the error goes on
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")