def fetch_progress(conn, user_id, project_codename, calc_name, calc_id, storage, tmp_folder):
    """
    Check if a progress file has been created on the main worker and save it if it exists

    :param conn:                The ssh connection to the main worker
    :type conn:                 ssh.SshConnection
    :param user_id:             The id of the job owner
    :type user_id:              int
    :param project_codename:    The project uuid
    :type project_codename:     str
    :param calc_name:           The name of the calculation
    :type calc_name:            str
    :param calc_id:             The id of the calculation
    :type calc_id:              int
    :param storage:             The storage of the project
    :type storage:              core.ssh.Storage
    :param tmp_folder:          A temporary folder to use
    :type tmp_folder:           str
    :return:                    True on success, False if no file is found or a failure happens
    :rtype:                     bool
    """
    status_file_name = project_codename + "_calc_" + calc_name + "_status.zip"
    status_file = cmd_util.ResultFile(project_codename, status_file_name)
    old_status_file = None
    try:
        calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS", "PROJECTS_CFD",
                                  project_codename, "CALC")
        if not conn.folder_exists(calc_dir):
            log.debug("calc folder " + calc_dir + " doesn't exist yet, skipping...")
            return True
        _, out, _ = conn.run(["find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"])
        out = out.strip()
        if not out or "\n" in out:  # No results or more than one result
            log.warning("Unable to get the calculation output folder")
            return False
        calc_dir = out.rstrip("/")
        zipper_command = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS", "APPLI", "TMP",
                                        "CFD_CALC_ZIP_STATUS.py")
        old_status_file = models.calc.get_calc_status_file(user_id, project_codename, calc_id)
        status_file_path = util.path_join(api_util.WORKER_OUTPUT_PATH, status_file_name)
        conn.run(["python", zipper_command, "-i", calc_dir, "-o", status_file_path])
        worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                 IS_TOOLCHAIN_SECURED)
        if not status_file.exists(worker_out_storage):
            log.warning("Unable to get calculation status file: file not found")
            return False
        status_file.save_on_storage(worker_out_storage, storage, tmp_folder)
        file_id = status_file.save_in_database(user_id)
        models.calc.save_status_file(user_id, project_codename, calc_id, file_id)
    except error_util.all_errors as e:
        with error_util.saved_stack() as error_stack:
            status_file.delete_from_distant(storage)
            if error_util.is_abort(e):
                error_stack.reraise()
            else:
                error_util.log_error(log, e)
                return False
    if old_status_file:
        models.projects.remove_file_from_project(user_id, project_codename, old_status_file['id'])
    return True
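
# Usage sketch (illustrative only): fetch_progress() is meant to be polled periodically
# while a calculation runs, as calculate() below does.  `task_is_running` is a
# hypothetical placeholder for task_proc.check_status() == models.jobs.JOB_STATUS_RUNNING.
#
#     last_fetch = datetime.datetime.utcfromtimestamp(0)
#     while task_is_running():
#         if (datetime.datetime.utcnow() - last_fetch).seconds > STATUS_FETCHING_DELAY:
#             fetch_progress(conn, user_id, project_codename, calculation['name'],
#                            calculation['id'], storage, tmp_folder)
#             last_fetch = datetime.datetime.utcnow()
#         time.sleep(1)
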
def calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file,
              provider_name, machine, nbr_machines, split_results, client_login, client_ip,
              api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and
    results, and save the results

    :param api_name:        The name of the api
    :type api_name:         str
    :param server_name:     The name of the server (ex: apidev.zephycloud.com)
    :type server_name:      str
    :param job:             The job information
    :type job:              dict[str, any]
    :param project:         The main project
    :type project:          dict[str, any]
    :param mesh:            The mesh used for this calculation
    :type mesh:             dict[str, any]
    :param calculation:     The calculation to launch
    :type calculation:      dict[str, any]
    :param calc_param_file: The main job parameter file
    :type calc_param_file:  str
    :param provider_name:   The name of the provider
    :type provider_name:    str
    :param machine:         The type of machine to launch
    :type machine:          str
    :param nbr_machines:    The number of machines to run
    :type nbr_machines:     int
    :param split_results:   Whether the result file should be split
    :type split_results:    bool
    :param client_login:    The login of the job owner
    :type client_login:     str
    :param client_ip:       The client ip address of the http request starting this job
    :type client_ip:        str
    :param api_version:     The version of the http api the user used to launch this job
    :type api_version:      str
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id, "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename, result_name + "_workfiles.zip")
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename, result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename, result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename, calculation['name'],
                                     calc_param_file)
    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc", workers,
                                      [split_results]) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id,
                                                              "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(conn, api_util.WORKER_INPUT_PATH,
                                                        IS_TOOLCHAIN_SECURED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "project_file.zip", tmp_folder,
                                           key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "anal.zip", tmp_folder,
                                           key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "mesh.zip", tmp_folder,
                                           file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file, "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id,
                                                                      "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()
                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    if (datetime.datetime.utcnow() - last_fetched_progress_time).seconds > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename, calculation['name'],
                                       calculation['id'], storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()
                    if not is_stopped:
                        calculation = models.calc.get_calc(user_id, project['uid'],
                                                           calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " + str(calc_id) +
                                                          " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id, "Worker instance disappeared")
                    raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                         IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")
                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(iterations_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                else:
                    log.warning("No internal files found on server")
                fetch_progress(conn, user_id, project_codename, calculation['name'],
                               calculation['id'], storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

                # Charge again if required
                if datetime.datetime.utcnow() > end_time:
                    models.users.charge_user_computing(project["user_id"], job_id,
                                                       "Cloud computation cost")

                # Uploading file on cloud storage
                result_file.save_in_database(user_id)
                internal_file_id = internal_file.save_in_database(user_id) if internal_file.saved else None
                if split_results:
                    iterations_file.save_in_database(user_id)
                    reduce_file.save_in_database(user_id)
                    models.calc.save_result_files(user_id, project_codename, calculation['name'],
                                                  result_file.file_id, iterations_file.file_id,
                                                  reduce_file.file_id, internal_file_id)
                else:
                    models.calc.save_result_files(user_id, project_codename, calculation['name'],
                                                  result_file.file_id, None, None,
                                                  internal_file_id)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
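
# Illustrative call sketch (hypothetical values; the real dispatcher, provider names and
# machine types live outside this module):
#
#     calculate(api_name="zephycloud", server_name="apidev.zephycloud.com",
#               job=job, project=project, mesh=mesh, calculation=calculation,
#               calc_param_file="/tmp/calc_params.zip",
#               provider_name="aws", machine="c4.2xlarge", nbr_machines=2,
#               split_results=True, client_login="user@example.com",
#               client_ip="203.0.113.10", api_version="1")
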
def analyse(api_name, server_name, job, project, storage_name, project_file, provider_name,
            machine, nbr_machines, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and
    results, and save the results

    :param api_name:        The name of the api
    :type api_name:         str
    :param server_name:     The name of the server (ex: apidev.zephycloud.com)
    :type server_name:      str
    :param job:             The job information
    :type job:              dict[str, any]
    :param project:         The main project
    :type project:          dict[str, any]
    :param storage_name:    The name of the storage where the project will be located
    :type storage_name:     str
    :param project_file:    The raw project file to analyse
    :type project_file:     str
    :param provider_name:   The name of the provider
    :type provider_name:    str
    :param machine:         The type of machine to launch
    :type machine:          str
    :param nbr_machines:    The number of machines to run
    :type nbr_machines:     int
    :param client_login:    The login of the job owner
    :type client_login:     str
    :param client_ip:       The client ip address of the http request starting this job
    :type client_ip:        str
    :param api_version:     The version of the http api the user used to launch this job
    :type api_version:      str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = job["project_uid"] + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Uploading file on cloud storage
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(user_id, job["project_uid"], project_file,
                                           "project_" + job["project_uid"] + ".zip",
                                           key=models.projects.PROJECT_FILE_RAW,
                                           overwrite=True)
    log.info("Project file uploaded")
    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, job["project_uid"], "anal", workers) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id,
                                                              "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                conn.send_file(project_file,
                               util.path_join(api_util.WORKER_INPUT_PATH, "project_file.zip"))
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id,
                                                                      "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()
                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    time.sleep(5)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id, "Worker instance disappeared")
                    raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH,
                                                         IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")
                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

                # Charge again if required
                if datetime.datetime.utcnow() > end_time:
                    models.users.charge_user_computing(project["user_id"], job_id,
                                                       "Cloud computation cost")

                # Uploading file on cloud storage
                analyzed_file.save_in_database(user_id,
                                               key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")
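
# Data-flow note (derived from the functions above): analyse() stores its result under the
# models.projects.PROJECT_FILE_ANALYSED key, which calculate() later copies onto the worker
# as "anal.zip" via cmd_util.copy_project_file().  A hypothetical invocation might look like:
#
#     analyse(api_name="zephycloud", server_name="apidev.zephycloud.com",
#             job=job, project=project, storage_name=project['storage'],
#             project_file="/tmp/project_upload.zip",
#             provider_name="aws", machine="c4.2xlarge", nbr_machines=1,
#             client_login="user@example.com", client_ip="203.0.113.10",
#             api_version="1")
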