예제 #1
0
def fetch_progress(conn, user_id, project_codename, calc_name, calc_id,
                   storage, tmp_folder):
    """
    Check if a progress file has been created on the main worker and save it if it exists

    The worker-side zipper script is run on the single calculation output folder, the resulting status archive is
    copied to the project storage, registered in the database and attached to the calculation. The status file
    previously attached to the calculation (if any) is removed from the project only after the new one was saved.

    :param conn:                    The ssh connection to the main worker
    :type conn:                     ssh.SshConnection
    :param user_id:                 The id of the job owner
    :type user_id:                  int
    :param project_codename:        The project uuid
    :type project_codename:         str
    :param calc_name:               The name of the calculation
    :type calc_name:                str
    :param calc_id:                 The id of the calculation
    :type calc_id:                  int
    :param storage:                 The storage of the project
    :type storage:                  core.ssh.Storage
    :param tmp_folder:              A temporary folder to use
    :type tmp_folder:               str
    :return:                        True if the status file was saved, or if the calculation folder doesn't
                                    exist yet on the worker. False if the output folder or the status file
                                    can't be found, or if a failure which is not an abort happens
    :rtype:                         bool
    """
    status_file_name = project_codename + "_calc_" + calc_name + "_status.zip"
    status_file = cmd_util.ResultFile(project_codename, status_file_name)
    old_status_file = None
    try:
        calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS",
                                  "PROJECTS_CFD", project_codename, "CALC")
        if not conn.folder_exists(calc_dir):
            # Nothing to fetch yet: this is not considered as a failure
            log.debug("calc folder " + calc_dir +
                      " doesn't exists yet, skipping...")
            return True

        # List the direct sub-folders of the CALC folder: exactly one is expected
        _, out, _ = conn.run([
            "find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"
        ])
        out = out.strip()
        if not out or "\n" in out:  # No results or more than one result
            log.warning("Unable to get the calculation output folder")
            # Return an explicit bool as documented (this path used to return None)
            return False
        calc_dir = out.rstrip("/")

        zipper_command = util.path_join(api_util.WORKER_WORK_PATH,
                                        "ZephyTOOLS", "APPLI", "TMP",
                                        "CFD_CALC_ZIP_STATUS.py")
        # Remember the current status file so it can be removed once replaced
        old_status_file = models.calc.get_calc_status_file(
            user_id, project_codename, calc_id)
        status_file_path = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          status_file_name)
        # Ask the worker to pack the calculation status into the output folder
        conn.run(
            ["python", zipper_command, "-i", calc_dir, "-o", status_file_path])

        worker_out_storage = storages.SshStorage(conn,
                                                 api_util.WORKER_OUTPUT_PATH,
                                                 IS_TOOLCHAIN_SECURED)
        if not status_file.exists(worker_out_storage):
            log.warning(
                "Unable to get calculation status file: file not found")
            return False
        status_file.save_on_storage(worker_out_storage, storage, tmp_folder)
        file_id = status_file.save_in_database(user_id)
        models.calc.save_status_file(user_id, project_codename, calc_id,
                                     file_id)
    except error_util.all_errors as e:
        with error_util.saved_stack() as error_stack:
            # Remove what may have been uploaded before the failure
            status_file.delete_from_distant(storage)
            if error_util.is_abort(e):
                # Abort requests are propagated to the caller
                error_stack.reraise()
            else:
                # Other failures are only logged: fetching progress is best-effort
                error_util.log_error(log, e)
                return False
    if old_status_file:
        # The new status file is saved, the previous one is no longer needed
        models.projects.remove_file_from_project(user_id, project_codename,
                                                 old_status_file['id'])
    return True
예제 #2
0
def calculate(api_name, server_name, job, project, mesh, calculation,
              calc_param_file, provider_name, machine, nbr_machines,
              split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results and saving results

    While the task is running, the user is charged each time the paid period ends, the progress file is
    fetched periodically (see STATUS_FETCHING_DELAY) and the calculation is stopped on the worker as soon as its
    status in database becomes STATUS_STOPPED.
    The local calc_param_file is deleted once uploaded on the worker.
    If an error happens and REMOVE_RESULTS_ON_ERROR is set, the result files already copied on the storage are
    removed.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param mesh:                    The mesh used for this calculation
    :type mesh:                     dict[str, any]
    :param calculation:             The calculation to launch
    :type calculation:              dict[str, any]
    :param calc_param_file:         The main job parameter file
    :type calc_param_file:          str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param split_results:           Do you want the result file to be split ?
    :type split_results:            bool
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request string this job
    :type client_ip:                str
    :param api_version:             The version of the http api where the user ask to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id,
                                       "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename,
                                        result_name + "_workfiles.zip")
    # The iterations and reduce files only exist when the results are split
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename,
                                              result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename,
                                          result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename,
                                     calculation['name'], calc_param_file)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc",
                                      workers, [split_results]) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(
                    conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)

                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "project_file.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "anal.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id,
                                           project_codename,
                                           storage,
                                           worker_in_storage,
                                           "mesh.zip",
                                           tmp_folder,
                                           file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file,
                                              "calc_params.zip")
                # The local parameter file is no longer needed once on the worker
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                # Starting from the epoch forces a progress fetch on the first loop iteration
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(
                    0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break

                    # total_seconds() is required here: timedelta.seconds only holds the
                    # seconds component (0..86399) and silently drops the days
                    elapsed = (datetime.datetime.utcnow() -
                               last_fetched_progress_time).total_seconds()
                    if elapsed > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename,
                                       calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    # Watch for a stop request until it has been forwarded to the worker
                    if not is_stopped:
                        calculation = models.calc.get_calc(
                            user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " +
                                                          str(calc_id) +
                                                          " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                # The main result file is mandatory
                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage,
                                            tmp_folder)

                # Both split files are mandatory when the results are split
                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " +
                                  str(iterations_file) + " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage,
                                                    storage, tmp_folder)

                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) +
                                  " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage,
                                                tmp_folder)

                # The work files are optional
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage,
                                                  tmp_folder)
                else:
                    log.warning("No internal files found on server")

                # Last progress fetch, to get the final status of the calculation
                fetch_progress(conn, user_id, project_codename,
                               calculation['name'], calculation['id'], storage,
                               tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(user_id, job_id,
                                               "Cloud computation cost")

        # Uploading file on cloud storage
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(
            user_id) if internal_file.saved else None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id,
                                          iterations_file.file_id,
                                          reduce_file.file_id,
                                          internal_file_id)
        else:
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id, None, None,
                                          internal_file_id)
    except error_util.all_errors:
        # NOTE(review): error_util.before_raising() presumably re-raises the caught
        # error once the clean-up below is done -- confirm against error_util
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
def analyse(api_name, server_name, job, project, storage_name, project_file,
            provider_name, machine, nbr_machines, client_login, client_ip,
            api_version):
    """
    Analyse a raw project file on cloud worker(s) and save the analysed project file

    The raw project file is first attached to the project, then sent to the worker. The worker task is polled
    every 5 seconds, charging the user each time the paid period ends. Once the task is finished, the worker
    log and the analysed file are fetched and the analysed file is registered in the database.
    The local project_file is deleted once sent to the worker.
    If an error happens and REMOVE_RESULTS_ON_ERROR is set, the analysed file is removed from the storage.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param storage_name:            The name of the storage where the project will be located
    :type storage_name:             str
    :param project_file:            The raw project file to analyse
    :type project_file:             str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request string this job
    :type client_ip:                str
    :param api_version:             The version of the http api where the user ask to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = project_codename + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")

    # Tags attached to the launched worker(s)
    tags = dict(
        operation="anal",
        job_id=str(job_id),
        server=server_name,
        api=api_name,
        api_version=api_version,
        client=client_login,
        client_ip=client_ip,
        trusted=IS_TOOLCHAIN_SECURED)

    # Attach the raw project file to the project on the cloud storage
    log.info("Uploading project file to storage")
    raw_file_name = "project_" + project_codename + ".zip"
    models.projects.append_file_to_project(
        user_id, project_codename, project_file, raw_file_name,
        key=models.projects.PROJECT_FILE_RAW, overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Worker(s) creation
        with cmd_util.using_workers(
                api_name, provider, job_id, machine, nbr_machines, tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as cluster:
            with cmd_util.TaskProcess(job_id, project_codename, "anal",
                                      cluster) as task:
                ssh = cluster.ssh_connection

                # First charge, the user must still have credit afterwards
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                remote_project_file = util.path_join(
                    api_util.WORKER_INPUT_PATH, "project_file.zip")
                ssh.send_file(project_file, remote_project_file)
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Start the task and poll it until it leaves the running state
                log.info("Starting the computation")
                task.start()
                while True:
                    task_status = task.check_status()

                    # A new charge is required each time the paid period is over
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        break
                    time.sleep(5)
                log.info("Computation finished with status: " +
                         models.jobs.job_status_to_str(task_status))

                # The worker should still be reachable to fetch the results
                if not ssh.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetch the worker log, if any, then the analysed file
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(
                    ssh, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                remote_log = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                            "worker.log")
                if not ssh.file_exists(remote_log):
                    log.warning("No worker log file")
                else:
                    with file_util.temp_filename(dir=tmp_folder) as local_log:
                        ssh.get_file(remote_log, local_log)
                        models.jobs.save_job_log(job_id, local_log)

                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage,
                                              tmp_folder)
                log.info("Computation result fetched")

                # Tell the task that all its output was fetched
                task.stop_and_wait()

        # Last charge, if the paid period ended in the meantime
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(user_id, job_id,
                                               "Cloud computation cost")

        # Register the analysed file in the database
        analyzed_file.save_in_database(
            user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")