def start_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger.info('job {} {} {} {} task subprocess is ready'.format(
        job_id, component_name, role, party_id, task_config))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        process_cmd = [
            'python3', sys.modules[TaskExecutor.__module__].__file__,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-r', role,
            '-p', party_id,
            '-c', task_config_path
        ]
        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger.info('job {} {} {} {} task subprocess start'.format(
            job_id, component_name, role, party_id, task_config))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger.exception(e)
    finally:
        schedule_logger.info('job {} component {} on {} {} start task subprocess {}'.format(
            job_id, component_name, role, party_id,
            'success' if task_process_start_status else 'failed'))
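# Hedged sketch (not the actual FATE implementation): every launcher above relies on
# job_utils.run_subprocess, which is assumed here to wrap subprocess.Popen, redirect the
# child's stdout/stderr into the log directory, and return the Popen handle that callers
# test for truthiness or read .pid from. File names and signature are assumptions.
import os
import subprocess


def run_subprocess_sketch(config_dir, process_cmd, log_dir=None):
    # assumption: combined stdout/stderr goes to <log_dir>/std.log
    os.makedirs(config_dir, exist_ok=True)
    log_dir = log_dir or config_dir
    os.makedirs(log_dir, exist_ok=True)
    std_log = open(os.path.join(log_dir, 'std.log'), 'w')
    # cast every element to str because job ids, ports and party ids may be ints
    p = subprocess.Popen([str(c) for c in process_cmd],
                         stdout=std_log, stderr=std_log, cwd=config_dir)
    return p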
def run(job_id, component_name, task_id, task_version, role, party_id, task_parameters_path, task_info, **kwargs):
    process_cmd = [
        sys.executable,
        sys.modules[TaskExecutor.__module__].__file__,
        '-j', job_id,
        '-n', component_name,
        '-t', task_id,
        '-v', task_version,
        '-r', role,
        '-p', party_id,
        '-c', task_parameters_path,
        '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
        '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
    ]
    task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
    task_job_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
    schedule_logger(job_id).info(
        'job {} task {} {} on {} {} executor subprocess is ready'.format(
            job_id, task_id, task_version, role, party_id))
    task_dir = os.path.dirname(task_parameters_path)
    p = job_utils.run_subprocess(job_id=job_id, config_dir=task_dir, process_cmd=process_cmd,
                                 log_dir=task_log_dir, job_dir=task_job_dir)
    task_info["run_pid"] = p.pid
    return p
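# Illustration only (hypothetical values): how the process_cmd list built by run() might
# render as a shell command. The job id, component name, paths and ports below are made up;
# only the flag names come from the code above.
sample_executor_cmd = [
    '/usr/bin/python3', '/data/projects/fate/fate_flow/task_executor.py',
    '-j', '202101011200000000001',
    '-n', 'hetero_lr_0',
    '-t', '202101011200000000001_hetero_lr_0',
    '-v', '0',
    '-r', 'guest',
    '-p', '9999',
    '-c', '/data/projects/fate/jobs/202101011200000000001/guest/9999/hetero_lr_0/task_parameters.json',
    '--run_ip', '127.0.0.1',
    '--job_server', '127.0.0.1:9380',
]
print(' '.join(sample_executor_cmd))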
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('cannot support this operation: {}'.format(module))
    detect_utils.check_config(request_config, required_arguments=required_arguments)
    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                                  request_config["file"])
    try:
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=request_config.get('local', {}).get("role"),
                                          party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = [
            "python3",
            os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
            "-j", _job_id,
            "-c", conf_file_path
        ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101), job_id=_job_id,
                               data={'table_name': request_config['table_name'],
                                     'namespace': request_config['namespace'],
                                     'pid': p.pid if p else ''})
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
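# Hedged example request bodies for this handler, based only on the required_arguments
# checked above; concrete values are illustrative and any extra fields a real deployment
# accepts are not shown. The work_mode encoding (0 standalone, 1 cluster) is an assumption.
example_upload_config = {
    "work_mode": 0,
    "namespace": "experiment",
    "table_name": "breast_hetero_guest",
    "file": "examples/data/breast_hetero_guest.csv",  # relative paths are resolved against the project base
    "head": 1,
    "partition": 8,
}
example_download_config = {
    "work_mode": 0,
    "namespace": "experiment",
    "table_name": "breast_hetero_guest",
    "output_path": "/tmp/breast_hetero_guest.csv",
}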
def run_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id, task_config))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")

        backend = Backend(backend)

        if backend.is_eggroll():
            process_cmd = [
                'python3', sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ])
        else:
            raise ValueError(f"backend {backend} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id, task_config))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
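# Hedged example of the job_parameters consumed by run_task above. Key names come from the
# code; the numeric backend encoding (0 -> eggroll, 1 -> spark) and the concrete values are
# assumptions. With the spark backend, every key except "conf" becomes `--<key>=<value>` and
# each "conf" entry becomes a repeated `--conf k=v` pair on the spark-submit command line.
example_job_parameters = {
    "backend": 1,                        # assumed: 0 -> eggroll, 1 -> spark
    "processors_per_node": 2,            # only used on the eggroll branch
    "spark_submit_config": {
        "deploy-mode": "client",         # only "client" is accepted by this code
        "num-executors": 2,
        "executor-memory": "1g",
        "conf": {
            "spark.executor.memoryOverhead": "512m",
        },
    },
}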
def download_upload(data_func):
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info('generated job_id {}, body {}'.format(_job_id, request_config))
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)
    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('cannot support this operation: {}'.format(module))
    detect_utils.check_config(request_config, required_arguments=required_arguments)
    job_work_mode = request_config['work_mode']
    # TODO: this code is redundant with job_app/submit_job; the next version of this function
    # will be implemented via job_app/submit_job
    if job_work_mode != RuntimeConfig.WORK_MODE:
        if RuntimeConfig.WORK_MODE == WorkMode.CLUSTER and job_work_mode == WorkMode.STANDALONE:
            # use the cluster's standalone job server to execute the standalone job
            return request_execute_server(
                request=request,
                execute_host='{}:{}'.format(request.remote_addr, CLUSTER_STANDALONE_JOB_SERVER_PORT))
        else:
            raise Exception('a server running in standalone mode cannot execute cluster-mode jobs')
    if module == "upload":
        if not os.path.isabs(request_config['file']):
            request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                                  request_config["file"])
    try:
        conf_file_path = new_runtime_conf(job_dir=_job_dir, method=data_func, module=module,
                                          role=request_config.get('local', {}).get("role"),
                                          party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)
        progs = [
            "python3",
            os.path.join(file_utils.get_project_base_directory(), JOB_MODULE_CONF[module]["module_path"]),
            "-j", _job_id,
            "-c", conf_file_path
        ]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None
        return get_json_result(retcode=(0 if p else 101), job_id=_job_id,
                               data={
                                   'table_name': request_config['table_name'],
                                   'namespace': request_config['namespace'],
                                   'pid': p.pid if p else ''
                               })
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
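# Hedged illustration of the success and failure payloads produced at the end of this handler.
# The dict keys mirror the get_json_result keyword arguments above; how a given FATE version
# serializes them into the HTTP response (key casing, extra fields) is an assumption.
example_success_response = {
    "retcode": 0,
    "job_id": "202101011200000000001",   # hypothetical generated job id
    "data": {"table_name": "breast_hetero_guest", "namespace": "experiment", "pid": 12345},
}
example_failure_response = {
    "retcode": -104,
    "retmsg": "failed",
    "job_id": "202101011200000000001",
}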
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    schedule_logger(job_id).info(
        'try to start job {} task {} {} on {} {} executor subprocess'.format(
            job_id, task_id, task_version, role, party_id))
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id,
                                component_name, task_id, task_version)
        os.makedirs(task_dir, exist_ok=True)
        task_parameters_path = os.path.join(task_dir, 'task_parameters.json')
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        with open(task_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))

        run_parameters = RunParameters(**run_parameters_dict)

        schedule_logger(job_id=job_id).info(f"use computing engine {run_parameters.computing_engine}")

        if run_parameters.computing_engine in {ComputingEngine.EGGROLL, ComputingEngine.STANDALONE}:
            process_cmd = [
                sys.executable,
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ]
        elif run_parameters.computing_engine == ComputingEngine.SPARK:
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = run_parameters.spark_run

            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ])
        else:
            raise ValueError(f"computing engine {run_parameters.computing_engine} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess is ready'.format(
                job_id, task_id, task_version, role, party_id))
        p = job_utils.run_subprocess(job_id=job_id, config_dir=task_dir, process_cmd=process_cmd,
                                     log_dir=task_log_dir)
        if p:
            task_info["party_status"] = TaskStatus.RUNNING
            # task_info["run_pid"] = p.pid
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        else:
            task_info["party_status"] = TaskStatus.FAILED
    except Exception as e:
        schedule_logger(job_id).exception(e)
        task_info["party_status"] = TaskStatus.FAILED
    finally:
        try:
            cls.update_task(task_info=task_info)
            cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess start {}'.format(
                job_id, task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))
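# Hedged example of the run_parameters_dict written to task_parameters.json in start_task above.
# Only the fields the method actually reads (computing_engine, spark_run) are shown, and the
# engine string values are assumptions; a real RunParameters object carries more fields.
example_run_parameters_dict = {
    "computing_engine": "SPARK",         # assumed values: "EGGROLL", "STANDALONE" or "SPARK"
    "spark_run": {
        "deploy-mode": "client",         # only "client" is accepted by this code
        "executor-memory": "2g",
        "conf": {"spark.driver.maxResultSize": "2g"},
    },
}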
def start_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(
            job_id, component_name, role, party_id, task_config))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")

        backend = Backend(backend)

        if backend.is_eggroll() or backend.is_eggroll2():
            process_cmd = [
                'python3', sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(task_config['job_server']['ip'],
                                               task_config['job_server']['http_port']),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())

            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            queue = spark_submit_config.get("queue", "default")
            driver_memory = spark_submit_config.get("driver-memory", "1g")
            num_executors = spark_submit_config.get("num-executors", 2)
            executor_memory = spark_submit_config.get("executor-memory", "1g")
            executor_cores = spark_submit_config.get("executor-cores", 1)

            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_home = os.environ["SPARK_HOME"]
            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [
                spark_submit_cmd,
                f'--name={task_id}#{role}',
                f'--deploy-mode={deploy_mode}',
                f'--queue={queue}',
                f'--driver-memory={driver_memory}',
                f'--num-executors={num_executors}',
                f'--executor-memory={executor_memory}',
                f'--executor-cores={executor_cores}',
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(task_config['job_server']['ip'],
                                               task_config['job_server']['http_port']),
            ]
        else:
            raise ValueError(f"backend {backend} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(
                job_id, component_name, role, party_id, task_config))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
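# Illustration only: with an empty spark_submit_config, the defaults read above produce a
# spark-submit invocation roughly like the following. SPARK_HOME, the job id, component name,
# paths, IP and port are all made up; the flag names and default values come from the code.
example_spark_cmd = [
    "/opt/spark/bin/spark-submit",
    "--name=202101011200000000001_hetero_lr_0#guest",
    "--deploy-mode=client",
    "--queue=default",
    "--driver-memory=1g",
    "--num-executors=2",
    "--executor-memory=1g",
    "--executor-cores=1",
    "/data/projects/fate/fate_flow/driver/task_executor.py",
    "-j", "202101011200000000001",
    "-n", "hetero_lr_0",
    "-t", "202101011200000000001_hetero_lr_0",
    "-r", "guest",
    "-p", "9999",
    "-c", "/data/projects/fate/jobs/202101011200000000001/guest/9999/hetero_lr_0/task_config.json",
    "--job_server", "192.168.0.1:9380",
]
print(" ".join(example_spark_cmd))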