def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    if 'reader' in component_name:
        component_parameters = component.get_role_parameters()
        return component_parameters[job.f_role][0]['ReaderParam']
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser=dsl_parser,
                                                           job_runtime_conf=job.f_runtime_conf,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role, job.f_party_id)
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(job_id=job.f_job_id,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id,
                                                           task_id=None,
                                                           task_version=None,
                                                           job_args=job_args_on_party,
                                                           job_parameters=job_parameters,
                                                           task_parameters=task_parameters,
                                                           input_dsl=task_input_dsl,
                                                           get_input_table=True)
    return component_input_table
def get_component_input_table(dsl_parser, job, component_name):
    component = dsl_parser.get_component_info(component_name=component_name)
    module_name = get_component_module(component_name, job.f_dsl)
    if 'reader' in module_name.lower():
        # Reader components take their parameters straight from the runtime conf,
        # keyed by this party's index within its role list.
        return job.f_runtime_conf.get("component_parameters", {}).get("role", {}).get(job.f_role, {}).get(
            str(job.f_roles.get(job.f_role).index(int(job.f_party_id)))).get(component_name)
    task_input_dsl = component.get_input()
    job_args_on_party = TaskExecutor.get_job_args_on_party(dsl_parser=dsl_parser,
                                                           job_runtime_conf=job.f_runtime_conf,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id)
    config = job_utils.get_job_parameters(job.f_job_id, job.f_role, job.f_party_id)
    task_parameters = RunParameters(**config)
    job_parameters = task_parameters
    component_input_table = TaskExecutor.get_task_run_args(job_id=job.f_job_id,
                                                           role=job.f_role,
                                                           party_id=job.f_party_id,
                                                           task_id=None,
                                                           task_version=None,
                                                           job_args=job_args_on_party,
                                                           job_parameters=job_parameters,
                                                           task_parameters=task_parameters,
                                                           input_dsl=task_input_dsl,
                                                           get_input_table=True)
    return component_input_table
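# A minimal, self-contained sketch of the reader-parameter lookup above: component
# parameters live in the runtime conf per role, keyed by the party's index within
# its role list. The conf layout and all values below are assumptions for illustration.
runtime_conf = {
    "component_parameters": {
        "role": {
            "guest": {
                "0": {"reader_0": {"table": {"name": "breast_guest", "namespace": "experiment"}}}
            }
        }
    }
}
roles = {"guest": [9999], "host": [10000]}
role, party_id, component_name = "guest", 9999, "reader_0"

party_index = str(roles[role].index(party_id))
reader_param = runtime_conf["component_parameters"]["role"][role][party_index][component_name]
assert reader_param == {"table": {"name": "breast_guest", "namespace": "experiment"}}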
def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
    if not job_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=job_id, role=role, party_id=party_id)
        job_parameters = RunParameters(**job_parameters)
    cores = 0
    memory = 0
    # Count resources only when the engine is not exempt and this role is not an
    # ignorable role on an engine that supports skipping resource allocation.
    # Parentheses make the original `A or B and C` precedence explicit.
    if not (job_parameters.computing_engine in IGNORE_RESOURCE_COMPUTING_ENGINE
            or (role in IGNORE_RESOURCE_ROLES
                and job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES)):
        cores = (int(job_parameters.adaptation_parameters["task_cores_per_node"] or 0)
                 * int(job_parameters.adaptation_parameters["task_nodes"] or 0)
                 * int(job_parameters.task_parallelism or 0))
        memory = (int(job_parameters.adaptation_parameters["task_memory_per_node"] or 0)
                  * int(job_parameters.adaptation_parameters["task_nodes"] or 0)
                  * int(job_parameters.task_parallelism or 0))
    return job_parameters.computing_engine, cores, memory
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                      role=task_info["role"],
                                                      party_id=task_info["party_id"])
        task_parameters = RunParameters(**job_parameters)
    cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                     task_parameters.adaptation_parameters["task_nodes"]
    memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                      task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task
def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
    if not job_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=job_id, role=role, party_id=party_id)
        job_parameters = RunParameters(**job_parameters)
    cores = job_parameters.adaptation_parameters["task_cores_per_node"] * \
            job_parameters.adaptation_parameters["task_nodes"] * \
            job_parameters.task_parallelism
    memory = job_parameters.adaptation_parameters["task_memory_per_node"] * \
             job_parameters.adaptation_parameters["task_nodes"] * \
             job_parameters.task_parallelism
    return job_parameters.computing_engine, cores, memory
def calculate_task_resource(cls, task_parameters: RunParameters = None, task_info: dict = None):
    if not task_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=task_info["job_id"],
                                                      role=task_info["role"],
                                                      party_id=task_info["party_id"])
        task_parameters = RunParameters(**job_parameters)
    if task_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
        cores_per_task = 0
        memory_per_task = 0
    elif task_info["role"] in IGNORE_RESOURCE_ROLES \
            and task_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
        cores_per_task = 0
        memory_per_task = 0
    else:
        cores_per_task = task_parameters.adaptation_parameters["task_cores_per_node"] * \
                         task_parameters.adaptation_parameters["task_nodes"]
        memory_per_task = task_parameters.adaptation_parameters["task_memory_per_node"] * \
                          task_parameters.adaptation_parameters["task_nodes"]
    return cores_per_task, memory_per_task
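# A hedged sketch of the ignore-resource guard used above. The constant values
# here are assumptions for illustration (in FATE the arbiter typically holds no
# data, so it can skip resource allocation on engines that support it).
IGNORE_RESOURCE_ROLES = {"arbiter"}
SUPPORT_IGNORE_RESOURCE_ENGINES = {"EGGROLL", "STANDALONE"}

def requests_zero_resource(role, computing_engine):
    # True when this party can be scheduled without reserving cores or memory.
    return role in IGNORE_RESOURCE_ROLES and computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES

assert requests_zero_resource("arbiter", "EGGROLL") is True
assert requests_zero_resource("guest", "EGGROLL") is False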
def calculate_job_resource(cls, job_parameters: RunParameters = None, job_id=None, role=None, party_id=None):
    if not job_parameters:
        job_parameters = job_utils.get_job_parameters(job_id=job_id, role=role, party_id=party_id)
        job_parameters = RunParameters(**job_parameters)
    if job_parameters.backend == Backend.LINKIS_SPARK_RABBITMQ:
        cores = 0
        memory = 0
    elif role in IGNORE_RESOURCE_ROLES and job_parameters.computing_engine in SUPPORT_IGNORE_RESOURCE_ENGINES:
        cores = 0
        memory = 0
    else:
        cores = job_parameters.adaptation_parameters["task_cores_per_node"] * \
                job_parameters.adaptation_parameters["task_nodes"] * \
                job_parameters.task_parallelism
        memory = job_parameters.adaptation_parameters["task_memory_per_node"] * \
                 job_parameters.adaptation_parameters["task_nodes"] * \
                 job_parameters.task_parallelism
    return job_parameters.computing_engine, cores, memory
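# Worked example of the resource arithmetic above (all values illustrative):
# with 4 cores and 2 GB per node, 2 nodes per task, and task_parallelism of 3,
# the job reserves 4 * 2 * 3 = 24 cores and 2 * 2 * 3 = 12 GB in total, while
# calculate_task_resource would charge 4 * 2 = 8 cores per single task.
adaptation_parameters = {"task_cores_per_node": 4, "task_memory_per_node": 2, "task_nodes": 2}
task_parallelism = 3

job_cores = adaptation_parameters["task_cores_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism
job_memory = adaptation_parameters["task_memory_per_node"] * adaptation_parameters["task_nodes"] * task_parallelism
cores_per_task = adaptation_parameters["task_cores_per_node"] * adaptation_parameters["task_nodes"]
assert (job_cores, job_memory, cores_per_task) == (24, 12, 8)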
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    PrivilegeAuth.authentication_component(job_dsl,
                                           src_party_id=kwargs.get('src_party_id'),
                                           src_role=kwargs.get('src_role'),
                                           party_id=party_id,
                                           component_name=component_name)
    schedule_logger(job_id).info(
        f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess")
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    is_failed = False
    try:
        task = JobSaver.query_task(task_id=task_id, task_version=task_version, role=role, party_id=party_id)[0]
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        run_parameters_dict["src_user"] = kwargs.get("src_user")
        run_parameters = RunParameters(**run_parameters_dict)

        config_dir = job_utils.get_task_directory(job_id, role, party_id, component_name, task_id, task_version)
        os.makedirs(config_dir, exist_ok=True)
        run_parameters_path = os.path.join(config_dir, 'task_parameters.json')
        with open(run_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))

        schedule_logger(job_id).info(f"use computing engine {run_parameters.computing_engine}")
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}
        backend_engine = build_engine(run_parameters.computing_engine)
        run_info = backend_engine.run(task=task,
                                      run_parameters=run_parameters,
                                      run_parameters_path=run_parameters_path,
                                      config_dir=config_dir,
                                      log_dir=job_utils.get_job_log_directory(job_id, role, party_id, component_name),
                                      cwd_dir=job_utils.get_job_directory(job_id, role, party_id, component_name),
                                      user_name=kwargs.get("user_id"))
        task_info.update(run_info)
        task_info["start_time"] = current_timestamp()
        task_executor_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        is_failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            task_info["party_status"] = TaskStatus.RUNNING
            cls.update_task_status(task_info=task_info)
            if is_failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            "task {} {} on {} {} executor subprocess start {}".format(
                task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))
def start_task_worker(cls, worker_name, task: Task, task_parameters: RunParameters = None,
                      executable: list = None, extra_env: dict = None, **kwargs):
    worker_id, config_dir, log_dir = cls.get_process_dirs(worker_name=worker_name,
                                                          job_id=task.f_job_id,
                                                          role=task.f_role,
                                                          party_id=task.f_party_id,
                                                          task=task)
    session_id = job_utils.generate_session_id(task.f_task_id, task.f_task_version, task.f_role, task.f_party_id)
    federation_session_id = job_utils.generate_task_version_id(task.f_task_id, task.f_task_version)

    info_kwargs = {}
    specific_cmd = []
    if worker_name is WorkerName.TASK_EXECUTOR:
        from fate_flow.worker.task_executor import TaskExecutor
        module_file_path = sys.modules[TaskExecutor.__module__].__file__
    else:
        raise Exception(f"not support {worker_name} worker")

    if task_parameters is None:
        task_parameters = RunParameters(**job_utils.get_job_parameters(task.f_job_id, task.f_role, task.f_party_id))
    config = task_parameters.to_dict()
    config["src_user"] = kwargs.get("src_user")
    config_path, result_path = cls.get_config(config_dir=config_dir, config=config, log_dir=log_dir)

    if executable:
        process_cmd = executable
    else:
        process_cmd = [sys.executable or "python3"]

    common_cmd = [
        module_file_path,
        "--job_id", task.f_job_id,
        "--component_name", task.f_component_name,
        "--task_id", task.f_task_id,
        "--task_version", task.f_task_version,
        "--role", task.f_role,
        "--party_id", task.f_party_id,
        "--config", config_path,
        "--result", result_path,
        "--log_dir", log_dir,
        "--parent_log_dir", os.path.dirname(log_dir),
        "--worker_id", worker_id,
        "--run_ip", RuntimeConfig.JOB_SERVER_HOST,
        "--job_server", f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
        "--session_id", session_id,
        "--federation_session_id", federation_session_id,
    ]
    process_cmd.extend(common_cmd)
    process_cmd.extend(specific_cmd)

    env = cls.get_env(task.f_job_id, task.f_provider_info)
    if extra_env:
        env.update(extra_env)
    schedule_logger(task.f_job_id).info(
        f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} "
        f"{worker_name} worker subprocess is ready")
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=config_dir,
                                     process_cmd=process_cmd,
                                     added_env=env,
                                     log_dir=log_dir,
                                     cwd_dir=config_dir,
                                     process_name=worker_name.value,
                                     process_id=worker_id)
    cls.save_worker_info(task=task, worker_name=worker_name, worker_id=worker_id,
                         run_ip=RuntimeConfig.JOB_SERVER_HOST, run_pid=p.pid,
                         config=config, cmd=process_cmd, **info_kwargs)
    return {"run_pid": p.pid, "worker_id": worker_id, "cmd": process_cmd}
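# Roughly what process_utils.run_subprocess amounts to: spawn the worker with its
# own log files, working directory, and environment. This is an illustrative
# approximation, not the real helper; file names and bookkeeping are simplified.
import os
import subprocess

def run_subprocess_sketch(process_cmd, log_dir, cwd_dir, added_env=None):
    os.makedirs(log_dir, exist_ok=True)
    env = dict(os.environ)
    env.update(added_env or {})
    # keep the log handles open for the lifetime of the child process
    stdout = open(os.path.join(log_dir, "std.log"), "a")
    stderr = open(os.path.join(log_dir, "error.log"), "a")
    # every element of the command line must be a string for Popen
    return subprocess.Popen([str(c) for c in process_cmd],
                            stdout=stdout, stderr=stderr, cwd=cwd_dir, env=env)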
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    """
    Start task, update status and party status
    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    schedule_logger(job_id).info(
        'try to start job {} task {} {} on {} {} executor subprocess'.format(
            job_id, task_id, task_version, role, party_id))
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                role, party_id, component_name, task_id, task_version)
        os.makedirs(task_dir, exist_ok=True)
        task_parameters_path = os.path.join(task_dir, 'task_parameters.json')
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        with open(task_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))

        run_parameters = RunParameters(**run_parameters_dict)
        schedule_logger(job_id=job_id).info(f"use computing engine {run_parameters.computing_engine}")

        if run_parameters.computing_engine in {ComputingEngine.EGGROLL, ComputingEngine.STANDALONE}:
            process_cmd = [
                sys.executable,
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ]
        elif run_parameters.computing_engine == ComputingEngine.SPARK:
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = run_parameters.spark_run
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-v', task_version,
                '-r', role,
                '-p', party_id,
                '-c', task_parameters_path,
                '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
                '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
            ])
        else:
            raise ValueError(f"{run_parameters.computing_engine} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess is ready'.format(
                job_id, task_id, task_version, role, party_id))
        p = job_utils.run_subprocess(job_id=job_id, config_dir=task_dir,
                                     process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_info["party_status"] = TaskStatus.RUNNING
            # task_info["run_pid"] = p.pid
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        else:
            task_info["party_status"] = TaskStatus.FAILED
    except Exception as e:
        schedule_logger(job_id).exception(e)
        task_info["party_status"] = TaskStatus.FAILED
    finally:
        try:
            cls.update_task(task_info=task_info)
            cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess start {}'.format(
                job_id, task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))
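# For a spark_run section like the one below, the SPARK branch above assembles
# roughly this command line (task_id, role, and all values are illustrative):
spark_run = {"deploy-mode": "client", "num-executors": "2",
             "conf": {"spark.executor.memory": "2g"}}
task_id, role = "202101010000000000000_hetero_lr_0", "guest"

cmd = ["spark-submit", f"--name={task_id}#{role}"]
for k, v in spark_run.items():
    if k != "conf":
        cmd.append(f"--{k}={v}")
for ck, cv in spark_run.get("conf", {}).items():
    cmd.extend(["--conf", f"{ck}={cv}"])
assert cmd == ["spark-submit",
               f"--name={task_id}#{role}",
               "--deploy-mode=client",
               "--num-executors=2",
               "--conf", "spark.executor.memory=2g"]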