def _run(self):
    """Initialize every component of the job for this (role, party).

    For each component listed in ``self.args.config["components"]``, resolve its
    parameters via the provider; if this party has parameters for the component,
    create the corresponding task, otherwise mark it as not needing to run.

    Returns:
        dict: ``{component_name: {"need_run": bool}}`` for every component.
    """
    result = {}
    dsl_parser = schedule_utils.get_job_dsl_parser(
        dsl=self.args.dsl,
        runtime_conf=self.args.runtime_conf,
        train_runtime_conf=self.args.train_runtime_conf,
        pipeline_dsl=self.args.pipeline_dsl,
    )
    provider = ComponentProvider(**self.args.config["provider"])
    common_task_info = self.args.config["common_task_info"]
    log_msg = f"initialize the components: {self.args.config['components']}"
    LOGGER.info(start_log(log_msg, role=self.args.role, party_id=self.args.party_id))
    for component_name in self.args.config["components"]:
        result[component_name] = {}
        parameters, user_specified_parameters = ProviderManager.get_component_parameters(
            dsl_parser=dsl_parser,
            component_name=component_name,
            role=self.args.role,
            party_id=self.args.party_id,
            provider=provider,
        )
        if parameters:
            # Build the task_info only when it is actually needed; a copy of
            # common_task_info keeps the shared template unmodified.
            task_info = dict(common_task_info)
            task_info["component_name"] = component_name
            task_info["component_module"] = parameters["module"]
            task_info["provider_info"] = provider.to_dict()
            task_info["component_parameters"] = parameters
            TaskController.create_task(
                role=self.args.role,
                party_id=self.args.party_id,
                run_on_this_party=common_task_info["run_on_this_party"],
                task_info=task_info,
            )
            result[component_name]["need_run"] = True
        else:
            # No parameters resolved for this party: the component is not run here.
            result[component_name]["need_run"] = False
    LOGGER.info(successful_log(log_msg, role=self.args.role, party_id=self.args.party_id))
    return result
def start_general_worker(cls,
                         worker_name: WorkerName,
                         job_id: str = "",
                         role: str = "",
                         party_id: int = 0,
                         provider: ComponentProvider = None,
                         initialized_config: dict = None,
                         run_in_subprocess: bool = True,
                         **kwargs):
    """Start a general (non-task-executor) worker, either as a subprocess or in-process.

    Supported workers: PROVIDER_REGISTRAR and DEPENDENCE_UPLOAD (require ``provider``),
    and TASK_INITIALIZER (requires ``initialized_config``).

    Args:
        worker_name: Which worker to launch.
        job_id / role / party_id: Job context; when all are set, the schedule
            logger for the job is used instead of ``stat_logger``.
        provider: Component provider (required for registrar/dependence-upload).
        initialized_config: Init config (required for the task initializer).
        run_in_subprocess: Launch via subprocess when True (forced True in DEBUG).
        **kwargs: Extra options, e.g. ``dependence_type``, ``callback``,
            ``callback_param`` for DEPENDENCE_UPLOAD.

    Returns:
        tuple(int, dict): (return code, result) on success; ``None`` for the
        asynchronous DEPENDENCE_UPLOAD path.

    Raises:
        ValueError: required argument for the chosen worker is missing.
        Exception: unsupported worker, subprocess failure, or timeout.
    """
    if RuntimeConfig.DEBUG:
        run_in_subprocess = True
    # Snapshot of the call arguments; later used to fill callback parameters
    # for the asynchronous DEPENDENCE_UPLOAD path.
    participate = locals()
    worker_id, config_dir, log_dir = cls.get_process_dirs(
        worker_name=worker_name, job_id=job_id, role=role, party_id=party_id)

    if worker_name in [WorkerName.PROVIDER_REGISTRAR, WorkerName.DEPENDENCE_UPLOAD]:
        if not provider:
            raise ValueError("no provider argument")
        config = {"provider": provider.to_dict()}
        if worker_name == WorkerName.PROVIDER_REGISTRAR:
            from fate_flow.worker.provider_registrar import ProviderRegistrar
            module = ProviderRegistrar
            module_file_path = sys.modules[ProviderRegistrar.__module__].__file__
            specific_cmd = []
        elif worker_name == WorkerName.DEPENDENCE_UPLOAD:
            from fate_flow.worker.dependence_upload import DependenceUpload
            module = DependenceUpload
            module_file_path = sys.modules[DependenceUpload.__module__].__file__
            specific_cmd = ['--dependence_type', kwargs.get("dependence_type")]
        # Set provider_info unconditionally for BOTH workers of this branch;
        # otherwise a PROVIDER_REGISTRAR launch would hit an unbound-local
        # NameError at cls.get_env(job_id, provider_info) below.
        provider_info = provider.to_dict()
    elif worker_name is WorkerName.TASK_INITIALIZER:
        if not initialized_config:
            raise ValueError("no initialized_config argument")
        config = initialized_config
        job_conf = job_utils.save_using_job_conf(job_id=job_id,
                                                 role=role,
                                                 party_id=party_id,
                                                 config_dir=config_dir)
        from fate_flow.worker.task_initializer import TaskInitializer
        module = TaskInitializer
        module_file_path = sys.modules[TaskInitializer.__module__].__file__
        specific_cmd = [
            '--dsl', job_conf["dsl_path"],
            '--runtime_conf', job_conf["runtime_conf_path"],
            '--train_runtime_conf', job_conf["train_runtime_conf_path"],
            '--pipeline_dsl', job_conf["pipeline_dsl_path"],
        ]
        provider_info = initialized_config["provider"]
    else:
        raise Exception(f"not support {worker_name} worker")

    config_path, result_path = cls.get_config(config_dir=config_dir,
                                              config=config,
                                              log_dir=log_dir)
    process_cmd = [
        sys.executable or "python3",
        module_file_path,
        "--config", config_path,
        '--result', result_path,
        "--log_dir", log_dir,
        "--parent_log_dir", os.path.dirname(log_dir),
        "--worker_id", worker_id,
        "--run_ip", RuntimeConfig.JOB_SERVER_HOST,
        "--job_server", f"{RuntimeConfig.JOB_SERVER_HOST}:{RuntimeConfig.HTTP_PORT}",
    ]
    if job_id:
        process_cmd.extend([
            "--job_id", job_id,
            "--role", role,
            "--party_id", party_id,
        ])
    process_cmd.extend(specific_cmd)

    if run_in_subprocess:
        p = process_utils.run_subprocess(job_id=job_id,
                                         config_dir=config_dir,
                                         process_cmd=process_cmd,
                                         added_env=cls.get_env(job_id, provider_info),
                                         log_dir=log_dir,
                                         cwd_dir=config_dir,
                                         process_name=worker_name.value,
                                         process_id=worker_id)
        participate["pid"] = p.pid
        # Same message either way; only the logger destination differs.
        logger = schedule_logger(job_id) if job_id and role and party_id else stat_logger
        msg = f"{worker_name} worker {worker_id} subprocess {p.pid}"
        logger.info(ready_log(msg=msg, role=role, party_id=party_id))

        if worker_name in [WorkerName.DEPENDENCE_UPLOAD]:
            # Asynchronous: do not wait for the subprocess; optionally invoke
            # the caller-supplied callback with the requested argument subset.
            if kwargs.get("callback") and kwargs.get("callback_param"):
                callback_param = {}
                participate.update(participate.get("kwargs", {}))
                for k, v in participate.items():
                    if k in kwargs.get("callback_param"):
                        callback_param[k] = v
                kwargs.get("callback")(**callback_param)
        else:
            # Synchronous: wait (bounded) for completion and return its result.
            try:
                p.wait(timeout=120)
                if p.returncode == 0:
                    logger.info(successful_log(msg=msg, role=role, party_id=party_id))
                    return p.returncode, load_json_conf(result_path)
                logger.info(failed_log(msg=msg, role=role, party_id=party_id))
                std_path = process_utils.get_std_path(log_dir=log_dir,
                                                      process_name=worker_name.value,
                                                      process_id=worker_id)
                raise Exception(
                    f"run error, please check logs: {std_path}, {log_dir}/INFO.log"
                )
            except subprocess.TimeoutExpired:
                err = failed_log(msg=f"{msg} run timeout", role=role, party_id=party_id)
                logger.exception(err)
                raise Exception(err)
            finally:
                # Best-effort cleanup; the process is usually already gone.
                try:
                    p.kill()
                    p.poll()
                except Exception as e:
                    logger.exception(e)
    else:
        # In-process execution: translate the CLI vector back into function
        # kwargs. Use a distinct name so the **kwargs parameter is not shadowed.
        func_kwargs = cls.cmd_to_func_kwargs(process_cmd)
        code, message, result = module().run(**func_kwargs)
        if code == 0:
            return code, result
        raise Exception(message)