def master():
    from paddlerec.core.engine.cluster.cluster import ClusterEngine
    # Load the backend (cluster submit) config and flatten it into runtime environs.
    _envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(_envs, "_")
    flattens["engine_role"] = "MASTER"
    flattens["engine_mode"] = envs.get_runtime_environ("mode")
    flattens["engine_run_config"] = args.model
    flattens["engine_temp_path"] = tempfile.mkdtemp()
    envs.set_runtime_environs(flattens)
    ClusterEngine.workspace_replace()
    print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))
    launch = ClusterEngine(None, args.model)
    return launch
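For reference, envs.flatten_environs(_envs, "_") is assumed here to collapse the nested backend YAML into a single-level dict whose keys are joined with the given separator. A minimal stand-alone sketch of that assumed behaviour (not the PaddleRec implementation, all names and sample values below are illustrative):

def flatten_environs_sketch(nested, separator="_", prefix=""):
    # Illustrative stand-in: join nested dict keys with the separator.
    flat = {}
    for key, value in nested.items():
        name = prefix + separator + str(key) if prefix else str(key)
        if isinstance(value, dict):
            flat.update(flatten_environs_sketch(value, separator, name))
        else:
            flat[name] = value
    return flat

# {"submit": {"hdfs_path": "/user/demo"}} becomes {"submit_hdfs_path": "/user/demo"}.
print(flatten_environs_sketch({"cluster_type": "mpi",
                               "submit": {"hdfs_path": "/user/demo"}}))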
def master():
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    # Get fleet_mode & device
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    # Get Thread nums
    model_envs = envs.load_yaml(args.model)
    phases_class = ".".join(["runner", mode, "phases"])
    phase_names = run_extras.get(phases_class)
    phases = []
    all_phases = model_envs.get("phase")
    if phase_names is None:
        phases = all_phases
    else:
        for phase in all_phases:
            if phase["name"] in phase_names:
                phases.append(phase)

    thread_num = []
    for phase in phases:
        thread_num.append(int(phase["thread_num"]))
    max_thread_num = max(thread_num)

    backend_envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(backend_envs, "_")
    flattens["engine_role"] = "MASTER"
    flattens["engine_mode"] = envs.get_runtime_environ("mode")
    flattens["engine_run_config"] = args.model
    flattens["max_thread_num"] = max_thread_num
    flattens["fleet_mode"] = fleet_mode
    flattens["device"] = device
    flattens["backend_yaml"] = args.backend
    envs.set_runtime_environs(flattens)

    launch = ClusterEngine(None, args.model)
    return launch
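The runner keys this variant reads (runner.<mode>.fleet_mode, runner.<mode>.device, runner.<mode>.phases) and the per-phase thread_num selection can be exercised in isolation. A minimal sketch with a hypothetical in-memory config standing in for the YAML behind args.model (mode name, phase names, and values below are assumptions, not PaddleRec defaults):

# Illustrative only: hypothetical flattened runner keys and phase list.
run_extras = {
    "runner.single_cpu_train.fleet_mode": "ps",
    "runner.single_cpu_train.device": "cpu",
    "runner.single_cpu_train.phases": ["phase_train"],
}
all_phases = [
    {"name": "phase_train", "thread_num": 8},
    {"name": "phase_infer", "thread_num": 1},
]

mode = "single_cpu_train"
phase_names = run_extras.get(".".join(["runner", mode, "phases"]))
# Keep every phase when the runner does not restrict the phase list.
phases = all_phases if phase_names is None else [
    p for p in all_phases if p["name"] in phase_names
]
max_thread_num = max(int(p["thread_num"]) for p in phases)
print(max_thread_num)  # -> 8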
def master(): role = "MASTER" from paddlerec.core.engine.cluster.cluster import ClusterEngine with open(args.backend, 'r') as rb: _envs = yaml.load(rb.read(), Loader=yaml.FullLoader) flattens = envs.flatten_environs(_envs, "_") flattens["engine_role"] = role flattens["engine_run_config"] = args.model flattens["engine_temp_path"] = tempfile.mkdtemp() update_workspace(flattens) envs.set_runtime_environs(flattens) print( envs.pretty_print_envs(flattens, ("Submit Runtime Envs", "Value"))) launch = ClusterEngine(None, args.model) return launch
def worker(mode): if not mode: raise ValueError("mode: {} can not be recognized") from paddlerec.core.engine.cluster.cluster import ClusterEngine run_extras = get_all_inters_from_yaml(args.model, ["runner."]) trainer_class = ".".join(["runner", mode, "trainer_class"]) fleet_class = ".".join(["runner", mode, "fleet_mode"]) device_class = ".".join(["runner", mode, "device"]) strategy_class = ".".join(["runner", mode, "distribute_strategy"]) trainer = run_extras.get(trainer_class, "GeneralTrainer") fleet_mode = run_extras.get(fleet_class, "ps") device = run_extras.get(device_class, "cpu") distributed_strategy = run_extras.get(strategy_class, "async") executor_mode = "train" device = device.upper() fleet_mode = fleet_mode.upper() if fleet_mode == "COLLECTIVE" and device != "GPU": raise ValueError("COLLECTIVE can not be used without GPU") cluster_envs = {} cluster_envs["fleet_mode"] = fleet_mode cluster_envs["engine_role"] = "WORKER" cluster_envs["log_dir"] = "logs" cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.engine"] = "cluster" cluster_envs["train.trainer.executor_mode"] = executor_mode cluster_envs["train.trainer.strategy"] = distributed_strategy cluster_envs["train.trainer.threads"] = envs.get_runtime_environ( "CPU_NUM") cluster_envs["train.trainer.platform"] = envs.get_platform() print("launch {} engine with cluster to with model: {}".format( trainer, args.model)) set_runtime_envs(cluster_envs, args.model) launch = ClusterEngine(None, args.model) return launch