def start_worker_procs(self):
    if envs.get_runtime_environ("fleet_mode") == "COLLECTIVE":
        #trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            selected_gpus = range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT"))):
                # CUDA_VISIBLE_DEVICES holds strings, so compare as str
                assert str(x) in cuda_visible_devices_list, \
                    "Can't find your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]." \
                    % (x, cuda_visible_devices)
            selected_gpus = [
                cuda_visible_devices_list.index(str(x))
                for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
            ]
        print("selected_gpus:{}".format(selected_gpus))

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]
        logs_dir = envs.get_runtime_environ("log_dir")
        print("use_paddlecloud_flag:{}".format(
            cluster_utils.use_paddlecloud()))
        if cluster_utils.use_paddlecloud():
            cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
            logger.info("get cluster from cloud:{}".format(cluster))
            procs = cluster_utils.start_local_trainers(
                cluster, pod, cmd, log_dir=logs_dir)
            print("cluster:{}".format(cluster))
            print("pod:{}".format(pod))
    else:
        trainer = TrainerFactory.create(self.trainer)
        trainer.run()

def _get_dataset(self, state="TRAIN"): if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" train_data_path = envs.get_global_env("train_data_path", None, namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" train_data_path = envs.get_global_env("test_data_path", None, namespace) sparse_slots = envs.get_global_env("sparse_slots", None, namespace) dense_slots = envs.get_global_env("dense_slots", None, namespace) threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') if sparse_slots is None and dense_slots is None: pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml) else: padding = envs.get_global_env("padding", 0, namespace) pipe_cmd = "python {} {} {} {} {} {} {} {}".format( reader, "slot", "slot", self._config_yaml, namespace, \ sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding)) if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None train_data_path = os.path.join(package_base, train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command(pipe_cmd) dataset.set_batch_size(batch_size) dataset.set_thread(threads) file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] self.files = file_list dataset.set_filelist(self.files) debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) if debug_mode: print("--- Dataset Debug Mode Begin , show pre 10 data of {}---". format(file_list[0])) os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd)) print("--- Dataset Debug Mode End , show pre 10 data of {}---". format(file_list[0])) exit(0) return dataset
def dataloader_by_name(readerclass, dataset_name, yaml_file):
    reader_class = lazy_instance_by_fliename(readerclass, "TrainReader")
    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]

    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    sample_iter = reader.generate_sample(line)
                    for parsed_line in sample_iter():
                        if parsed_line is None:
                            continue
                        # keep only the value of each (name, value) pair
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader

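# --- Illustrative usage (not part of the original source) ---
# dataloader_by_name returns either a batch generator (when the reader
# implements generate_batch_from_trainfiles) or a sample-generator function.
# A minimal sketch of feeding the sample-generator case into a fluid
# DataLoader; `feed_vars` and the reader file name are hypothetical.
def _make_loader_sketch(feed_vars, yaml_file):
    import paddle.fluid as fluid
    gen = dataloader_by_name("reader.py", "dataset_train", yaml_file)
    loader = fluid.io.DataLoader.from_generator(
        feed_list=feed_vars, capacity=64, iterable=True)
    loader.set_sample_generator(gen, batch_size=32, places=fluid.CPUPlace())
    return loader
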
def __init_impl__(self):
    abs_dir = os.path.dirname(os.path.abspath(__file__))
    backend = envs.get_runtime_environ("engine_backend")
    if backend == "PaddleCloud":
        self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
    else:
        raise ValueError("backend {} is not supported yet".format(backend))

def master():
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    # Get fleet_mode & device
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    # Get thread num
    model_envs = envs.load_yaml(args.model)
    phases_class = ".".join(["runner", mode, "phases"])
    phase_names = run_extras.get(phases_class)
    phases = []
    all_phases = model_envs.get("phase")
    if phase_names is None:
        phases = all_phases
    else:
        for phase in all_phases:
            if phase["name"] in phase_names:
                phases.append(phase)

    thread_num = []
    for phase in phases:
        thread_num.append(int(phase["thread_num"]))
    max_thread_num = max(thread_num)

    backend_envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(backend_envs, "_")
    flattens["engine_role"] = "MASTER"
    flattens["engine_mode"] = envs.get_runtime_environ("mode")
    flattens["engine_run_config"] = args.model
    flattens["max_thread_num"] = max_thread_num
    flattens["fleet_mode"] = fleet_mode
    flattens["device"] = device
    flattens["backend_yaml"] = args.backend
    envs.set_runtime_environs(flattens)

    launch = ClusterEngine(None, args.model)
    return launch

def get_worker_num(run_extras, workers):
    _envs = envs.load_yaml(args.model)
    mode = envs.get_runtime_environ("mode")
    workspace = envs.get_runtime_environ("workspace")
    phases_class = ".".join(["runner", mode, "phases"])
    phase_names = run_extras.get(phases_class)
    phases = []
    all_phases = _envs.get("phase")
    if phase_names is None:
        phases = all_phases
    else:
        for phase in all_phases:
            if phase["name"] in phase_names:
                phases.append(phase)

    dataset_names = []
    for phase in phases:
        dataset_names.append(phase["dataset_name"])

    datapaths = []
    for dataset in _envs.get("dataset"):
        if dataset["name"] in dataset_names:
            datapaths.append(dataset["data_path"])

    if not datapaths:
        raise ValueError("data path must exist for training/inference")

    datapaths = [
        envs.workspace_adapter_by_specific(path, workspace)
        for path in datapaths
    ]
    all_workers = [len(os.listdir(path)) for path in datapaths]
    all_workers.append(workers)
    max_worker_num = min(all_workers)

    if max_worker_num >= workers:
        return workers

    print(
        "phases do not have enough data for training, set worker/gpu cards num from {} to {}"
        .format(workers, max_worker_num))
    return max_worker_num

def __init_impl__(self):
    self.role = envs.get_runtime_environ("engine_role")
    if self.role == "WORKER":
        return

    abs_dir = os.path.dirname(os.path.abspath(__file__))
    os.environ["abs_dir"] = str(abs_dir)

    self.backend = envs.get_runtime_environ("backend")
    if not self.backend:
        self.backend = ""
    self.backend = self.backend.upper()
    if self.backend == "PADDLECLOUD":
        self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
    elif self.backend == "KUBERNETES":
        self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
    else:
        raise ValueError("backend {} is not supported yet".format(
            self.backend))

def _get_dataset(self, state="TRAIN", hour=None): if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" train_data_path = envs.get_global_env("train_data_path", None, namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" train_data_path = envs.get_global_env("test_data_path", None, namespace) threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml) if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None train_data_path = os.path.join(package_base, train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command(pipe_cmd) dataset.set_batch_size(batch_size) dataset.set_thread(threads) if hour is not None: train_data_path = os.path.join(train_data_path, hour) file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] self.files = file_list dataset.set_filelist(self.files) return dataset
def which_executor_mode(self):
    executor_mode = envs.get_runtime_environ("train.trainer.executor_mode")
    if executor_mode.upper() not in ["TRAIN", "INFER"]:
        raise ValueError(
            "Unsupported executor mode: {}".format(executor_mode))
    self.is_infer = (executor_mode.upper() == "INFER")
    print("Executor Mode: {}".format(executor_mode))
    self._context["is_infer"] = self.is_infer

def run(self):
    role = envs.get_runtime_environ("engine_role")
    if role == "MASTER":
        self.start_master_procs()
    elif role == "WORKER":
        self.start_worker_procs()
    else:
        raise ValueError(
            "role {} error, must be one of MASTER/WORKER".format(role))

def slotdataloader(readerclass, train, yaml_file, context):
    if train == "TRAIN":
        reader_name = "SlotReader"
        namespace = "train.reader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "SlotReader"
        namespace = "evaluate.reader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
        print("file_list: {}".format(files))

    sparse = get_global_env("sparse_slots", "#", namespace)
    if sparse == "":
        sparse = "#"
    dense = get_global_env("dense_slots", "#", namespace)
    if dense == "":
        dense = "#"
    padding = get_global_env("padding", 0, namespace)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    sample_iter = reader.generate_sample(line)
                    for parsed_line in sample_iter():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader

def which_fleet_mode(self):
    fleet_mode = envs.get_runtime_environ("fleet_mode")
    if fleet_mode.upper() == "PS":
        self.fleet_mode = FleetMode.PS
    elif fleet_mode.upper() == "COLLECTIVE":
        self.fleet_mode = FleetMode.COLLECTIVE
    elif fleet_mode.upper() == "PSLIB":
        self.fleet_mode = FleetMode.PSLIB
    else:
        raise ValueError("Unsupported fleet mode: {}".format(fleet_mode))
    self._context["is_pslib"] = (fleet_mode.upper() == "PSLIB")
    self._context["fleet_mode"] = fleet_mode

def __init_impl__(self):
    abs_dir = os.path.dirname(os.path.abspath(__file__))
    backend = envs.get_runtime_environ("engine_backend")
    if not backend:
        backend = ""
    backend = backend.upper()
    if backend == "PADDLECLOUD":
        self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
    elif backend == "KUBERNETES":
        self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
    else:
        raise ValueError("backend {} is not supported yet".format(backend))

def worker(mode):
    if not mode:
        raise ValueError("mode: {} can not be recognized".format(mode))

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)
    executor_mode = "train"

    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    cluster_envs = {}
    if device == "GPU":
        cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def master():
    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    _envs = envs.load_yaml(args.backend)
    flattens = envs.flatten_environs(_envs, "_")
    flattens["engine_role"] = "MASTER"
    flattens["engine_mode"] = envs.get_runtime_environ("mode")
    flattens["engine_run_config"] = args.model
    flattens["engine_temp_path"] = tempfile.mkdtemp()
    envs.set_runtime_environs(flattens)
    ClusterEngine.workspace_replace()
    print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))

    launch = ClusterEngine(None, args.model)
    return launch

def paddlecloud_env_check(self):
    # get fleet mode
    fleet_mode = envs.get_runtime_environ("fleet_mode")
    # get device
    device = envs.get_runtime_environ("device")
    # get cluster type
    cluster_type = envs.get_runtime_environ("cluster_type")

    cluster_env_check_tool = None
    if cluster_type.upper() == "MPI":
        if device == "CPU" and fleet_mode == "PS":
            cluster_env_check_tool = PaddleCloudMpiEnv()
        else:
            raise ValueError(
                "Paddlecloud with MPI does not support GPU training, check your config.yaml & backend.yaml"
            )
    elif cluster_type.upper() == "K8S":
        if fleet_mode == "PS":
            if device == "CPU":
                cluster_env_check_tool = CloudPsCpuEnv()
            elif device == "GPU":
                raise ValueError(
                    "PS-GPU on paddlecloud is not supported at this time, coming soon"
                )
        if fleet_mode == "COLLECTIVE":
            if device == "GPU":
                cluster_env_check_tool = CloudCollectiveEnv()
            elif device == "CPU":
                raise ValueError(
                    "Unexpected config -> device: CPU with fleet_mode: Collective, check your config.yaml"
                )
    else:
        raise ValueError(
            "cluster_type {} error, must be in MPI/K8S".format(cluster_type))

    # guard against fleet_mode/device combinations not handled above,
    # which would otherwise crash on a None check tool
    if cluster_env_check_tool is None:
        raise ValueError(
            "unsupported fleet_mode {} with device {} on cluster_type {}".
            format(fleet_mode, device, cluster_type))

    cluster_env_check_tool.env_check()
    cluster_env_check_tool.env_set()

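# Summary of the dispatch in paddlecloud_env_check above (added for clarity;
# it restates the code, nothing new):
#
#   cluster_type  fleet_mode  device  ->  env check tool
#   MPI           PS          CPU     ->  PaddleCloudMpiEnv
#   K8S           PS          CPU     ->  CloudPsCpuEnv
#   K8S           COLLECTIVE  GPU     ->  CloudCollectiveEnv
#   any other combination             ->  ValueError
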
def which_engine(self):
    engine = envs.get_runtime_environ("train.trainer.engine")
    if engine.upper() == "SINGLE":
        self.engine = EngineMode.SINGLE
        self.is_fleet = False
    elif engine.upper() == "LOCAL_CLUSTER":
        self.engine = EngineMode.LOCAL_CLUSTER
        self.is_fleet = True
    elif engine.upper() == "CLUSTER":
        self.engine = EngineMode.CLUSTER
        self.is_fleet = True
    else:
        raise ValueError("Unsupported engine: {}".format(engine))
    self._context["is_fleet"] = self.is_fleet
    self._context["engine"] = self.engine

def worker(): role = "WORKER" trainer = get_trainer_prefix(args) + "ClusterTrainer" cluster_envs = {} cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.engine"] = "cluster" cluster_envs["train.trainer.threads"] = envs.get_runtime_environ( "CPU_NUM") cluster_envs["train.trainer.platform"] = envs.get_platform() print("launch {} engine with cluster to with model: {}".format( trainer, args.model)) set_runtime_envs(cluster_envs, args.model) trainer = TrainerFactory.create(args.model) return trainer
def dataloader_by_name(readerclass,
                       dataset_name,
                       yaml_file,
                       context,
                       reader_class_name="Reader"):
    reader_class = lazy_instance_by_fliename(readerclass, reader_class_name)
    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
        print("file_list: {}".format(files))

    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    sample_iter = reader.generate_sample(line)
                    for parsed_line in sample_iter():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader

def single_infer_engine(args):
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    epochs_class = ".".join(["runner", mode, "epochs"])
    epochs = run_extras.get(epochs_class, 1)
    if epochs > 1:
        warnings.warn(
            "It makes no sense to predict the same model for multiple epochs",
            category=UserWarning,
            stacklevel=2)

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}
    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single mode only supports one GPU; use local cluster mode for multiple GPUs"
            )
        single_envs["selected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer

def _build_strategy(self, context):
    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync"]

    strategy = None
    if context['fleet_mode'] == "PS":
        strategy = paddle.distributed.fleet.DistributedStrategy()
        if mode == 'async':
            strategy.a_sync = True
        elif mode == 'sync':
            strategy.a_sync = False
        elif mode == 'geo':
            strategy.a_sync = True
            strategy.a_sync_configs = {"k_steps": 400}
    elif context['fleet_mode'] == "COLLECTIVE":
        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.sync_nccl_allreduce = True
        strategy.nccl_comm_num = 2
        strategy.fuse_all_reduce_ops = True

        # build strategy
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_sequential_execution = True
        build_strategy.fuse_elewise_add_act_ops = True
        build_strategy.fuse_bn_act_ops = True
        build_strategy.enable_auto_fusion = True
        build_strategy.fuse_all_optimizer_ops = True
        strategy.build_strategy = build_strategy

        # execution strategy
        execution_strategy = paddle.static.ExecutionStrategy()
        execution_strategy.num_threads = int(os.getenv('CPU_NUM', 2))
        execution_strategy.num_iteration_per_drop_scope = 100
        execution_strategy.num_iteration_per_run = 1
        strategy.execution_strategy = execution_strategy

    assert strategy is not None
    context["strategy"] = strategy
    return strategy

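# --- Illustrative usage (not part of the original source) ---
# The DistributedStrategy built above is consumed by wrapping the user
# optimizer; a minimal sketch assuming the paddle.distributed.fleet API
# (fleet.init / fleet.distributed_optimizer) and a hypothetical `optimizer`.
def _wrap_optimizer_sketch(optimizer, strategy, is_collective=False):
    import paddle.distributed.fleet as fleet
    fleet.init(is_collective=is_collective)  # role is read from the environment
    return fleet.distributed_optimizer(optimizer, strategy=strategy)
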
def slotdataloader_by_name(readerclass, dataset_name, yaml_file):
    name = "dataset." + dataset_name + "."
    reader_name = "SlotReader"
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]

    sparse = get_global_env(name + "sparse_slots", "#")
    if sparse == "":
        sparse = "#"
    dense = get_global_env(name + "dense_slots", "#")
    if dense == "":
        dense = "#"
    padding = get_global_env(name + "padding", 0)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    sample_iter = reader.generate_sample(line)
                    for parsed_line in sample_iter():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader

def build_strategy(self):
    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync", "half_async"]

    strategy = None
    if mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    elif mode == "geo":
        push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
        strategy = StrategyFactory.create_geo_strategy(push_num)
    elif mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()

    assert strategy is not None
    self.strategy = strategy
    return strategy

def _build_strategy(self, context):
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory

    mode = envs.get_runtime_environ("train.trainer.strategy")
    assert mode in ["async", "geo", "sync", "half_async"]

    strategy = None
    if mode == "async":
        strategy = StrategyFactory.create_async_strategy()
    elif mode == "geo":
        push_num = envs.get_global_env("train.strategy.mode.push_num", 100)
        strategy = StrategyFactory.create_geo_strategy(push_num)
    elif mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()

    assert strategy is not None
    context["strategy"] = strategy
    return strategy

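# --- Illustrative usage (not part of the original source) ---
# The transpiler-based strategy above pairs with the incubate PS fleet API;
# a minimal sketch under that assumption, with a hypothetical `optimizer`.
def _apply_ps_strategy_sketch(optimizer, strategy):
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    return fleet.distributed_optimizer(optimizer, strategy)
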
def worker(mode):
    if not mode:
        raise ValueError("mode: {} can not be recognized".format(mode))

    from paddlerec.core.engine.cluster.cluster import ClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"

    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    cluster_envs = {}
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["engine_role"] = "WORKER"
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = ClusterEngine(None, args.model)
    return launch

def worker(): role = "WORKER" _envs = envs.load_yaml(args.model) run_extras = get_all_inters_from_yaml(args.model, ["train.", "runner."]) trainer_class = run_extras.get( "runner." + _envs["mode"] + ".trainer_class", None) if trainer_class: trainer = trainer_class else: trainer = "GeneralTrainer" executor_mode = "train" distributed_strategy = run_extras.get( "runner." + _envs["mode"] + ".distribute_strategy", "async") selected_gpus = run_extras.get( "runner." + _envs["mode"] + ".selected_gpus", "0") fleet_mode = run_extras.get("runner." + _envs["mode"] + ".fleet_mode", "ps") cluster_envs = {} cluster_envs["selected_gpus"] = selected_gpus cluster_envs["fleet_mode"] = fleet_mode cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.executor_mode"] = executor_mode cluster_envs["train.trainer.engine"] = "cluster" cluster_envs["train.trainer.strategy"] = distributed_strategy cluster_envs["train.trainer.threads"] = envs.get_runtime_environ( "CPU_NUM") cluster_envs["train.trainer.platform"] = envs.get_platform() print("launch {} engine with cluster to with model: {}".format( trainer, args.model)) set_runtime_envs(cluster_envs, args.model) trainer = TrainerFactory.create(args.model) return trainer
def __init__(self, config=None):
    self._status_processor = {}
    self.model = None
    self.inference_models = []
    self.increment_models = []
    self._exector_context = {}
    self._context = {'status': 'uninit', 'is_exit': False}
    self._context["config_yaml"] = config

    self._model = {}
    self._dataset = {}

    self._runner_name = envs.get_runtime_environ("mode")
    self._context["runner_name"] = self._runner_name

    phase_names = envs.get_global_env(
        "runner." + self._runner_name + ".phases", None)
    _config = envs.load_yaml(config)

    self._context["env"] = _config
    self._context["dataset"] = _config.get("dataset")
    phases = []
    if phase_names is None:
        phases = _config.get("phase")
    else:
        for phase in _config.get("phase"):
            if phase["name"] in phase_names:
                phases.append(phase)

    self._context["phases"] = phases
    print("PaddleRec: Runner {} Begin".format(self._runner_name))
    self.which_engine()
    self.which_device()
    self.which_fleet_mode()
    self.which_executor_mode()
    self.legality_check()

def local_mpi_engine(args):
    print("launch mpi engine to run model: {}".format(args.model))

    from paddlerec.core.engine.local_mpi import LocalMPIEngine

    print("use 1X1 MPI ClusterTraining at localhost to run model: {}".format(
        args.model))

    mpi = util.run_which("mpirun")
    if not mpi:
        raise RuntimeError("can not find mpirun, please check environment")

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    distributed_strategy = "async"
    executor_mode = "train"

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")

    cluster_envs = {}
    cluster_envs["mpirun"] = mpi
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalMPIEngine(cluster_envs, args.model)
    return launch

def dataloader_by_name(readerclass,
                       dataset_name,
                       yaml_file,
                       context,
                       reader_class_name="Reader"):
    reader_class = lazy_instance_by_fliename(readerclass, reader_class_name)

    name = "dataset." + dataset_name + "."
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    hidden_file_list, files = check_filelist(
        hidden_file_list=[], data_file_list=[], train_data_path=data_path)
    if hidden_file_list:
        print(
            "Warning: please make sure there are no hidden files in the dataset folder; check these hidden files: {}"
            .format(hidden_file_list))

    files.sort()

    # for local cluster: discard some files if they cannot be divided
    # equally between GPUs
    if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ:
        selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
        discard_file_nums = len(files) % selected_gpu_nums
        if discard_file_nums != 0:
            warnings.warn(
                "Because files cannot be divided equally between GPUs, discard these files: {}"
                .format(files[-discard_file_nums:]))
            files = files[:len(files) - discard_file_nums]

    need_split_files = False
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        # for local cluster: split files for multi process
        need_split_files = True
    elif context["engine"] == EngineMode.CLUSTER and context[
            "cluster_type"] == "K8S":
        # for k8s mount mode, split files for every node
        need_split_files = True
    print("need_split_files: {}".format(need_split_files))
    if need_split_files:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
    context["file_list"] = files

    reader = reader_class(yaml_file)
    reader.init()

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    sample_iter = reader.generate_sample(line)
                    for parsed_line in sample_iter():
                        if parsed_line is None:
                            continue
                        values = [parsed[1] for parsed in parsed_line]
                        yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    if hasattr(reader, "batch_tensor_creator"):
        return reader.batch_tensor_creator(gen_reader)
    return gen_reader

def local_cluster_engine(args):
    def get_worker_num(run_extras, workers):
        _envs = envs.load_yaml(args.model)
        mode = envs.get_runtime_environ("mode")
        workspace = envs.get_runtime_environ("workspace")
        phases_class = ".".join(["runner", mode, "phases"])
        phase_names = run_extras.get(phases_class)
        phases = []
        all_phases = _envs.get("phase")
        if phase_names is None:
            phases = all_phases
        else:
            for phase in all_phases:
                if phase["name"] in phase_names:
                    phases.append(phase)

        dataset_names = []
        for phase in phases:
            dataset_names.append(phase["dataset_name"])

        datapaths = []
        for dataset in _envs.get("dataset"):
            if dataset["name"] in dataset_names:
                datapaths.append(dataset["data_path"])

        if not datapaths:
            raise ValueError("data path must exist for training/inference")

        datapaths = [
            envs.workspace_adapter_by_specific(path, workspace)
            for path in datapaths
        ]
        all_workers = [len(os.listdir(path)) for path in datapaths]
        all_workers.append(workers)
        max_worker_num = min(all_workers)

        if max_worker_num >= workers:
            return workers

        print(
            "phases do not have enough data for training, set worker/gpu cards num from {} to {}"
            .format(workers, max_worker_num))
        return max_worker_num

    from paddlerec.core.engine.local_cluster import LocalClusterEngine

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])
    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    executor_mode = "train"

    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)

    device = device.upper()
    fleet_mode = fleet_mode.upper()

    cluster_envs = {}

    # TODO: remove the following hard-coded fleet_mode once paddle
    # supports ps-gpu
    if device == "CPU":
        fleet_mode = "PS"
    elif device == "GPU":
        fleet_mode = "COLLECTIVE"

    if fleet_mode == "PS" and device != "CPU":
        raise ValueError("PS can not be used with GPU")

    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError("COLLECTIVE can not be used without GPU")

    if fleet_mode == "PS":
        worker_num = get_worker_num(run_extras, worker_num)

    if fleet_mode == "COLLECTIVE":
        gpus = selected_gpus.split(",")
        worker_num = get_worker_num(run_extras, len(gpus))
        cluster_envs["selected_gpus"] = ','.join(gpus[:worker_num])

    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["start_port"] = envs.find_free_port()
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["log_dir"] = "logs"
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = "2"
    cluster_envs["CPU_NUM"] = cluster_envs["train.trainer.threads"]
    cluster_envs["train.trainer.engine"] = "local_cluster"
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    launch = LocalClusterEngine(cluster_envs, args.model)
    return launch
