def start_worker_procs(self):
    if (envs.get_runtime_environ("fleet_mode") == "COLLECTIVE"):
        #trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        if cuda_visible_devices is None or cuda_visible_devices == "":
            selected_gpus = range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
        else:
            # change selected_gpus into relative values
            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
            # therefore selected_gpus=0,1,2,3
            cuda_visible_devices_list = cuda_visible_devices.split(',')
            for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT"))):
                # CUDA_VISIBLE_DEVICES entries are strings, so compare str(x)
                assert str(x) in cuda_visible_devices_list, "Can't find "\
                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                    % (x, cuda_visible_devices)
            # collect all relative indices, not just the last one
            selected_gpus = [
                cuda_visible_devices_list.index(str(x))
                for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
            ]
        print("selected_gpus:{}".format(selected_gpus))

        factory = "paddlerec.core.factory"
        cmd = [sys.executable, "-u", "-m", factory, self.trainer]
        logs_dir = envs.get_runtime_environ("log_dir")

        print("use_paddlecloud_flag:{}".format(
            cluster_utils.use_paddlecloud()))
        if cluster_utils.use_paddlecloud():
            cluster, pod = cluster_utils.get_cloud_cluster(selected_gpus)
            logger.info("get cluster from cloud:{}".format(cluster))
            procs = cluster_utils.start_local_trainers(
                cluster, pod, cmd, log_dir=logs_dir)
            print("cluster:{}".format(cluster))
            print("pod:{}".format(pod))
    else:
        trainer = TrainerFactory.create(self.trainer)
        trainer.run()
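
# Illustrative sketch only: the remapping described by the comment above, i.e.
# translating physical GPU ids into indices relative to CUDA_VISIBLE_DEVICES.
# The helper name `_relative_gpu_ids` is hypothetical and not part of the engine.
def _relative_gpu_ids(selected_gpus, cuda_visible_devices):
    """Map physical GPU ids to their 0-based positions in CUDA_VISIBLE_DEVICES."""
    visible = [d.strip() for d in cuda_visible_devices.split(",")]
    return [visible.index(str(g).strip()) for g in selected_gpus]

# e.g. _relative_gpu_ids([4, 5, 6, 7], "4,5,6,7") -> [0, 1, 2, 3]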
def single_infer_engine(args):
    trainer = get_trainer_prefix(args) + "SingleInfer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "single_infer"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def cluster_mpi_engine(args):
    print("launch cluster engine with cluster to run model: {}".format(
        args.model))

    cluster_envs = {}
    cluster_envs["train.trainer.trainer"] = "CtrCodingTrainer"
    cluster_envs["train.trainer.platform"] = envs.get_platform()

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def online_learning(args):
    trainer = "OnlineLearningTrainer"
    single_envs = {}
    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.engine"] = "online_learning"
    single_envs["train.trainer.platform"] = envs.get_platform()
    print("use {} engine to run model: {}".format(trainer, args.model))

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def worker(mode):
    if not mode:
        raise ValueError("mode: {} can not be recognized".format(mode))

    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    strategy_class = ".".join(["runner", mode, "distribute_strategy"])
    worker_class = ".".join(["runner", mode, "worker_num"])
    server_class = ".".join(["runner", mode, "server_num"])

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    distributed_strategy = run_extras.get(strategy_class, "async")
    worker_num = run_extras.get(worker_class, 1)
    server_num = run_extras.get(server_class, 1)
    executor_mode = "train"

    device = device.upper()
    fleet_mode = fleet_mode.upper()
    if fleet_mode == "COLLECTIVE" and device != "GPU":
        raise ValueError(
            "COLLECTIVE fleet_mode can only be used with device: GPU")

    cluster_envs = {}
    if device == "GPU":
        cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["server_num"] = server_num
    cluster_envs["worker_num"] = worker_num
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
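
# Illustrative sketch only: how the dotted runner keys above are composed and how
# the defaults apply. The mode name "train_runner" and the flattened dict below
# are hypothetical examples, not taken from a real config.
run_extras_example = {
    "runner.train_runner.trainer_class": "GeneralTrainer",
    "runner.train_runner.device": "gpu",
}
mode_example = "train_runner"
device_key = ".".join(["runner", mode_example, "device"])   # "runner.train_runner.device"
device_value = run_extras_example.get(device_key, "cpu")    # -> "gpu"
worker_num_value = run_extras_example.get(
    ".".join(["runner", mode_example, "worker_num"]), 1)    # key absent -> default 1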
def worker():
    role = "WORKER"
    trainer = get_trainer_prefix(args) + "ClusterTrainer"
    cluster_envs = {}
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def single_infer_engine(args):
    run_extras = get_all_inters_from_yaml(args.model, ["runner."])

    mode = envs.get_runtime_environ("mode")
    trainer_class = ".".join(["runner", mode, "trainer_class"])
    fleet_class = ".".join(["runner", mode, "fleet_mode"])
    device_class = ".".join(["runner", mode, "device"])
    selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
    epochs_class = ".".join(["runner", mode, "epochs"])

    epochs = run_extras.get(epochs_class, 1)
    if epochs > 1:
        warnings.warn(
            "It makes no sense to predict the same model for multiple epochs",
            category=UserWarning,
            stacklevel=2)

    trainer = run_extras.get(trainer_class, "GeneralTrainer")
    fleet_mode = run_extras.get(fleet_class, "ps")
    device = run_extras.get(device_class, "cpu")
    selected_gpus = run_extras.get(selected_gpus_class, "0")
    executor_mode = "infer"

    single_envs = {}
    if device.upper() == "GPU":
        selected_gpus_num = len(selected_gpus.split(","))
        if selected_gpus_num != 1:
            raise ValueError(
                "Single Mode Only Support One GPU, Set Local Cluster Mode to use Multi-GPUS"
            )
        single_envs["selected_gpus"] = selected_gpus
        single_envs["FLAGS_selected_gpus"] = selected_gpus

    single_envs["train.trainer.trainer"] = trainer
    single_envs["train.trainer.executor_mode"] = executor_mode
    single_envs["fleet_mode"] = fleet_mode
    single_envs["train.trainer.threads"] = "2"
    single_envs["train.trainer.platform"] = envs.get_platform()
    single_envs["train.trainer.engine"] = "single"

    set_runtime_envs(single_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
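
# Illustrative sketch only: the single-engine GPU check above counts the
# comma-separated entries in `selected_gpus` and rejects more than one. The
# helper name `_is_single_gpu` and the sample values are hypothetical.
def _is_single_gpu(selected_gpus):
    return len(selected_gpus.split(",")) == 1

# _is_single_gpu("0")   -> True   (accepted by single_infer_engine)
# _is_single_gpu("0,1") -> False  (raises: use local cluster mode for multi-GPU)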
def worker():
    role = "WORKER"
    _envs = envs.load_yaml(args.model)
    run_extras = get_all_inters_from_yaml(args.model,
                                          ["train.", "runner."])
    trainer_class = run_extras.get(
        "runner." + _envs["mode"] + ".trainer_class", None)

    if trainer_class:
        trainer = trainer_class
    else:
        trainer = "GeneralTrainer"

    executor_mode = "train"
    distributed_strategy = run_extras.get(
        "runner." + _envs["mode"] + ".distribute_strategy", "async")
    selected_gpus = run_extras.get(
        "runner." + _envs["mode"] + ".selected_gpus", "0")
    fleet_mode = run_extras.get(
        "runner." + _envs["mode"] + ".fleet_mode", "ps")

    cluster_envs = {}
    cluster_envs["selected_gpus"] = selected_gpus
    cluster_envs["fleet_mode"] = fleet_mode
    cluster_envs["train.trainer.trainer"] = trainer
    cluster_envs["train.trainer.executor_mode"] = executor_mode
    cluster_envs["train.trainer.engine"] = "cluster"
    cluster_envs["train.trainer.strategy"] = distributed_strategy
    cluster_envs["train.trainer.threads"] = envs.get_runtime_environ(
        "CPU_NUM")
    cluster_envs["train.trainer.platform"] = envs.get_platform()
    print("launch {} engine with cluster to run model: {}".format(
        trainer, args.model))

    set_runtime_envs(cluster_envs, args.model)
    trainer = TrainerFactory.create(args.model)
    return trainer
def start_worker_procs(self):
    trainer = TrainerFactory.create(self.trainer)
    trainer.run()