import copy
import os
import pathlib
import shutil
import sys
import tempfile
import time

# Note: the helpers referenced below (launch_utils, cloud_utils, ascend_utils,
# logger, DeviceMode, get_gpus, get_cluster_from_args, start_local_trainers,
# watch_local_trainers, enable_elastic, cpuonly_check) are assumed to be
# provided by the surrounding paddle.distributed.fleet launch/launch_utils
# modules.


def launch_collective(args):
    # parse arguments, used for cloud-single-machine and local
    (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainers_num:{} mode:{} devices:{}".format(
        trainers_num, device_mode, devices_per_proc))

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        start_port = os.environ.get('FLAGS_START_PORT')

    if cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(
            args.ips, device_mode, devices_per_proc, start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    else:
        # trainers_num == 1 or not using paddlecloud; ips like "a,b"
        cluster, pod = get_cluster_from_args(args, device_mode,
                                             devices_per_proc)
        logger.debug("get cluster from args:{}".format(cluster))

    global_envs = copy.copy(os.environ.copy())
    gloo_rendezvous_dir = tempfile.mkdtemp()
    # add gloo env
    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir

    procs = start_local_trainers(
        cluster,
        pod,
        training_script=args.training_script,
        training_script_args=args.training_script_args,
        log_dir=args.log_dir,
        envs=global_envs)

    while True:
        alive = watch_local_trainers(procs, cluster.trainers_nranks())

        if not alive:
            logger.info("Local processes completed.")
            logger.debug("POD info:{}".format(pod))
            break

        time.sleep(3)

    if os.path.exists(gloo_rendezvous_dir):
        shutil.rmtree(gloo_rendezvous_dir)
# Earlier GPU-only variant of launch_collective: no device_mode handling and
# no gloo rendezvous setup.
def launch_collective(args):
    # parse arguments, used for cloud-single-machine and local
    gpus = get_gpus(args.gpus)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainers_num:{} gpus:{}".format(
        trainers_num, gpus))

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        start_port = os.environ.get('FLAGS_START_PORT')

    if cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus,
                                                     start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    else:
        # trainers_num == 1 or not using paddlecloud; ips like "a,b"
        cluster, pod = get_cluster_from_args(args, gpus)
        logger.debug("get cluster from args:{}".format(cluster))

    procs = start_local_trainers(
        cluster,
        pod,
        training_script=args.training_script,
        training_script_args=args.training_script_args,
        log_dir=args.log_dir)

    while True:
        alive = watch_local_trainers(procs, cluster.trainers_nranks())

        if not alive:
            logger.info("Local processes completed.")
            logger.debug("POD info:{}".format(pod))
            break

        time.sleep(3)
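# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical CLI entry point showing how launch_collective(args)
# is typically driven. The real fleet launcher builds a much richer argument
# parser; this sketch only defines the fields the snippets above actually read
# from `args` (ips, gpus, log_dir, training_script, training_script_args).
if __name__ == "__main__":
    from argparse import ArgumentParser, REMAINDER

    parser = ArgumentParser(description="minimal collective launch sketch")
    parser.add_argument("--ips", type=str, default="127.0.0.1",
                        help="comma-separated node IPs, e.g. \"a,b\"")
    parser.add_argument("--gpus", type=str, default=None,
                        help="comma-separated GPU ids to use on this node")
    parser.add_argument("--log_dir", type=str, default="log",
                        help="directory for per-trainer logs")
    parser.add_argument("training_script", type=str,
                        help="the training script to launch")
    parser.add_argument("training_script_args", nargs=REMAINDER,
                        help="arguments forwarded to the training script")
    args = parser.parse_args()

    launch_collective(args)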
def get_cluster_info(args):
    # parse arguments, used for cloud-single-machine and local
    if args.backend == 'gloo':
        cpuonly_check(args)
    if args.enable_auto_mapping:
        (device_mode, devices_per_proc) = (DeviceMode.GPU, [])
    else:
        (device_mode,
         devices_per_proc) = launch_utils.get_device_proc_info(args)
    trainers_num = cloud_utils.get_trainers_num()
    logger.debug("parsed from args trainers_num:{} mode:{} devices:{}".format(
        trainers_num, device_mode, devices_per_proc))

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")

    cluster = None
    pod = None

    start_port = 6170
    if os.environ.get('FLAGS_START_PORT') is not None:
        start_port = os.environ.get('FLAGS_START_PORT')

    # auto mapping between processes and devices for auto-parallel
    if args.enable_auto_mapping:
        assert args.cluster_topo_path is not None, \
            "The cluster topology must be provided when enabling auto mapping."
        rank_mapping_path = args.rank_mapping_path or os.getenv(
            "PADDLE_RANK_MAPPING_PATH")
        if not rank_mapping_path:
            os.environ["PADDLE_NEED_RANK_MAPPING"] = str(True)
            os.environ["PADDLE_ENABLE_ELASTIC"] = str(
                enable_elastic(args, device_mode))

            cwd = pathlib.Path().resolve()
            rank_mapping_path = os.path.join(cwd,
                                             "auto_parallel_rank_mapping.json")
            os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path)

            original_args = sys.argv[1:]
            os.environ["PADDLE_ORIGINAL_CMD_ARGS"] = " ".join(original_args)
            os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path)
            os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str(
                args.enable_auto_mapping)
            cluster, pod = launch_utils.get_mapped_cluster_from_args_without_rank_mapping(
                args, device_mode)
        else:
            os.environ["PADDLE_NEED_RANK_MAPPING"] = str(False)
            os.environ["PADDLE_ENABLE_ELASTIC"] = str(
                enable_elastic(args, device_mode))

            os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path)
            os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path)
            os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str(
                args.enable_auto_mapping)
            cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping(
                args, device_mode)
    elif cloud_utils.use_paddlecloud() and trainers_num != 1:
        cluster, pod = cloud_utils.get_cloud_cluster(
            args.ips, device_mode, devices_per_proc, start_port)
        logger.debug("get cluster from cloud:{}".format(cluster))
    elif device_mode == DeviceMode.ASCEND_NPU:
        # for ascend
        cluster, pod = ascend_utils.get_cloud_cluster(
            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
            device_mode=device_mode,
            start_port=start_port)
    else:
        # trainers_num == 1 or not using paddlecloud; ips like "a,b"
        cluster, pod = get_cluster_from_args(args, device_mode,
                                             devices_per_proc)
        logger.debug("get cluster from args:{}".format(cluster))
    return cluster, pod
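# --- Usage sketch (not part of the original module) ---
# A hedged sketch of how the (cluster, pod) pair returned by get_cluster_info
# can be consumed, mirroring the flow of the first launch_collective variant
# above: start the local trainer processes, poll them until they exit, then
# clean up the temporary gloo rendezvous directory. The function name
# run_with_cluster_info is hypothetical, and `args` is assumed to carry the
# same fields used above (training_script, training_script_args, log_dir, ...).
def run_with_cluster_info(args):
    cluster, pod = get_cluster_info(args)

    # Build the trainer environment, including the gloo rendezvous settings
    # used by the newer launch_collective variant above.
    global_envs = copy.copy(os.environ.copy())
    gloo_rendezvous_dir = tempfile.mkdtemp()
    global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3"
    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir

    procs = start_local_trainers(
        cluster,
        pod,
        training_script=args.training_script,
        training_script_args=args.training_script_args,
        log_dir=args.log_dir,
        envs=global_envs)

    # Poll the local trainers every 3 seconds until they have all finished.
    while True:
        alive = watch_local_trainers(procs, cluster.trainers_nranks())
        if not alive:
            logger.info("Local processes completed.")
            break
        time.sleep(3)

    if os.path.exists(gloo_rendezvous_dir):
        shutil.rmtree(gloo_rendezvous_dir)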