def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """
    args_node_ips, args_node_ip: string
    """
    # IP info can be obtained automatically when using paddlecloud multi-node mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning(
            "Please NOTE: When using paddlecloud, node_ip is "
            "automatically obtained from POD_IP. Your input node_ip: {} doesn't equal "
            "node_ip: {} from the paddlecloud environment.".format(args_node_ip, node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically obtained from PADDLE_TRAINERS (multi node) or POD_IP (single node). "
            "Your input cluster_node_ips: {} doesn't equal IPs: {} from "
            "the paddlecloud environment.".format(args_node_ips, node_ips))

    started_port = args_port
    print("num_nodes:", num_nodes)
    if num_nodes > 1:
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))

            # Prefer the cloud-specified port when enough ports were reserved.
            if paddle_port_num >= len(selected_gpus) and paddle_port != args_port:
                logger.warning("Use Cloud specified port:{}.".format(paddle_port))
                started_port = paddle_port

        except Exception as e:
            print(e)
            pass

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args: node_ips:{} "
                 "node_ip:{} node_rank:{} started_port:{}".format(
                     node_ips, node_ip, node_rank, started_port))

    # One consecutive port per selected GPU, starting at started_port.
    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]
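# Illustrative sketch (not part of the original module): the port layout the
# function above produces for a single local node. The concrete values below
# (default start port 6170, four GPUs) are assumptions taken from the code above.
def _example_port_layout():
    started_port = 6170
    selected_gpus = [0, 1, 2, 3]  # hypothetical GPU ids
    node_ip = "127.0.0.1"

    ports = list(range(started_port, started_port + len(selected_gpus)))
    endpoints = ["%s:%d" % (node_ip, p) for p in ports]
    # -> ['127.0.0.1:6170', '127.0.0.1:6171', '127.0.0.1:6172', '127.0.0.1:6173']
    return endpoints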
def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]

    # Validates that node_ip belongs to the cluster; raises ValueError otherwise.
    node_ips.index(node_ip)

    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)

    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
def get_cluster_from_args(selected_gpus):
    cluster_node_ips = '127.0.0.1'
    node_ip = '127.0.0.1'

    node_ips = [x.strip() for x in cluster_node_ips.split(',')]

    # Validates that node_ip belongs to the cluster; raises ValueError otherwise.
    node_ips.index(node_ip)

    free_ports = find_free_ports(len(selected_gpus))
    if free_ports is not None:
        free_ports = list(free_ports)

    # Build one "ip:port" endpoint list per node.
    trainer_endpoints = []
    for ip in node_ips:
        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
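# Illustrative sketch (not the actual find_free_ports used above): both variants
# rely on a free-port finder, which is assumed here to simply ask the OS for the
# requested number of ephemeral ports by binding temporary sockets to port 0.
import socket
from contextlib import closing

def _example_find_free_ports(num):
    """Return a set of `num` ports that were free at probe time (inherently racy)."""
    ports = set()
    while len(ports) < num:
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("", 0))  # port 0 lets the kernel choose a free port
            ports.add(s.getsockname()[1])
    return ports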
def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """
    args_node_ips: string, args_node_ip: string, args_port: int, selected_gpus: list
    """
    # IP info can be obtained automatically when using paddlecloud multi-node mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    paddle_ports_num = os.getenv("TRAINER_PORTS_NUM")
    assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None"
    paddle_ports_num = int(paddle_ports_num)

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning(
            "Please NOTE: When using paddlecloud, node_ip is "
            "automatically obtained from POD_IP. Your input node_ip: {} doesn't equal "
            "node_ip: {} from the paddlecloud environment.".format(args_node_ip, node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is "
            "automatically obtained from PADDLE_TRAINERS (multi node) or POD_IP (single node). "
            "Your input cluster_node_ips: {} doesn't equal IPs: {} from "
            "the paddlecloud environment.".format(args_node_ips, node_ips))

    # DISTRIBUTED_TRAINER_ENDPOINTS: new environment variable since paddlecloud 1.8.4,
    # e.g. DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8"
    trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS")
    if trainer_endpoints is None:
        started_port = args_port
        if num_nodes > 1:
            try:
                paddle_port = int(os.getenv("PADDLE_PORT", ""))

                # Prefer the cloud-specified port when enough ports were reserved.
                if paddle_ports_num >= len(selected_gpus) and paddle_port != args_port:
                    logger.warning("Use Cloud specified port:{}.".format(paddle_port))
                    started_port = paddle_port

            except Exception as e:
                print(e)
                pass

        if started_port is None:
            started_port = 6170
        ports = [
            x for x in range(started_port, started_port + len(selected_gpus))
        ]
        trainer_endpoints = []
        for ip in node_ips:
            trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports])
    else:
        # Split the flat "ip:port,..." string into one endpoint list per node.
        trainer_endpoints_ori = trainer_endpoints.split(",")
        trainer_endpoints = []
        assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori)
        for i in range(num_nodes):
            trainer_endpoints.append(trainer_endpoints_ori[
                i * paddle_ports_num:(i + 1) * paddle_ports_num])

    logger.debug("parsed from args: node_ips:{} "
                 "node_ip:{} node_rank:{} trainer_endpoints:{}".format(
                     node_ips, node_ip, node_rank, trainer_endpoints))

    cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
                               selected_gpus)
    return cluster, cluster.pods[node_rank]
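# Illustrative sketch (not part of the original module): how the flat
# DISTRIBUTED_TRAINER_ENDPOINTS string is sliced into one endpoint list per node
# in the branch above. The endpoint values below are made up for the example.
def _example_split_endpoints():
    endpoints = "ip1:6170,ip1:6171,ip2:6170,ip2:6171"  # hypothetical value
    num_nodes = 2
    paddle_ports_num = 2  # would come from TRAINER_PORTS_NUM

    flat = endpoints.split(",")
    assert num_nodes * paddle_ports_num == len(flat)
    per_node = [
        flat[i * paddle_ports_num:(i + 1) * paddle_ports_num]
        for i in range(num_nodes)
    ]
    # -> [['ip1:6170', 'ip1:6171'], ['ip2:6170', 'ip2:6171']]
    return per_node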