def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError:  # scontrol failed
                raise
            except FileNotFoundError:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    args.device_id = args.local_rank
    print('| initialized host {} as rank {} and device id {}'.format(
        socket.gethostname(), args.distributed_rank, args.device_id))
    single_process_main(args)
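distributed_utils.distributed_init(args) is not shown in these examples. The sketch below is a minimal, hedged guess at what such an init typically does with the fields populated above, assuming torch.distributed; the NCCL backend and the helper name are assumptions, not the library's actual implementation.

import torch
import torch.distributed as dist


def distributed_init_sketch(args):
    """Hedged sketch: set up the default process group from the args fields
    filled in above (init method, world size, rank, device id)."""
    # Backend choice is an assumption; NCCL is common for multi-GPU training.
    dist.init_process_group(
        backend='nccl',
        init_method=args.distributed_init_method,
        world_size=args.distributed_world_size,
        rank=args.distributed_rank,
    )
    # Pin this process to its local GPU before any CUDA work happens.
    torch.cuda.set_device(args.device_id)
    return dist.get_rank()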
Example #2
def run(args, error_queue):
    try:
        args.distributed_rank = distributed_utils.distributed_init(args)
        single_process_main(args)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
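The run() worker above is meant to be launched by a parent process that collects failures through error_queue; that parent is not shown here. A minimal sketch of what it might look like, assuming torch.multiprocessing (one process per rank, first child traceback re-raised in the parent):

import copy

import torch.multiprocessing as multiprocessing


def spawn_workers_sketch(args):
    """Hedged sketch of the parent side that pairs with run()."""
    # 'spawn' avoids inheriting CUDA state from the parent process.
    ctx = multiprocessing.get_context('spawn')
    error_queue = ctx.SimpleQueue()

    procs = []
    for rank in range(args.distributed_world_size):
        child_args = copy.deepcopy(args)
        child_args.distributed_rank = rank
        child_args.device_id = rank
        procs.append(ctx.Process(target=run, args=(child_args, error_queue)))
        procs[-1].start()

    for p in procs:
        p.join()

    # Surface the first child failure, if any, once all workers have exited.
    if not error_queue.empty():
        rank, tb = error_queue.get()
        raise RuntimeError('worker {} failed:\n{}'.format(rank, tb))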
Example #3
def run(args, error_queue):
    try:
        args.distributed_rank = distributed_utils.distributed_init(args)
        single_process_main(args)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
Example #4
def main(args):
    with open('node_to_rank.json', 'r') as f:
        node_to_rank = json.load(f)
    args.master_addr = {v: k for k, v in node_to_rank.items()}[0]
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = '10000'
    host = socket.gethostbyname(socket.gethostname())
    args.distributed_port = '10000'
    print("master", args.master_addr, "host", host,
          os.environ.get('SLURM_JOB_NODELIST'))
    print("OMPI_COMM_WORLD_SIZE", os.environ["OMPI_COMM_WORLD_SIZE"])
    print("OMPI_COMM_WORLD_RANK", os.environ["OMPI_COMM_WORLD_RANK"])
    print("OMPI_COMM_WORLD_LOCAL_RANK",
          os.environ["OMPI_COMM_WORLD_LOCAL_RANK"], args.device_id)
    exp_id = args.master_addr
    args.distributed_init_method = "file:///shared/share/" + (exp_id)
    print('| initialized host {} as rank {}'.format(
        socket.gethostbyname(socket.gethostname()), args.distributed_rank))
    single_process_main(0, args)
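Unlike the TCP rendezvous in the other examples, this variant points distributed_init_method at a file on a shared filesystem and reads its ranks from the Open MPI environment. A minimal sketch of how that combination is typically consumed, assuming torch.distributed (the /shared/share/ path comes from the example above; everything else is an assumption):

import os

import torch.distributed as dist


def file_init_sketch(args):
    """Hedged sketch: rendezvous through a shared file, with rank and world
    size taken from the Open MPI environment printed above."""
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    # Every process must see the same path, e.g. file:///shared/share/<exp_id>;
    # the rendezvous file is created by whichever process arrives first.
    dist.init_process_group(
        backend='nccl',
        init_method=args.distributed_init_method,
        world_size=world_size,
        rank=rank,
    )
    return rank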
Example #5
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError:  # scontrol failed
                raise
            except FileNotFoundError:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
    single_process_main(args)
Example #6
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                if args.distributed_backend == 'nccl' and args.dist_avg == 'allreduce':
                    hostnames = subprocess.check_output(
                        ['scontrol', 'show', 'hostnames', node_list])
                    args.distributed_init_method = 'tcp://{host}:{port}'.format(
                        host=hostnames.split()[0].decode('utf-8'),
                        port=args.distributed_port)
                    args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                    args.device_id = int(os.environ.get(
                        'SLURM_LOCALID')) % torch.cuda.device_count()
                else:
                    args.master_addr = os.environ['HOSTNAME']
                    if args.distributed_backend == 'mpi':
                        if args.dist_process:
                            args.distributed_rank = int(
                                os.environ['OMPI_COMM_WORLD_RANK'])
                            #print (int(os.environ['OMPI_COMM_WORLD_RANK']), int(os.environ['OMPI_COMM_WORLD_NODE_RANK']), os.environ['OMPI_COMM_WORLD_LOCAL_RANK'], int(os.environ['OMPI_UNIVERSE_SIZE']))
                            #args.distributed_rank = int(os.environ['OMPI_COMM_WORLD_NODE_RANK'])
                            args.device_id = int(
                                os.environ['OMPI_COMM_WORLD_LOCAL_RANK']
                            ) % torch.cuda.device_count()
                        else:
                            args.distributed_rank = int(
                                os.environ['OMPI_COMM_WORLD_RANK'])
                            args.distributed_world_size = int(
                                os.environ['OMPI_UNIVERSE_SIZE'])
                            args.device_id = int(
                                os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
                    else:
                        if args.dist_process:
                            args.distributed_rank = int(
                                os.environ.get('SLURM_PROCID'))
                            args.device_id = int(
                                os.environ.get('SLURM_LOCALID')
                            ) % torch.cuda.device_count()
                        else:
                            args.distributed_rank = int(
                                os.environ['SLURM_PROCID'])
                            args.distributed_world_size = int(
                                os.environ['SLURM_NTASKS'])
                            args.device_id = int(os.environ['SLURM_LOCALID'])
                    args.distributed_init_method = 'tcp://{host}:{port}'.format(
                        host=args.master_addr, port=args.distributed_port)

            except subprocess.CalledProcessError:  # scontrol failed
                raise
            except FileNotFoundError:  # Slurm is not installed
                pass
    if args.distributed_init_method is None and args.distributed_port is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(),
                                                    args.distributed_rank))
    print('init:', args.distributed_rank, args.distributed_world_size,
          args.distributed_init_method)

    single_process_main(args)
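The branching above resolves the same three quantities (global rank, world size, local GPU) from either the Open MPI or the Slurm environment, depending on the backend and launch mode. A standalone sketch of that mapping, assuming the same environment variables (the helper name is hypothetical, and the MPI world size is read from OMPI_COMM_WORLD_SIZE rather than OMPI_UNIVERSE_SIZE):

import os

import torch


def resolve_ranks_sketch():
    """Hedged sketch: return (rank, world_size, device_id) from either the
    Open MPI or the Slurm environment, mirroring the branches above."""
    if 'OMPI_COMM_WORLD_RANK' in os.environ:
        rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    else:
        rank = int(os.environ['SLURM_PROCID'])
        world_size = int(os.environ['SLURM_NTASKS'])
        local_rank = int(os.environ['SLURM_LOCALID'])
    # Fold the local rank onto the visible GPUs, as the example does.
    device_id = local_rank % torch.cuda.device_count()
    return rank, world_size, device_id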