def _setup_misc(self):
    # misc setup components that were in gossip_sgd
    config = self.config
    state = {}
    update_state(state, {
        'epoch': 0, 'itr': 0, 'best_prec1': 0, 'is_best': True,
        'state_dict': self.model.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'elapsed_time': 0,
        'batch_meter': Meter(ptag='Time').__dict__,
        'data_meter': Meter(ptag='Data').__dict__,
        'nn_meter': Meter(ptag='Forward/Backward').__dict__
    })
    self.state = state

    # module used to relaunch jobs and handle external termination signals
    ClusterManager.set_checkpoint_dir(config['checkpoint_dir'])
    self.cmanager = ClusterManager(rank=config['rank'],
                                   world_size=config['world_size'],
                                   model_tag=config['tag'],
                                   state=state,
                                   all_workers=config['checkpoint_all'])

    # enable cuDNN auto-tuning of convolution algorithms
    cudnn.benchmark = True

    self.batch_meter = Meter(state['batch_meter'])
    self.data_meter = Meter(state['data_meter'])
    self.nn_meter = Meter(state['nn_meter'])

    # initialize log file
    if not os.path.exists(config['out_fname']):
        with open(config['out_fname'], 'w') as f:
            print('BEGIN-TRAINING\n'
                  'World-Size,{ws}\n'
                  'Num-DLWorkers,{nw}\n'
                  'Batch-Size,{bs}\n'
                  'Epoch,itr,BT(s),avg:BT(s),std:BT(s),'
                  'NT(s),avg:NT(s),std:NT(s),'
                  'DT(s),avg:DT(s),std:DT(s),'
                  'Loss,avg:Loss,Prec@1,avg:Prec@1,Prec@5,avg:Prec@5,val'
                  .format(ws=config['world_size'],
                          nw=config['num_dataloader_workers'],
                          bs=config['batch_size']),
                  file=f)

    self.start_itr = state['itr']
    self.start_epoch = state['epoch']
    self.elapsed_time = state['elapsed_time']
    self.begin_time = time.time() - state['elapsed_time']
    self.best_val_prec1 = 0
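
# NOTE: `_setup_misc` relies on an `update_state` helper defined elsewhere in
# this repo. The sketch below shows its assumed behavior (an in-place merge of
# the update dict into the checkpoint-state dict). The name
# `_update_state_sketch` is hypothetical; the real helper may differ.
def _update_state_sketch(state, update):
    """Illustrative stand-in for update_state: in-place dict merge."""
    for key, value in update.items():
        state[key] = value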
def parse_args():
    """
    Set env-vars and global args

        rank           <-- $SLURM_PROCID
        world_size     <-- $SLURM_NTASKS
        master address <-- $SLURM_NODENAME of rank 0 process (or $HOSTNAME)
        master port    <-- any free port (doesn't really matter)
    """
    class DataStore():
        def __init__(self):
            self.all_reduce = 'False'
            self.batch_size = 32
            self.lr = 0.1
            self.num_dataloader_workers = 10
            self.num_epochs = 90
            self.num_iterations_per_training_epoch = None
            self.momentum = 0.9
            self.weight_decay = 1e-4
            self.push_sum = 'True'
            self.graph_type = 5
            self.mixing_strategy = 0
            self.schedule = None
            self.peers_per_itr_schedule = None
            self.overlap = 'False'
            self.synch_freq = 0
            self.warmup = 'False'
            self.seed = 47
            self.print_freq = 10
            self.checkpoint_all = 'False'
            self.overwrite_checkpoints = 'True'
            self.master_port = '40100'
            self.checkpoint_dir = "./"
            self.network_interface_type = 'infiniband'
            self.num_itr_ignore = 10
            # self.dataset_dir = "./data/"
            self.no_cuda_streams = None
            self.master_addr = None
            self.backend = 'nccl'
            self.rank = 1
            self.world_size = 5
            self.tag = ''
            self.out_fname = ''
            self.resume = 'False'
            self.verbose = 'True'
            self.train_fast = 'False'
            self.nesterov = 'False'

    args = DataStore()  # hard-coded stand-in for parser.parse_args()
    ClusterManager.set_checkpoint_dir(args.checkpoint_dir)

    # rank and world_size need to be changed depending on the scheduler being
    # used to run the distributed jobs
    args.master_addr = os.environ['HOSTNAME']
    if args.backend == 'mpi':
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_UNIVERSE_SIZE'])
    else:
        args.rank = 1        # int(os.environ['SLURM_PROCID'])
        args.world_size = 5  # int(os.environ['SLURM_NTASKS'])
    args.out_fname = ClusterManager.CHECKPOINT_DIR \
        + args.tag \
        + 'out_r' + str(args.rank) \
        + '_n' + str(args.world_size) \
        + '.csv'

    # coerce 'True'/'False' string flags to booleans
    args.resume = (args.resume == 'True')
    args.verbose = (args.verbose == 'True')
    args.train_fast = (args.train_fast == 'True')
    args.nesterov = (args.nesterov == 'True')
    args.checkpoint_all = (args.checkpoint_all == 'True')
    args.warmup = (args.warmup == 'True')
    args.overlap = (args.overlap == 'True')
    args.push_sum = (args.push_sum == 'True')
    args.all_reduce = (args.all_reduce == 'True')
    args.cpu_comm = (args.backend == 'gloo' and not args.push_sum
                     and not args.all_reduce)
    args.comm_device = torch.device('cpu') if args.cpu_comm \
        else torch.device('cuda')
    args.overwrite_checkpoints = (args.overwrite_checkpoints == 'True')

    # parse lr schedule (epoch, scaling-factor) pairs
    args.lr_schedule = {}
    if args.schedule is None:
        args.schedule = [30, 0.1, 60, 0.1, 80, 0.1]
    i, epoch = 0, None
    for v in args.schedule:
        if i == 0:
            epoch = v
        elif i == 1:
            args.lr_schedule[epoch] = v
        i = (i + 1) % 2
    del args.schedule

    # parse peers per itr sched (epoch, num_peers) pairs
    args.ppi_schedule = {}
    if args.peers_per_itr_schedule is None:
        args.peers_per_itr_schedule = [0, 1]
    i, epoch = 0, None
    for v in args.peers_per_itr_schedule:
        if i == 0:
            epoch = v
        elif i == 1:
            args.ppi_schedule[epoch] = v
        i = (i + 1) % 2
    del args.peers_per_itr_schedule
    # must specify how many peers to communicate with from the start of training
    assert 0 in args.ppi_schedule

    if args.all_reduce:
        assert args.graph_type == -1

    if args.backend == 'gloo':
        assert args.network_interface_type == 'ethernet'
        os.environ['GLOO_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=args.network_interface_type)
    elif args.network_interface_type == 'ethernet':
        if args.backend == 'nccl':
            os.environ['NCCL_SOCKET_IFNAME'] = get_tcp_interface_name(
                network_interface_type=args.network_interface_type)
            os.environ['NCCL_IB_DISABLE'] = '1'
        else:
            raise NotImplementedError

    # initialize torch distributed backend
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    dist.init_process_group(backend=args.backend,
                            world_size=args.world_size,
                            rank=args.rank)

    args.graph, args.mixing = None, None
    graph_class = GRAPH_TOPOLOGIES[args.graph_type]
    if graph_class:
        # dist.barrier() ensures the NCCL communicator is created now; this
        # prevents an error that can occur when different processes create
        # the communicator more than 5 minutes apart
        dist.barrier()
        args.graph = graph_class(args.rank, args.world_size,
                                 peers_per_itr=args.ppi_schedule[0])

    mixing_class = MIXING_STRATEGIES[args.mixing_strategy]
    if mixing_class and args.graph:
        args.mixing = mixing_class(args.graph, args.comm_device)

    return args
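
# NOTE: both parse_args variants decode a flat [epoch, value, epoch, value, ...]
# list into an {epoch: value} dict. The hypothetical helper below implements
# the same pattern more compactly; it is an illustration, not part of the
# original module.
def _parse_pair_schedule_sketch(flat_schedule):
    """e.g. [30, 0.1, 60, 0.1, 80, 0.1] -> {30: 0.1, 60: 0.1, 80: 0.1}"""
    return dict(zip(flat_schedule[0::2], flat_schedule[1::2]))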
def parse_args():
    """
    Set env-vars and global args

        rank           <-- $SLURM_PROCID
        world_size     <-- $SLURM_NTASKS
        master address <-- $SLURM_NODENAME of rank 0 process (or $HOSTNAME)
        master port    <-- any free port (doesn't really matter)
    """
    args = parser.parse_args()
    ClusterManager.set_checkpoint_dir(args.checkpoint_dir)

    # rank, world_size, and local device id are provided by the scheduler
    # used to launch the distributed jobs (Open MPI or SLURM)
    args.master_addr = os.environ['HOSTNAME']
    if args.backend == 'mpi':
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_UNIVERSE_SIZE'])
        args.device_id = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    else:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.world_size = int(os.environ['SLURM_NTASKS'])
        args.device_id = int(os.environ['SLURM_LOCALID'])
    args.out_fname = ClusterManager.CHECKPOINT_DIR \
        + args.tag \
        + 'out_r' + str(args.rank) \
        + '_n' + str(args.world_size) \
        + '.csv'

    # coerce 'True'/'False' string flags to booleans
    args.resume = (args.resume == 'True')
    args.verbose = (args.verbose == 'True')
    args.train_fast = (args.train_fast == 'True')
    args.nesterov = (args.nesterov == 'True')
    args.checkpoint_all = (args.checkpoint_all == 'True')
    args.warmup = (args.warmup == 'True')
    args.cpu_comm = (args.backend == 'gloo')
    args.comm_device = torch.device('cpu') if args.cpu_comm \
        else torch.device('cuda')
    args.overlap = (args.overlap == 'True')
    args.push_sum = (args.push_sum == 'True')
    args.all_reduce = (args.all_reduce == 'True')
    args.bilat = (args.bilat == 'True')

    args.global_epoch = None
    args.global_itr = None
    # rank 0 removes any stale shared file; all ranks busy-wait until it is gone
    if args.rank == 0 and os.path.isfile(args.shared_fpath):
        os.remove(args.shared_fpath)
    while os.path.isfile(args.shared_fpath):
        pass

    # parse lr schedule (epoch, scaling-factor) pairs
    args.lr_schedule = {}
    if args.schedule is None:
        args.schedule = [30, 0.1, 60, 0.1, 80, 0.1]
    i, epoch = 0, None
    for v in args.schedule:
        if i == 0:
            epoch = v
        elif i == 1:
            args.lr_schedule[epoch] = v
        i = (i + 1) % 2
    del args.schedule

    # parse peers per itr sched (epoch, num_peers) pairs
    args.ppi_schedule = {}
    if args.peers_per_itr_schedule is None:
        args.peers_per_itr_schedule = [0, 1]
    i, epoch = 0, None
    for v in args.peers_per_itr_schedule:
        if i == 0:
            epoch = v
        elif i == 1:
            args.ppi_schedule[epoch] = v
        i = (i + 1) % 2
    del args.peers_per_itr_schedule
    # must specify how many peers to communicate with from the start of training
    assert 0 in args.ppi_schedule

    if args.backend == 'gloo':
        assert args.network_interface_type == 'ethernet'
        os.environ['GLOO_SOCKET_IFNAME'] = get_tcp_interface_name(
            network_interface_type=args.network_interface_type)
    elif args.network_interface_type == 'ethernet':
        if args.backend == 'nccl':
            os.environ['NCCL_SOCKET_IFNAME'] = get_tcp_interface_name(
                network_interface_type=args.network_interface_type)
            os.environ['NCCL_IB_DISABLE'] = '1'
        else:
            raise NotImplementedError

    # initialize torch distributed backend
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = str(int(args.master_port) + 1)
    dist.init_process_group(backend=args.backend,
                            world_size=args.world_size,
                            rank=args.rank)

    args.graph_class = GRAPH_TOPOLOGIES[args.graph_type]
    args.mixing_class = MIXING_STRATEGIES[args.mixing_strategy]
    if args.graph_class is None:
        raise Exception('Incorrect arguments for graph_type')
    if args.mixing_class is None:
        raise Exception('Incorrect arguments for mixing_strategy')

    return args
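
# NOTE: the repeated `flag = (flag == 'True')` coercions above exist because
# the flags arrive as the strings 'True'/'False'. A hypothetical helper that
# captures the same coercion in one place (illustrative only; not part of the
# original argument parsing):
def _str_flag_to_bool_sketch(value):
    """'True' -> True; any other string (including 'False') -> False."""
    return value == 'True'

# usage sketch: args.resume = _str_flag_to_bool_sketch(args.resume)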