def _set_horovod_backend(self):
    self.check_horovod()
    self._distrib_type = DistributedType.HOROVOD

    # Initialize Horovod to get rank / size info
    hvd.init()
    if self.on_gpu:
        # Horovod assigns one local GPU per process
        self.parallel_device_ids = list(range(hvd.local_size()))
    else:
        self.num_processes = hvd.local_size()
def local_size(cls, *args):
    """Get the number of workers at the current node."""
    try:
        return mgw.local_size(*args)
    except NameError:
        raise NameError('module <mgw> not imported')
def _test__hvd_dist_model_create_from_backend_dist(backend, true_device):
    model = _HorovodDistModel.create_from_backend(backend=backend)

    assert hvd.rank() > -1

    with pytest.raises(
        RuntimeError, match=r"Can not re-initialize Horovod if it is already initialized"
    ):
        _HorovodDistModel.create_from_backend(backend=backend)

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": hvd.local_rank(),
            "rank": hvd.rank(),
            "world_size": hvd.size(),
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": hvd.local_size(),
        },
    )

    model.finalize()
def prepare(args, e_ix_ln, r_ix_ln, t_ix_ln):
    mdl = _model(args, e_ix_ln, r_ix_ln, t_ix_ln)
    lr_ml = (hvd.local_size() if hvd.nccl_built() else 1) if not args.tpu and args.adasum else _size(args)
    opt = torch.optim.Adam(mdl.parameters(), lr=lr_ml * args.learning_rate, weight_decay=args.weight_decay)
    st_e, bst_ls = _resume(args, mdl, opt) if args.resume != '' else (1, None)
    if not args.tpu:
        opt = hvd.DistributedOptimizer(
            opt,
            named_parameters=mdl.named_parameters(),
            compression=hvd.Compression.fp16 if args.fp16 else hvd.Compression.none,
            op=hvd.Adasum if args.adasum else hvd.Average)
        hvd.broadcast_parameters(mdl.state_dict(), root_rank=0)
    lr_sc = torch.optim.lr_scheduler.StepLR(opt, step_size=args.learning_rate_step, gamma=args.learning_rate_gamma)
    if not args.tpu:
        hvd.broadcast_optimizer_state(opt, root_rank=0)
    ls_f = _loss_f(args).to(args.dvc)
    return mdl, opt, lr_sc, ls_f, st_e, bst_ls
def calculate_shuffle_buffer_size(hvd, avg_row_size, train_row_count_per_worker):
    """
    Determines the shuffling buffer size such that each worker gets at most 1GB for shuffling
    buffer such that on a single machine, among all the workers on that machine, at most
    memory_cap_gb GB are allocated for shuffling buffer. Also, it ensures that the buffer size
    is identical among all the workers.

    example 1:
        memory_cap_gb = 4
        machine1: 8 workers
        machine2: 3 workers
        shuffle_buffer_size = 0.5 GB

    example 2:
        memory_cap_gb = 4
        machine1: 2 workers
        machine2: 3 workers
        shuffle_buffer_size = 1 GB

    example 3:
        memory_cap_gb = 4
        machine1: 2 workers
        machine2: 8 workers
        machine3: 5 workers
        shuffle_buffer_size = 0.5 GB
    """
    local_size = hvd.local_size()
    local_sizes = hvd.allgather(torch.tensor([local_size]))
    max_local_size = torch.max(local_sizes).item()

    if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
        shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
    else:
        shuffle_buffer_size = BYTES_PER_GIB / avg_row_size

    return int(min(shuffle_buffer_size, train_row_count_per_worker))
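The docstring examples above can be checked with a small standalone sketch of the same sizing rule. It assumes a memory cap of 4 GiB (the examples' memory_cap_gb) and reports the budget in GiB per worker rather than in rows; the constant values and the helper name buffer_gib_per_worker are illustrative only, not part of the source.

BYTES_PER_GIB = 1 << 30          # 2**30 bytes in one GiB
TOTAL_BUFFER_MEMORY_CAP_GIB = 4  # assumed cap, matching memory_cap_gb = 4 in the examples

def buffer_gib_per_worker(local_sizes):
    """Per-worker shuffle buffer budget in GiB for a given list of workers-per-machine."""
    max_local_size = max(local_sizes)
    if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
        # Split the per-machine cap across the busiest machine's workers.
        return TOTAL_BUFFER_MEMORY_CAP_GIB / max_local_size
    # Otherwise every worker can take the full 1 GiB.
    return 1.0

print(buffer_gib_per_worker([8, 3]))     # example 1 -> 0.5
print(buffer_gib_per_worker([2, 3]))     # example 2 -> 1.0
print(buffer_gib_per_worker([2, 8, 5]))  # example 3 -> 0.5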
def _check_distributed():
    try:
        dist = hvd.size() != hvd.local_size()
    except ValueError:
        # not using horovod
        dist = False
    return dist
def test_stability(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled():
        self.skipTest("MPI not enabled")

    device = torch.device('cuda:{}'.format(hvd.local_rank())) if torch.cuda.is_available() else torch.device('cpu')
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
        N = 1024
        a = np.random.normal(0, np.finfo(data_type).tiny, (N, 1)).astype(np.float64)
        r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
        q = np.dot(a, r.T).astype(data_type).astype(np.float64)
        tensor = np.zeros(N, dtype=data_type)
        tensor[:] = q[:, hvd.rank()]

        tensor = torch.from_numpy(tensor).to(device)

        hvd.allreduce_(tensor, op=hvd.Adasum)

        expected = np.sum(q, axis=1) / size
        comp = self.are_close(data_type, expected, tensor.cpu().numpy())
        if comp:
            print('Stability test passed')
        else:
            print('computed: ', tensor)
            print('expected: ', expected)
            print('off by: ', self.diff_ratio(expected, tensor.cpu().numpy()))
        assert comp
def _whoami(self, verbose):
    hw_cfg = {
        'n_cpus': mp.cpu_count(),
        'n_gpus': torch.cuda.device_count() if torch.cuda.is_available() else 0
    }
    sw_cfg = {
        'global_size': hvd.size(),
        'global_rank': hvd.rank(),
        'local_size': hvd.local_size(),
        'local_rank': hvd.local_rank(),
        'master_rank': __MASTER_PROC_RANK__
    }

    assert sw_cfg['local_size'] <= hw_cfg['n_cpus']  # maximum one process per core
    assert hw_cfg['n_gpus'] <= hw_cfg['n_cpus']      # maximum one GPU per core

    if hw_cfg['n_gpus'] > 0:
        assert sw_cfg['local_size'] <= hw_cfg['n_gpus']  # maximum one process per GPU
        # if node is equipped with GPUs, each process should be pinned to one
        torch.cuda.set_device(sw_cfg['local_rank'])
        device = torch.cuda.current_device()
    else:
        device = torch.device('cpu')
    hw_cfg['device'] = device

    self.hw_cfg = hw_cfg
    self.sw_cfg = sw_cfg
    self.is_master = self.sw_cfg['global_rank'] == self.sw_cfg['master_rank']
    self.verbose = verbose and self.is_master
def fn(magic_number):
    import horovod.torch as hvd
    hvd.init()
    print(
        'Hello, rank = %d, local_rank = %d, size = %d, local_size = %d, magic_number = %d'
        % (hvd.rank(), hvd.local_rank(), hvd.size(), hvd.local_size(), magic_number))
    return hvd.rank()
def _handle_horovod(self) -> None:
    if self._num_nodes_flag > 1:
        raise MisconfigurationException(
            "Horovod does not support setting num_nodes / num_gpus explicitly. Use "
            "horovodrun / mpirun to configure the number of processes.")

    if not _HOROVOD_AVAILABLE:
        raise MisconfigurationException(
            'Requested `accelerator="horovod"`, but Horovod is not installed.'
            "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]"
        )

    hvd.init()
    if isinstance(self.accelerator, GPUAccelerator):
        # Horovod assigns one local GPU per process
        self._parallel_devices = list(range(hvd.local_size()))
    else:
        self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
def test_parallel(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    # Only do this test if there are GPUs available.
    if not hvd.mpi_enabled() or not torch.cuda.is_available():
        self.skipTest("No GPUs available")

    device = torch.device('cuda:{}'.format(hvd.local_rank()))
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
        all_Ns = [size * 20 - 13, size * 2 + 1, size + 2, 2**19]
        tensors = []
        all_qs = []
        for N in all_Ns:
            a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
            r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
            q = np.dot(a, r.T)
            q = q.astype(data_type)
            all_qs.append(q.astype(np.float64))
            tensors.append(q[:, hvd.rank()])

        tensors = list(map(lambda x: torch.from_numpy(x).to(device), tensors))

        handles = [hvd.allreduce_async(tensor, op=hvd.Adasum) for tensor in tensors]

        reduced_tensors = [synchronize(h) for h in handles]

        expected = [np.sum(q, axis=1) / size for q in all_qs]
        all_comp = [
            self.are_close(data_type, e, rt.cpu().numpy())
            for e, rt in zip(expected, reduced_tensors)
        ]
        if np.alltrue(all_comp):
            print('Parallel test passed')
        else:
            for c, e, rt in zip(all_comp, expected, reduced_tensors):
                if c == False:
                    print('computed: ', rt)
                    print('expected: ', e)
                    print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
        assert np.alltrue(all_comp)
def calculate_shuffle_buffer_size():
    """
    Determines the shuffling buffer size such that each worker gets at most 1GB for shuffling
    buffer such that on a single machine, among all the workers on that machine, at most
    memory_cap_gb GB are allocated for shuffling buffer. Also, it ensures that the buffer size
    is identical among all the workers.

    example 1:
        memory_cap_gb = 4
        machine1: 8 workers
        machine2: 3 workers
        shuffle_buffer_size = 0.5 GB

    example 2:
        memory_cap_gb = 4
        machine1: 2 workers
        machine2: 3 workers
        shuffle_buffer_size = 1 GB

    example 3:
        memory_cap_gb = 4
        machine1: 2 workers
        machine2: 8 workers
        machine3: 5 workers
        shuffle_buffer_size = 0.5 GB
    """
    import horovod.torch as hvd

    # Note: `user_shuffle_buffer_size`, `avg_row_size` and `train_rows` are free
    # variables captured from the enclosing training function.
    # If user specifies any user_shuffle_buffer_size (even 0), we should honor it.
    if user_shuffle_buffer_size is not None:
        if user_shuffle_buffer_size < 0:
            raise ValueError("user_shuffle_buffer_size cannot be negative!")
        return user_shuffle_buffer_size

    local_size = hvd.local_size()
    local_sizes = hvd.allgather(torch.tensor([local_size]))
    max_local_size = torch.max(local_sizes).item()

    if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
        shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
    else:
        shuffle_buffer_size = BYTES_PER_GIB / avg_row_size

    return int(min(shuffle_buffer_size, train_rows / hvd.size()))
def test_stability_2(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled():
        return

    device = torch.device('cuda:{}'.format(hvd.local_rank())) if torch.cuda.is_available() else torch.device('cpu')
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
        N = 1024
        dt_min = np.finfo(data_type).tiny.astype(np.float64)
        dt_max = math.sqrt(np.finfo(data_type).max.astype(np.float64))
        a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
        r = np.array([
            dt_max**(float(i + 1) / float(size)) * dt_min**(float(size - i - 1) / float(size))
            for i in range(size)
        ]).reshape(size, 1).astype(np.float64)
        np.random.shuffle(r)
        q = np.dot(a, r.T).astype(data_type).astype(np.float64)
        tensor = np.zeros(N, dtype=data_type)
        tensor[:] = q[:, hvd.rank()]

        tensor = torch.from_numpy(tensor).to(device)

        hvd.allreduce_(tensor, op=hvd.Adasum)

        expected = np.sum(q, axis=1) / size
        comp = self.are_close(data_type, expected, tensor.cpu().numpy())
        if comp:
            print('Stability 2 test passed')
        else:
            print('computed: ', tensor)
            print('expected: ', expected)
            print('off by: ', self.diff_ratio(expected, tensor.cpu().numpy()))
        assert comp
def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device):
    assert _HorovodDistModel.create_from_context() is None

    hvd.init()

    true_conf = {
        "device": true_device,
        "local_rank": hvd.local_rank(),
        "rank": hvd.rank(),
        "world_size": hvd.size(),
        "node_index": 0,
        "nnodes": 1,
        "nproc_per_node": hvd.local_size(),
    }

    model = _HorovodDistModel.create_from_context()
    _assert_model(model, true_conf)

    hvd.shutdown()
def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device):
    assert _HorovodDistModel.create_from_context() is None

    hvd.init()
    lrank = hvd.local_rank()
    if torch.cuda.is_available():
        torch.cuda.set_device(lrank)

    true_conf = {
        "device": true_device,
        "local_rank": lrank,
        "rank": hvd.rank(),
        "world_size": hvd.size(),
        "node_index": 0,
        "nnodes": 1,
        "nproc_per_node": hvd.local_size(),
    }

    model = _HorovodDistModel.create_from_context()
    assert model.backend() == true_backend
    _assert_model(model, true_conf)

    hvd.shutdown()
def _compute_nproc_per_node(self) -> int:
    return hvd.local_size()
def setup(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    use_cuda = config.get("use_cuda", False)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = datasets.MNIST(
            data_dir,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]),
        )

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average,
    )

    return model, optimizer, train_loader, train_sampler
                                         sampler=val_sampler, **kwargs)

# Set up standard VGG16 model.
model = models.vgg16()

# By default, Adasum doesn't need scaling up learning rate.
# For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1

if args.cuda:
    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = args.batches_per_allreduce * hvd.local_size()

# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(),
                      lr=(args.base_lr * lr_scaler),
                      momentum=args.momentum,
                      weight_decay=args.wd)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    compression=compression,
    backward_passes_per_step=args.batches_per_allreduce,)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
                                          sampler=test_sampler, **kwargs)

model = Net()

# By default, Adasum doesn't need scaling up learning rate.
lr_scaler = hvd.size() if not args.use_adasum else 1

if args.cuda:
    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()

# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                      momentum=args.momentum)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
def train_func(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    use_cuda = config.get("use_cuda", False)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(
        model.parameters(), lr=lr * lr_scaler, momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average)

    results = []
    for epoch in range(1, num_epochs + 1):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        num_batches = len(train_loader)
        for batch_idx, (data, target) in enumerate(train_loader):
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # Horovod: use train_sampler to determine the number of
                # examples in this worker's partition.
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
            if batch_idx == num_batches - 1:
                results.append(loss.item())
    return results
def __init__(
    self,
    env,
    env_params: dict,
    log_dir: str,
    ac_kwargs: dict = {},
    seed: int = 0,
    steps_per_epoch: int = 4000,
    epochs: int = 50,
    gamma: float = 0.99,
    clip_ratio: float = 0.2,
    pi_lr: float = 3e-4,
    vf_lr: float = 1e-3,
    train_iters: int = 100,
    entropy_coeff: float = 1e-2,
    lam: float = 0.97,
    target_kl: float = 0.01,
    save_freq: int = 10,
    load_path=None,
    render_train: bool = False,
    wandb_id: Optional[str] = None,
    **kwargs,
):
    self.log_dir = log_dir
    self.render_dir = os.path.join(log_dir, "renders")
    self.ckpt_dir = os.path.join(log_dir, "checkpoints")
    if hvd.rank() == 0:
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.render_dir, exist_ok=True)
        os.makedirs(self.ckpt_dir, exist_ok=True)
    self.softlink = os.path.abspath(
        os.path.join(self.ckpt_dir, f"ckpt_latest.pth"))
    self.ac_params_file = os.path.join(log_dir, "ac_params.json")
    hparams = convert_json(locals())
    self.logger = EpochLogger(output_dir=self.log_dir, exp_name=wandb_id)

    if torch.cuda.is_available():
        # Horovod: pin GPU to local rank.
        dev_id = int(torch.cuda.device_count() * hvd.local_rank() / hvd.local_size())
        torch.cuda.set_device(dev_id)
        device = torch.device(f"cuda:{dev_id}")
        torch.cuda.manual_seed(seed)
    else:
        device = torch.device("cpu")

    # env_params.update({"device": device})
    self.env = env(**env_params)
    self.ac_params = {k: v for k, v in ac_kwargs.items()}
    self.ac_params.update({
        "observation_space": self.env.observation_space,
        "action_space": self.env.action_space,
        "nagents": self.env.nagents,
    })

    self.entropy_coeff = entropy_coeff
    self.entropy_coeff_decay = entropy_coeff / epochs

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    torch.save(self.ac_params, self.ac_params_file)

    if os.path.isfile(self.softlink):
        self.logger.log("Restarting from latest checkpoint", color="red")
        load_path = self.softlink

    # Random seed
    seed += 10000 * hvd.rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.nagents = self.env.nagents
    self.ac = PPOLidarActorCritic(
        self.env.observation_space,
        self.env.action_space,
        nagents=self.nagents,
        centralized=True,
        **ac_kwargs,
    )

    self.device = device

    self.pi_lr = pi_lr
    self.vf_lr = vf_lr

    self.load_path = load_path
    if load_path is not None:
        self.load_model(load_path)
    else:
        self.pi_optimizer = Adam(trainable_parameters(self.ac.pi),
                                 lr=self.pi_lr, eps=1e-8)
        self.vf_optimizer = Adam(trainable_parameters(self.ac.v),
                                 lr=self.vf_lr, eps=1e-8)

    # Sync params across processes
    hvd.broadcast_parameters(self.ac.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.pi_optimizer, root_rank=0)
    hvd.broadcast_optimizer_state(self.vf_optimizer, root_rank=0)

    self.ac = self.ac.to(device)
    self.move_optimizer_to_device(self.pi_optimizer)
    self.move_optimizer_to_device(self.vf_optimizer)

    if hvd.rank() == 0:
        if wandb_id is None:
            eid = (log_dir.split("/")[-2]
                   if load_path is None else load_path.split("/")[-4])
        else:
            eid = wandb_id
        wandb.init(
            name=eid,
            id=eid,
            project="Social Driving",
            resume=load_path is not None,
        )
        wandb.watch_called = False

        if "self" in hparams:
            del hparams["self"]
        wandb.config.update(hparams, allow_val_change=True)

        wandb.watch(self.ac.pi, log="all")
        wandb.watch(self.ac.v, log="all")

    # Count variables
    var_counts = tuple(
        count_vars(module) for module in [self.ac.pi, self.ac.v])
    self.logger.log(
        "\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts,
        color="green",
    )

    # Set up experience buffer
    self.steps_per_epoch = steps_per_epoch
    self.local_steps_per_epoch = int(steps_per_epoch / hvd.size())

    self.buf = CentralizedPPOBuffer(
        self.env.observation_space[0].shape,
        self.env.observation_space[1].shape,
        self.env.action_space.shape,
        self.local_steps_per_epoch,
        gamma,
        lam,
        self.env.nagents,
        device=self.device,
    )

    self.gamma = gamma
    self.clip_ratio = clip_ratio
    self.train_iters = train_iters
    self.target_kl = target_kl
    self.epochs = epochs
    self.save_freq = save_freq
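The GPU pinning in the constructor above maps the local ranks onto the visible devices via int(device_count * local_rank / local_size), which spreads ranks evenly when there are more local processes than GPUs. A minimal standalone sketch of that mapping follows; the helper name pin_device and the example counts are made up for illustration and are not part of the source.

def pin_device(local_rank, local_size, device_count):
    """Mirror the dev_id formula used above: spread local ranks evenly over devices."""
    return int(device_count * local_rank / local_size)

# e.g. 4 GPUs shared by 8 local processes: ranks 0-1 -> GPU 0, ranks 2-3 -> GPU 1, ...
print([pin_device(r, 8, 4) for r in range(8)])  # [0, 0, 1, 1, 2, 2, 3, 3]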
def main_worker(args_):
    args_.cuda = not args_.no_cuda and torch.cuda.is_available()
    allreduce_batch_size = args_.batch_size * args_.batches_per_allreduce

    hvd.init()
    torch.distributed.init_process_group('nccl', rank=4)

    if args_.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        print(f"this process's hvd rank = {hvd.local_rank()}")
        # torch.cuda.manual_seed(args_.seed)

    # cudnn.benchmark = True

    # # If set > 0, will resume training from a given checkpoint.
    # resume_from_epoch = 0
    # for try_epoch in range(args_.epochs, 0, -1):
    #     if os.path.exists(args_.checkpoint_format.format(epoch=try_epoch)):
    #         resume_from_epoch = try_epoch
    #         break
    #
    # # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # # checkpoints) to other ranks.
    # resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
    #                                   name='resume_from_epoch').item()

    # # Horovod: print logs on the first worker.
    # verbose = 1 if hvd.rank() == 0 else 0
    #
    # # Horovod: write TensorBoard logs on first worker.
    # try:
    #     if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
    #         from torch.utils.tensorboard import SummaryWriter
    #     else:
    #         from tensorboardX import SummaryWriter
    #     os.makedirs(os.path.join(args_.model_output_dir, 'logs'), exist_ok=True)
    #     log_writer = SummaryWriter(os.path.join(args_.model_output_dir, 'logs')) if hvd.rank() == 0 else None
    # except ImportError:
    #     log_writer = None

    ### MODEL CREATION ###
    # create model
    model1 = VQ_VAE(num_inputs=1, weight_matching=0., channel_var=np.ones((1,)))
    model2 = VQ_VAE(num_inputs=1, weight_matching=0.0005, channel_var=np.ones((1,)))
    model1.cuda()
    model2.cuda()
    model1 = torch.nn.parallel.DistributedDataParallel(model1)
    model2 = torch.nn.parallel.DistributedDataParallel(model2)

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    if args_.cuda and args_.use_adasum and hvd.nccl_built():
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        lr_scaler = args_.batches_per_allreduce * hvd.local_size()
    elif not args_.use_adasum:
        lr_scaler = args_.batches_per_allreduce * hvd.size()
    else:
        lr_scaler = 1

    # Horovod: scale learning rate by the number of GPUs.
    optimizer1 = t.optim.Adam(model1.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))
    optimizer2 = t.optim.Adam(model2.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args_.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer1 = hvd.DistributedOptimizer(
        optimizer1,
        named_parameters=model1.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)
    optimizer2 = hvd.DistributedOptimizer(
        optimizer2,
        named_parameters=model2.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    # # Restore from a previous checkpoint, if initial_epoch is specified.
    # # Horovod: restore on the first worker which will broadcast weights to other workers.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    #     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    #     checkpoint = torch.load(filepath)
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    ### Settings ###
    model_output_dir = args_.model_output_dir
    project_dir = args_.project_dir

    ### Prepare Data ###
    log.info("LOADING FILES")

    # ======= load data using pytorch systems ========
    torch.set_num_threads(4)

    dataset = DatasetFolderWithPaths(
        root=project_dir + "/JUNE" + "/raw_patches",
        loader=npy_loader,
        extensions='.npy'
    )
    dataset_mask = DatasetFolderWithPaths(
        root=project_dir + "/JUNE" + "/raw_masks",
        loader=npy_loader,
        extensions='.npy'
    )
    relation_mat = np.load(
        os.path.join(project_dir, "JUNE", "raw_patches", "relation_mat.npy"),
        allow_pickle=True)

    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_sampler_mask = torch.utils.data.distributed.DistributedSampler(
        dataset_mask, num_replicas=hvd.size(), rank=hvd.rank())

    os.makedirs(os.path.join(model_output_dir, "stage1"), exist_ok=True)
    os.makedirs(os.path.join(model_output_dir, "stage2"), exist_ok=True)

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 1")

    kwargs = {'num_workers': 4, 'pin_memory': True} if args_.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=allreduce_batch_size, sampler=train_sampler, **kwargs)
    train_mask_loader = torch.utils.data.DataLoader(
        dataset_mask, batch_size=allreduce_batch_size, sampler=train_sampler_mask, **kwargs)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model1.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer1, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage1")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    for epoch in range(args_.stage1_epochs):
        model1.train()
        train_sampler.set_epoch(epoch)
        mean_loss = train(model1,
                          train_loader,
                          optimizer1,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader,
                          args_=args_)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f} '.format(key, loss)
                            for key, loss in mean_loss.items()]))
        # only master process should save checkpoints.
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model1.state_dict(),
                   os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 2")

    # get the last saved epoch. on IBM, use max(); on OSX use min()
    # s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1", "/*"))
    s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1") + '/*.pt')
    last_epoch = max(s1_epochs, key=os.path.getctime)
    log.info(f"\tloading last epoch = {last_epoch}")

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler)
    train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                    batch_size=allreduce_batch_size,
                                                    sampler=train_sampler_mask)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model2.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer2, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage2")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    model2.load_state_dict(t.load(last_epoch))
    for epoch in range(args_.stage2_epochs):
        model2.train()
        train_sampler.set_epoch(epoch)
        mean_loss = train(model2,
                          train_loader,
                          optimizer2,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader)
        # shuffle samples ids at the end of the epoch
        # if shuffle_data:
        #     np.random.shuffle(sample_ids)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f} '.format(key, loss)
                            for key, loss in mean_loss.items()]))
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model2.state_dict(),
                   os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()
def train_fn():
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    transformations = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    test_dataset = datasets.MNIST('data-%d' % hvd.rank(), train=False,
                                  transform=transformations)

    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    @hvd.elastic.run
    def train(state):
        # post synchronization event (worker added, worker removed) init ...
        for state.epoch in range(state.epoch, args.epochs + 1):
            state.model.train()
            train_sampler.set_epoch(state.epoch)
            steps_remaining = len(train_loader) - state.batch

            for state.batch, (data, target) in enumerate(train_loader):
                if state.batch >= steps_remaining:
                    break

                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                state.optimizer.zero_grad()
                output = state.model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                state.optimizer.step()
                if state.batch % args.log_interval == 0:
                    # Horovod: use train_sampler to determine
                    # the number of examples in this worker's partition.
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        state.epoch, state.batch * len(data),
                        len(train_sampler),
                        100.0 * state.batch / len(train_loader),
                        loss.item()))

                if (state.batch + 1) % args.num_batches_per_commit == 0:
                    state.commit()
            state.batch = 0

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                test_loss, 100. * test_accuracy))

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # adjust learning rate on reset
    def on_state_reset():
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr * hvd.size()

    state = hvd.elastic.TorchState(model, optimizer, epoch=1, batch=0)
    state.register_reset_callbacks([on_state_reset])
    train(state)
    test()
def main(args):
    def train_mixed_precision(epoch, scaler):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = F.nll_loss(output, target)

            scaler.scale(loss).backward()
            # Make sure all async allreduces are done
            optimizer.synchronize()
            # In-place unscaling of all gradients before weights update
            scaler.unscale_(optimizer)
            with optimizer.skip_synchronize():
                scaler.step(optimizer)
            # Update scaler in case of overflow/underflow
            scaler.update()

            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item(),
                    scaler.get_scale()))

    def train_epoch(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))

    def metric_average(val, name):
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                test_loss, 100. * test_accuracy))

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    else:
        if args.use_mixed_precision:
            raise ValueError("Mixed precision is only supported with cuda enabled.")

    if (args.use_mixed_precision
            and LooseVersion(torch.__version__) < LooseVersion('1.6.0')):
        raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(),
                            which requires torch >= 1.6.0""")

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir or './data'
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    if args.use_mixed_precision:
        # Initialize scaler in global scale
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()
    return

with open('/var/scratch/sdhar/logs/pytorch_synthetic.csv', 'a', newline='') as f:
    csvwriter = csv.writer(f, lineterminator="\n")
    csvwriter.writerow([
        model, batch_size, device, num_devices, num_devices_per_node,
        disable_ib, disable_nccl_p2p, img_sec_mean, img_sec_conf,
        total_img_sec_mean, total_img_sec_conf])

log_csv(
    args.model,
    str(args.batch_size),
    device,
    str(hvd.size()),
    str(hvd.local_size()),
    # Disable infiniband
    str(args.disable_ib),
    # Disable NCCL P2P Communication
    str(args.disable_p2p),
    str(img_sec_mean),
    str(img_sec_conf),
    str(hvd.size() * img_sec_mean),
    str(hvd.size() * img_sec_conf))
def train(args):
    hvd.init()

    print("Hello from local_rank {}/{}, rank {}/{}".format(
        hvd.local_rank(), hvd.local_size(), hvd.rank(), hvd.size()))

    verbose = hvd.rank() == 0
    if verbose:
        print('Using PyTorch version:', torch.__version__)
        print('Horovod version: {}, CUDA: {}, ROCM: {}, NCCL: {}, MPI: {}'.format(
            hvd_version, hvd.cuda_built(), hvd.rocm_built(),
            hvd.nccl_built(), hvd.mpi_built()))
        print(torch.__config__.show())

    cudnn.benchmark = True

    torch.cuda.set_device(hvd.local_rank())

    world_size = hvd.size()

    # Set up standard model.
    if verbose:
        print('Using {} model'.format(args.model))
    model = getattr(models, args.model)()
    model = model.cuda()

    # import torch.multiprocessing as mp
    #
    # # assert "forkserver" in mp.get_all_start_methods()
    # mp.set_start_method("forkserver")

    lr_scaler = hvd.size()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4 * lr_scaler)
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())

    train_dataset = dataset_from_datadir(args.datadir, verbose)
    train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(),
                                       rank=hvd.rank())
    train_loader = DataLoader(dataset=train_dataset, batch_size=args.batchsize,
                              shuffle=False, num_workers=args.workers,
                              pin_memory=False, sampler=train_sampler,
                              multiprocessing_context='forkserver')

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    total_step = args.steps if args.steps is not None else len(train_loader)

    # For each block of printed steps
    last_start = datetime.now()
    last_images = 0

    # For final average
    avg_images = 0
    avg_start = None
    tot_steps = 0

    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            li = len(images)
            last_images += li
            tot_steps += 1

            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if (i + 1) % args.print_steps == 0 and verbose:
                now = datetime.now()
                last_secs = (now - last_start).total_seconds()

                print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}, '
                      f'Images/sec: {last_images*world_size/last_secs:.2f} '
                      f'(last {args.print_steps} steps)')

                last_start = now
                last_images = 0

            if args.steps is not None and i >= args.steps:
                break

    if verbose:
        dur = datetime.now() - avg_start
        print(f"Training completed in: {dur}")
        print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
              f"(average, skipping {args.warmup_steps} warmup steps)")
def hvd_param_scaling(self):
    if hvd.nccl_built():
        self.batch_size = int(self.batch_size / hvd.local_size())
        self.iters_per_epoch = int(self.max_iterations / self.epochs / hvd.local_size())
def get_local_size():
    global _USE_HVD
    if _USE_HVD:
        return hvd.local_size()
    return comm.get_local_size()
def main():
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # horovod initialize
    hvd.init()

    log = None
    if hvd.rank() == 0:
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(), train=True,
                                     transform=transforms.Compose([
                                         transforms.RandomCrop(32, padding=4),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                     ]), download=True)
    val_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(), train=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                       normalize,
                                   ]))

    # Horovod: partition the training data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler, **kwargs)

    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()

    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))

    lr_scaler = hvd.size()
    if args.cuda:
        model.cuda()
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    base_optimizer = torch.optim.SGD(model.parameters(), args.lr * lr_scaler,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #     milestones=[100, 150], last_epoch=args.start_epoch - 1)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)

    # Compression
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(base_optimizer, compression,
                                         named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)

    for epoch in range(args.start_epoch + 1, args.epochs + 1):
        adjust_learning_rate(optimizer, epoch, size=lr_scaler)
        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)

        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)

        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)
        if hvd.rank() == 0:
            print('Best Pred@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(best_prec1, best_prec5))

        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        #     save_checkpoint({
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    if hvd.rank() == 0:
        log.close()
def main():
    ''' simple starter program that can be copied for use when starting a new script. '''
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c', '--config_file',
                        help='configuration file in json format',
                        required=True)
    parser.add_argument('--num_files', '-n', default=-1, type=int,
                        help='limit the number of files to process. default is all')
    parser.add_argument('--model_save',
                        help='base name of saved model parameters for later loading')
    parser.add_argument('--nsave', default=100, type=int,
                        help='frequency in batch number to save model')
    parser.add_argument('--nval', default=100, type=int,
                        help='frequency to evaluate validation sample in batch numbers')
    parser.add_argument('--nval_tests', default=-1, type=int,
                        help='number batches to test per validation run')
    parser.add_argument('--status', default=20, type=int,
                        help='frequency to print loss status in batch numbers')
    parser.add_argument('--batch', default=-1, type=int,
                        help='set batch size, overrides file config')
    parser.add_argument('--random_seed', default=0, type=int,
                        help='numpy random seed')
    parser.add_argument('--valid_only', default=False, action='store_true',
                        help='flag that triggers validation run. prints confusion matrix.')
    parser.add_argument('--batch_limiter', type=int,
                        help='if set to an integer, will limit the number of batches during training. '
                             'Use this to create short training runs for profiling.')
    parser.add_argument('-i', '--input_model_pars',
                        help='if provided, the file will be used to fill the models state dict from a previous run.')
    parser.add_argument('-e', '--epochs', type=int, default=-1,
                        help='number of epochs')
    parser.add_argument('-l', '--logdir',
                        help='log directory for tensorboardx')
    parser.add_argument('--horovod', default=False, action='store_true',
                        help='Setup for distributed training')
    parser.add_argument('--cpu-only', default=False, action='store_true',
                        help='set to force CPU only running')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true',
                        help='Set Logger to DEBUG')
    parser.add_argument('--error', dest='error', default=False, action='store_true',
                        help='Set Logger to ERROR')
    parser.add_argument('--warning', dest='warning', default=False, action='store_true',
                        help='Set Logger to WARNING')
    parser.add_argument('--logfilename', dest='logfilename', default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    logging_format = '%(asctime)s %(levelname)s:%(name)s:%(process)s:%(thread)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    log_level = logging.INFO

    if args.debug and not args.error and not args.warning:
        log_level = logging.DEBUG
    elif not args.debug and args.error and not args.warning:
        log_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        log_level = logging.WARNING

    rank = 0
    nranks = 1
    local_rank = 0
    local_size = 1
    hvd = None
    if args.horovod:
        print('importing horovod')
        import horovod.torch as hvd
        print('imported horovod')
        hvd.init()
        rank = hvd.rank()
        nranks = hvd.size()
        local_rank = hvd.local_rank()
        local_size = hvd.local_size()
        logging_format = ('%(asctime)s %(levelname)s:' + '{:05d}'.format(rank) +
                          ':%(name)s:%(process)s:%(thread)s:%(message)s')

        if rank > 0 and log_level == logging.INFO:
            log_level = logging.WARNING

    logging.basicConfig(level=log_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    device = torch.device('cpu')
    if torch.cuda.is_available() and not args.cpu_only:
        device = torch.device('cuda:%d' % local_rank)
        torch.cuda.set_device(device)

    model_save = args.model_save
    if model_save is None:
        model_save = os.path.join(args.logdir, 'model')

    logger.warning('rank %6s of %6s local rank %6s of %6s',
                   rank, nranks, local_rank, local_size)
    logger.info('hostname:           %s', socket.gethostname())
    logger.info('python version:     %s', sys.version)
    logger.info('num_threads:        %s', torch.get_num_threads())
    logger.info('torch version:      %s', torch.__version__)
    logger.info('torch file:         %s', torch.__file__)
    logger.info('config file:        %s', args.config_file)
    logger.info('num files:          %s', args.num_files)
    logger.info('model_save:         %s', model_save)
    logger.info('random_seed:        %s', args.random_seed)
    logger.info('valid_only:         %s', args.valid_only)
    logger.info('nsave:              %s', args.nsave)
    logger.info('nval:               %s', args.nval)
    logger.info('nval_tests:         %s', args.nval_tests)
    logger.info('status:             %s', args.status)
    logger.info('input_model_pars:   %s', args.input_model_pars)
    logger.info('epochs:             %s', args.epochs)
    logger.info('horovod:            %s', args.horovod)
    logger.info('cpu_only:           %s', args.cpu_only)
    logger.info('logdir:             %s', args.logdir)

    np.random.seed(args.random_seed)

    config_file = json.load(open(args.config_file))
    config_file['rank'] = rank
    config_file['nranks'] = nranks
    config_file['input_model_pars'] = args.input_model_pars
    config_file['horovod'] = args.horovod
    config_file['status'] = args.status
    config_file['nval'] = args.nval
    config_file['nval_tests'] = args.nval_tests
    config_file['nsave'] = args.nsave
    config_file['model_save'] = model_save
    config_file['valid_only'] = args.valid_only
    config_file['batch_limiter'] = args.batch_limiter
    config_file['cpu_only'] = args.cpu_only

    if args.valid_only and not args.input_model_pars:
        logger.error('if valid_only set, must provide input model')
        return

    if args.batch > 0:
        logger.info('setting batch size from command line: %s', args.batch)
        config_file['training']['batch_size'] = args.batch
    if args.epochs > 0:
        logger.info('setting epochs from command line: %s', args.epochs)
        config_file['training']['epochs'] = args.epochs

    logger.info('configuration = \n%s',
                json.dumps(config_file, indent=4, sort_keys=True))
    config_file['hvd'] = hvd

    # get datasets for training and validation
    trainds, testds = data_handler.get_datasets(config_file)

    # setup tensorboard
    writer = None
    if args.logdir and rank == 0:
        if not os.path.exists(args.logdir):
            os.makedirs(args.logdir)
        writer = tensorboardX.SummaryWriter(args.logdir)

    logger.info('building model')
    torch.manual_seed(args.random_seed)
    net = model.get_model(config_file)
    logger.info('model = \n %s', net)

    total_params = sum(p.numel() for p in net.parameters())
    logger.info('trainable parameters: %s', total_params)

    if args.valid_only:
        valid_model(net, validds, config_file)
    else:
        train_model(net, trainds, testds, config_file, device, writer)