def load_state(self):

    # Read the checkpoint on rank 0 only; the other ranks receive it via the broadcasts below.
    if hvd.rank() == 0:
        state = self.load_state_from_file()
    else:
        state = None

    # Load the state on rank 0:
    if state is not None and hvd.rank() == 0:
        self.restore_state(state)

    # Broadcast the global step:
    self._global_step = hvd.broadcast_object(self._global_step, root_rank=0)

    # Broadcast the state of the model:
    hvd.broadcast_parameters(self._net.state_dict(), root_rank=0)

    # Broadcast the optimizer state:
    hvd.broadcast_optimizer_state(self._opt, root_rank=0)

    # Horovod doesn't actually move the optimizer onto a GPU:
    if self.args.compute_mode == "GPU":
        for state in self._opt.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()

    # Broadcast the LR Schedule state:
    state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(), root_rank=0)
    self.lr_scheduler.load_state_dict(state_dict)
def load_checkpoint(self, net, opt, lr_sched, ctrls, load):

    if self.is_master:

        self.i_epoch = 0

        # look for a checkpoint
        ckpt_list = os.listdir(self.dir_saves)
        if len(ckpt_list) != 0:

            if load == 'best':  # used when evaluating performances
                ckpt_file = os.path.join(self.dir_saves, 'best.ckpt')
            elif load == 'last':  # used when restoring crashed experiments/resuming interrupted experiments
                ckpt_file = max([os.path.join(self.dir_saves, f) for f in ckpt_list], key=os.path.getctime)
            else:
                ckpt_file = None
            # else:  # used in 'WHAT IF' experiments as an initial condition
            #     ckpt_id = str(load).rjust(__ALIGN_EPOCHS__, '0')
            #     ckpt_name = 'epoch' + ckpt_id + '.ckpt'
            #     ckpt_file = os.path.join(self.dir_saves, ckpt_name)

            # load checkpoint from file
            if self.verbose:
                print('Loading checkpoint {} ...'.format(ckpt_file), end='')
            ckpt = torch.load(ckpt_file)
            if self.verbose:
                print('done!')

            # restore experiment status
            self.i_epoch = ckpt['fold']['i_epoch']
            self.metrics.update(ckpt['fold']['metrics'])
            net.load_state_dict(ckpt['network'])
            opt.load_state_dict(ckpt['training']['optimizer'])
            lr_sched.load_state_dict(ckpt['training']['lr_scheduler'])
            for c, sd in zip(ctrls, ckpt['training']['quantize']):
                c.load_state_dict(sd)

    # broadcast experiment status to worker processes
    self.i_epoch = hvd.broadcast_object(self.i_epoch, root_rank=__MASTER_PROC_RANK__, name='i_epoch')
    self.metrics = hvd.broadcast_object(self.metrics, root_rank=__MASTER_PROC_RANK__, name='metrics')
    hvd.broadcast_parameters(net.state_dict(), root_rank=__MASTER_PROC_RANK__)
    hvd.broadcast_optimizer_state(opt, root_rank=__MASTER_PROC_RANK__)
    lr_sched_state_dict = hvd.broadcast_object(lr_sched.state_dict(), root_rank=__MASTER_PROC_RANK__, name='lr_sched_state_dict')
    if not self.is_master:
        lr_sched.load_state_dict(lr_sched_state_dict)
    for i, c in enumerate(ctrls):
        csd = hvd.broadcast_object(c.state_dict(), root_rank=__MASTER_PROC_RANK__, name='controller{}'.format(i))
        if not self.is_master:
            c.load_state_dict(csd)
def restore_model(self):

    if self._rank == 0:
        state = self.load_state_from_file()
    else:
        state = None

    if state is not None and self._rank == 0:
        self.restore_state(state)

    if self.args.framework.distributed_mode == "horovod":

        # Broadcast the global step:
        self._global_step = hvd.broadcast_object(self._global_step, root_rank=0)

        # Broadcast the state of the model:
        hvd.broadcast_parameters(self._net.state_dict(), root_rank=0)

        # Broadcast the optimizer state:
        hvd.broadcast_optimizer_state(self._opt, root_rank=0)

        # Horovod doesn't actually move the optimizer onto a GPU:
        if self.args.run.compute_mode == "GPU":
            for state in self._opt.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        # Broadcast the LR Schedule state:
        state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(), root_rank=0)

    elif self.args.framework.distributed_mode == "DDP":

        if self.args.run.compute_mode == "GPU":
            self._net.cuda()

        self._net = torch.nn.parallel.DistributedDataParallel(self._net)

        self._global_step = MPI.COMM_WORLD.bcast(self._global_step, root=0)
        state_dict = MPI.COMM_WORLD.bcast(self.lr_scheduler.state_dict(), root=0)

    # Load the state dict:
    self.lr_scheduler.load_state_dict(state_dict)
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)
        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = os.path.join(output_dir, "model") if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_state = loaded_model.model.state_dict()
        bcast_state = hvd.broadcast_object(loaded_state)
        for loaded, bcast in zip(loaded_state.values(), bcast_state.values()):
            assert np.allclose(loaded, bcast)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def restore_experiment_from_checkpoint(self, checkpoint, loadModelOnly=False):
    """
    Restores an experiment's stats, optimizer, and model from a checkpoint.
    The experiment is restored to CPU!
    When restoring a model, look for the status flag 'quantized'.
    If the model has been quantized, but the experiment doesn't require quantization, then error out.
    :param checkpoint: Checkpoint path
    :return: None
    """
    map_device = torch.device('cpu')

    # Broadcast the experiment status to all workers
    if self.multiprocessing is True:
        if hvd.rank() == 0:
            state_dict = torch.load(checkpoint, map_location=map_device)
        else:
            state_dict = None
        state_dict = hvd.broadcast_object(state_dict, root_rank=0, name='state_dict')
    else:
        state_dict = torch.load(checkpoint, map_location=map_device)

    experimentStatus = state_dict['experimentStatus']
    # self.experimentStatus = experimentStatus
    for selfKey in self.experimentStatus.keys():
        if selfKey in experimentStatus.keys():
            self.experimentStatus[selfKey] = experimentStatus[selfKey]
    # self.experimentStatus.numEpochTrained = 0

    if loadModelOnly is False:
        config = state_dict['experimentConfig']
        # self.config = config
        for selfKey in self.config.keys():
            if selfKey in config.keys():
                self.config[selfKey] = config[selfKey]
        # Save the optimizer state
        self.optimizerStateDict = state_dict['optimizer']
    else:
        self.experimentStatus.numEpochTrained = 0
        self.experimentStatus.numPhaseTrained = 0

    # Load the model.
    # If pruning and quantization are both required, then prune before quantizing.
    # Otherwise the model might be pruned twice.
    if experimentStatus.flagPruned is True:
        # If the network has been sparsified, then it no longer has 'weight' as a registered buffer.
        # Instead, it has weight_orig and weight_mask.
        # We need to allocate a sparsified network before loading the model's state dict.
        # The target sparsity used to allocate the "sparsified network" can be arbitrary.
        self.prune_network(sparsityTarget=0.0)

    # Check whether it makes sense to load a quantized experiment.
    # If so, quantize the model before proceeding with loading.
    if experimentStatus.flagFusedQuantized is True:
        assert self.config.quantize is True, \
            'Loaded experiment contains quantized model, but the experiment config does not require quantization'
        self.quantize_model()

    self.restore_model_from_state_dict(state_dict['model'])
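The prune-before-restore requirement in the example above can be verified with a small standalone sketch (independent of the experiment class; the layer used here is just for demonstration): once a module has been pruned with torch.nn.utils.prune, its state dict exposes weight_orig and weight_mask instead of weight, so an unpruned network cannot load a pruned checkpoint directly.

# Standalone illustration of why prune_network() must run before loading a pruned checkpoint.
import torch
import torch.nn.utils.prune as prune

layer = torch.nn.Linear(4, 4)
print(sorted(layer.state_dict().keys()))   # ['bias', 'weight']

# Pruning (even with amount=0.0) reparametrizes the module:
prune.l1_unstructured(layer, name='weight', amount=0.0)
print(sorted(layer.state_dict().keys()))   # ['bias', 'weight_mask', 'weight_orig']

# A freshly constructed, unpruned layer would therefore fail to load this state dict,
# which is why the experiment re-applies pruning before restore_model_from_state_dict().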
def close_fold(self):

    if self.is_master:
        self.writer.close()
        if self.verbose:
            print('Fold [{}/{}] completed'.format(self.i_fold + 1, self.config['experiment']['n_folds']))
        self.i_fold += 1

    # communicate current fold to worker processes
    self.i_fold = hvd.broadcast_object(self.i_fold, root_rank=__MASTER_PROC_RANK__, name='i_fold_new')
def get_training_status(self):

    if self.is_master:
        # which fold should be resumed (i.e., the last)?
        folds_list = os.listdir(self.dir_exp)
        try:
            i_fold = max([int(f.replace('fold', '')) for f in folds_list])
        except ValueError:
            i_fold = 0
        self.i_fold = i_fold

    # communicate current fold to worker processes
    self.i_fold = hvd.broadcast_object(self.i_fold, root_rank=__MASTER_PROC_RANK__, name='i_fold')
def get_speed(self, index):
    if index in self.map_sec:
        return self.map_sec[index]

    self.grace.memory.clean()
    self.grace.compressor.clean()
    self.grace.memory.partition([index])

    # obtain the mean speed of multiple iterations
    comm_time_per_iter = self.get_avg_comm(self.benchmark_step)

    # negotiate the training speed with other workers
    comm_time_per_iter = hvd.broadcast_object(comm_time_per_iter, root_rank=0)

    self.map_sec[index] = comm_time_per_iter
    print("benchmark time: {:.3f} ms\tindex: {}".format(comm_time_per_iter * 1000, index))
    return comm_time_per_iter
def synchronize():
    global _USE_HVD
    if _USE_HVD:
        # Broadcasting a trivial object is used here as a synchronization point across Horovod workers.
        hvd.broadcast_object(0)
        return
    return comm.synchronize()
# trainer args
parser.add_argument('--skip_output', type=int, default=10, help='epoch skip for grad/vel output')
parser.add_argument('--max_epochs', type=int, default=501, help='total number of epochs')
args = parser.parse_args()

if args.device == 'cuda' and torch.cuda.is_available():
    torch.cuda.set_device(hvd.local_rank())

# logging
logdir = ''
if hvd.rank() == 0:
    logger = MainLogger(name=args.name)
    logdir = logger.get_logdir()
    logger.print("Inversion start: %s (log_dir=%s)" % (args.name, logdir))
    logger.print("hyper parameters: %s" % args)
logdir = hvd.broadcast_object(logdir, 0)
joblogger = JobLogger(hvd.rank(), logdir)

model = TimeInv(args)
model.to(args.device)

nshot, sxy = rsf.fromfile(args.fshot, "n2 data")
dataloader = time_distributed_dataloader(args.ftrue, torch.from_numpy(sxy))
optimizer = model.configure_optimizers()

model.train()
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

for epoch in range(args.max_epochs + 1):
def broadcast(self, obj, src=0):
    self.barrier()
    obj = hvd.broadcast_object(obj, src)
    return obj
config.hvd = hvd
hvd.init()

np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True

torch.cuda.set_device(hvd.local_rank())
config.rank = hvd.rank()
config.world = hvd.size()

if hvd.local_rank() == 0:
    utils.download_model(config)
# Broadcasting a dummy object acts as a barrier so no rank proceeds before the download has finished.
hvd.broadcast_object(0, root_rank=0)

model = x.Model(config)

start_time = time.time()
print('Loading dataset')
train_data, dev_data, test_data = utils.build_dataset(config)
train_iter = utils.build_dataloader(train_data, config)
dev_iter = utils.build_dataloader(dev_data, config)
test_iter = utils.build_dataloader(test_data, config)
time_dif = utils.get_time_dif(start_time)
print("Prepare data time: ", time_dif)

# Train, eval, test
model = model.to(config.device)
def distribute_optimizer_state(optimizer: torch.optim.Optimizer):
    """Distributes the optimizer state if horovod is available."""
    if HAVE_HOROVOD:
        state_dict = hvd.broadcast_object(optimizer.state_dict(), root_rank=0)
        if hvd.rank() > 0:
            optimizer.load_state_dict(state_dict)
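A possible usage sketch for the helper above (the checkpoint path, the build_model factory, and the surrounding flow are hypothetical, not from the original project): rank 0 restores from disk, then parameters and optimizer state are broadcast to the workers.

# Hypothetical calling sequence; only distribute_optimizer_state() comes from the example above.
model = build_model()                                        # assumed model factory
optimizer = torch.optim.Adam(model.parameters())

if not HAVE_HOROVOD or hvd.rank() == 0:
    ckpt = torch.load('checkpoint.pt', map_location='cpu')   # placeholder path
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])

if HAVE_HOROVOD:
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    distribute_optimizer_state(optimizer)                    # broadcasts the state dict from rank 0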
def startup(gpu, args, config):
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    torch.manual_seed(7)
    args.rank = hvd.rank()
    # args.rank = args.nr * args.gpus + gpu
    # print('rank', args.rank, 'gpu', gpu, 'worldsize', args.world_size)
    # distributed.init_process_group(
    #     backend='nccl', init_method='env://', world_size=args.world_size, rank=args.rank)
    torch.set_num_threads(1)

    experiment = setup_comet_ml(args, args.rank)

    # model
    model = Transducer(config)
    if config.model.random_init:
        for param in model.parameters():
            torch.nn.init.uniform(param, -0.1, 0.1)
    model.preload(config.model.preload_from)
    model.preload_lm(config.model.dec.pretrain_file)
    model.cuda()
    # torch.cuda.set_device(gpu)
    # model = parallel.DistributedDataParallel(model, device_ids=[gpu])

    # data
    d_params = Data.parameters
    d_params['freq_mask'] = config.data.freq_mask
    d_params['time_mask'] = config.data.time_mask

    train_dataset = Data(
        mean=config.data.mean,
        std=config.data.std,
        json_path=config.data.train_json,
        order_time_feature=True,
        tokenizer=config.model.tokenizer,
        bpe_size=config.model.bpe_size,
        cache_dir=config.model.bpe_cache_dir,
        adaptive_specaug=config.data.adaptive_specaug,
        time_repeats=config.data.time_repeats,
        **d_params
    )
    test_dataset = Data(
        mean=config.data.mean,
        std=config.data.std,
        json_path=config.data.valid_json,
        order_time_feature=True,
        tokenizer=config.model.tokenizer,
        bpe_size=config.model.bpe_size,
        cache_dir=config.model.bpe_cache_dir,
        adaptive_specaug=config.data.adaptive_specaug,
        time_repeats=config.data.time_repeats,
        **d_params,
        valid=True
    )

    train_sampler = data.distributed.DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_sampler = data.distributed.DistributedSampler(test_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=config.dist_train.batch_size,
                                   shuffle=train_sampler is None,
                                   num_workers=args.data_workers,
                                   pin_memory=True,
                                   collate_fn=collate_fn_padd if config.model.type == "BasicRNNT" else collate_fn_padd_order,
                                   drop_last=True,
                                   sampler=train_sampler)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=config.dist_train.batch_size,
                                  shuffle=False,
                                  num_workers=args.data_workers,
                                  collate_fn=collate_fn_padd if config.model.type == "BasicRNNT" else collate_fn_padd_order,
                                  drop_last=True,
                                  pin_memory=True,
                                  sampler=test_sampler)

    # freeze encoder if specified
    if config.model.enc.freeze:
        for param in model.encoder.parameters():
            param.requires_grad = False

    # define optimizer and loss
    optimizer = optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=config.dist_train.learning_rate,
        weight_decay=config.dist_train.weight_decay
    )
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=config.dist_train.learning_rate,
                                              steps_per_epoch=int(len(train_loader) / config.dist_train.grad_acc_steps),
                                              epochs=config.dist_train.epochs,
                                              div_factor=config.dist_train.div_factor,
                                              pct_start=config.dist_train.pct_start,
                                              anneal_strategy='linear')

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        backward_passes_per_step=config.dist_train.grad_acc_steps,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    if args.load_model_from and args.rank == 0:
        print("LOADING MODEL FROM", args.load_model_from)
        checkpoint = torch.load(args.load_model_from)
        model.load_state_dict(checkpoint['model_state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # logging in terminal and comet
    h_params = {}
    h_params.update({
        "batch_size": config.dist_train.batch_size,
        "grad_acc": config.dist_train.grad_acc_steps,
        "virtual_batch_size": config.dist_train.batch_size * config.dist_train.grad_acc_steps,
        "learning_rate": config.dist_train.learning_rate,
        "optimizer": optimizer.__class__.__name__,
        "scheduler": scheduler.__class__.__name__,
    })
    num_params = sum([param.nelement() for param in model.parameters()])

    if args.rank == 0:
        print(model)
        print(h_params)
        print(d_params)
        print('number of model parameters: ', num_params)
        print("\n train dataset summary \n", train_dataset.describe())
        print("\n test dataset summary \n", test_dataset.describe())
        print("\n data transforms \n", train_dataset.audio_transforms, test_dataset.audio_transforms)

    if args.rank == 0:
        experiment.log_parameters(h_params)
        experiment.log_parameters(d_params)
        experiment.set_name(config.comet_info.exp_name)  # experiment name
        experiment.log_others(vars(args))
        experiment.log_other('train_summary', str(train_dataset.describe()))
        experiment.log_other('test_summary', str(test_dataset.describe()))
        experiment.log_other('train_data_transforms', str(train_dataset.audio_transforms))
        experiment.log_other('valid_data_transforms', str(test_dataset.audio_transforms))
        experiment.log_other('n_model_params', num_params)

    # save args to file
    if args.rank == 0:
        ckpt_dir = os.path.join(config.dist_train.checkpoint_dir, config.comet_info.exp_name)
        if not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)
        params_file = os.path.join(ckpt_dir, "args.txt")
        pretty_json = json.dumps(vars(args), sort_keys=True, indent=4)
        with open(params_file, 'w+') as f:
            f.write(pretty_json)
        print(pretty_json)

    # # resume from checkpoint
    # distributed.barrier()  # block processes until enter loading
    args.start_epoch = 1
    args.start_step = 1
    args.total_iter = 0
    if args.resume_from:
        if args.rank == 0:
            print("LOADING FROM CHECKPOINT...", args.resume_from)
            checkpoint = torch.load(args.resume_from, map_location=lambda storage, loc: storage)
            model.load_state_dict(checkpoint['model_state_dict'])
            model.cuda()
            # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            args.start_epoch = checkpoint['epoch']
            if args.resume_step > 0:
                args.start_step = args.resume_step
            else:
                args.start_step = checkpoint['step']
            args.total_iter = checkpoint['total_iter']
        # distributed.barrier()  # block processes until finish loading

        print('broadcasting model state')
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        print('broadcasting optimizer state..')
        # new_optimizer_state = hvd.broadcast_object(optimizer.state_dict(), root_rank=0)
        # optimizer.load_state_dict(new_optimizer_state)
        print('broadcasting scheduler state..')
        new_scheduler_state = hvd.broadcast_object(scheduler.state_dict(), root_rank=0)
        scheduler.load_state_dict(new_scheduler_state)
        print('broadcasting other args')
        args.start_epoch = hvd.broadcast_object(args.start_epoch, root_rank=0)
        args.start_step = hvd.broadcast_object(args.start_step, root_rank=0)
        args.total_iter = hvd.broadcast_object(args.total_iter, root_rank=0)

    train(args, args.rank, experiment, model, optimizer, scheduler, train_loader, test_loader, train_sampler, config)
def __init__(self, configFile, multiprocessing=False):
    """
    Initialize the basic configuration of an experiment object.
    Children's initialization implementations should also:
    1) Instantiate and initialize the model
    :param configFile:
    :param multiprocessing:
    """
    # TODO: Children should provide their own init method
    # 1) Load configuration file
    # 2) Instantiate data loader and samplers
    # 3) Instantiate train and validation meters

    # Experiment states initialization
    status = generate_experiment_status()
    self.experimentStatus = status
    self.multiprocessing = multiprocessing

    qatRoundedConfig = torch.quantization.FakeQuantize.with_args(
        observer=custom_quant.RoundedMovingAverageMinMaxObserver,
        quant_min=-128,
        quant_max=127,
        averaging_constant=0.01
    )
    self.qatConfig = torch.quantization.QConfig(
        activation=qatRoundedConfig,
        weight=qatRoundedConfig
    )

    # Placeholder reference to the optimizer state.
    # To be populated if restoring the experiment from a checkpoint.
    self.optimizerStateDict = None

    # TODO: Initialize these in the concrete __init__() method of each derived class
    self.model = None
    self.trainDataSet = None
    self.trainDataLoader = None
    self.trainDataSampler = None
    self.valDataSet = None
    self.valDataLoader = None
    self.valDataSampler = None
    self.logWriter = None
    self.trainMeter = None
    self.valMeter = None
    self.trainTimeMeter = None

    # Load experiment settings from the config file
    config = generate_base_config()
    if (multiprocessing is False) or (multiprocessing is True and hvd.rank() == 0):
        try:
            file = open(configFile, "r")
        except IOError:
            raise ValueError("The provided configuration file cannot be opened.")
        with file:
            yamlConfig = yaml.load(file, Loader=yaml.FullLoader)
            config = edict(yamlConfig)

    # Broadcast the configuration to the workers during multiprocessing
    if multiprocessing is True:
        config = hvd.broadcast_object(obj=config, root_rank=0, name='config')

    self.config = config
    torch.manual_seed(self.config.seed)

    # Set intra-op parallelism threads
    torch.set_num_threads(self.config.numThreadsPerWorker)
def broadcast(self, obj, src=0):
    obj = hvd.broadcast_object(obj, src)
    return obj
def _setup_experiment(self, exp_id):
    """Get pointers to the data and experiment folders.

    Args:
        exp_id (str): The decimal literal identifying the experiment.
    """
    if self.is_master:

        QUANT_HOME = sys.path[0]

        # get pointers to HARD SHARED resources
        HARD_STORAGE = os.path.join(QUANT_HOME, 'cfg', 'hard_storage.json')
        with open(HARD_STORAGE, 'r') as fp:
            d = json.load(fp)
            # data
            HARD_HOME_DATA = os.path.join(d['data'], 'Quant')
            HARD_DIR_DATA = os.path.join(HARD_HOME_DATA, 'problems', self.problem, 'data')
            if not os.path.isdir(HARD_DIR_DATA):
                raise FileNotFoundError('{} hard directory (data) not found: {}'.format(self.problem, HARD_DIR_DATA))
            # log
            HARD_HOME_LOGS = os.path.join(d['logs'], 'Quant')
            HARD_DIR_LOGS = os.path.join(HARD_HOME_LOGS, 'problems', self.problem, 'logs')
            if not os.path.isdir(HARD_DIR_LOGS):
                raise FileNotFoundError('{} hard directory (logs) not found: {}'.format(self.problem, HARD_DIR_LOGS))

        # get pointers to SOFT SHARED resources (which are redirected to HARD ones using symlinks)
        DIR_PROBLEM = os.path.join(QUANT_HOME, 'problems', self.problem)
        dir_data = os.path.join(DIR_PROBLEM, 'data')
        if not os.path.isdir(dir_data):
            os.symlink(HARD_DIR_DATA, dir_data)
        dir_logs = os.path.join(DIR_PROBLEM, 'logs')
        if not os.path.isdir(dir_logs):
            os.symlink(HARD_DIR_LOGS, dir_logs)

        # get pointers to PRIVATE experiment resources
        if exp_id:
            # retrieve an existing report
            exp_id = int(exp_id)
        else:
            # create a new report
            exp_folders = [f for f in os.listdir(dir_logs) if f.startswith('exp')]
            if len(exp_folders) == 0:
                exp_id = 0
            else:
                exp_id = max(int(f.replace('exp', '')) for f in exp_folders) + 1
        dir_exp = os.path.join(dir_logs, 'exp' + str(exp_id).rjust(__ALIGN_EXP__, '0'))
        if not os.path.isdir(dir_exp):
            os.mkdir(dir_exp)

        self.dir_data = dir_data
        self.dir_exp = dir_exp

        if self.verbose:
            # print setup message
            message = 'EXPERIMENT LOGBOOK\n'
            message += 'Problem: {}\n'.format(self.problem)
            message += 'Network topology: {}\n'.format(self.topology)
            message += 'Data directory: {}\n'.format(self.dir_data)
            message += 'Experiment directory: {}\n'.format(self.dir_exp)

            def print_message(message):
                """Print a nice delimiter around a multiline message."""
                lines = message.splitlines()
                tab_size = 4
                width = max(len(l) for l in lines) + tab_size
                print('+' + '-' * width + '+')
                for l in lines:
                    print(l)
                print('+' + '-' * width + '+')

            print_message(message)

        # load configuration
        private_config_file = os.path.join(self.dir_exp, 'config.json')
        if not os.path.isfile(private_config_file):
            # no configuration in the experiment folder: look for the global one
            shared_config_file = os.path.join(os.path.dirname(self.lib.__file__), 'config.json')
            if not os.path.isfile(shared_config_file):
                raise FileNotFoundError('Configuration file not found: {}'.format(shared_config_file))
            shutil.copyfile(shared_config_file, private_config_file)
            # generate seed for experiment
            with open(private_config_file, 'r+') as fp:
                config = json.load(fp)
                config['experiment']['seed'] = torch.randint(__MAX_SEED__, (1,)).item()
                fp.seek(0)
                json.dump(config, fp, indent=4)
                fp.truncate()
        with open(private_config_file, 'r') as fp:
            self.config = json.load(fp)

    # communicate data pointer and experiment configuration to worker processes
    self.dir_data = hvd.broadcast_object(self.dir_data, root_rank=__MASTER_PROC_RANK__, name='dir_data')
    self.config = hvd.broadcast_object(self.config, root_rank=__MASTER_PROC_RANK__, name='config')
def broadcast(self, obj: object, src: int = 0) -> object:
    obj = hvd.broadcast_object(obj, src)
    return obj
def train(self):
    # Build the training set: eval(cls) resolves the dataset class by name (e.g. DAVISDataset),
    # **params passes that dataset's constructor arguments (a dict), and the resulting datasets
    # are concatenated into a single ConcatDataset.
    dset = ConcatDataset([eval(cls)(**params) for cls, params in self.dataset])

    # Partition dataset among workers using DistributedSampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dset, num_replicas=hvd.size(), rank=hvd.rank())
    loader = DataLoader(dset,
                        batch_size=self.batch_size,
                        sampler=train_sampler,
                        num_workers=self.num_workers,
                        pin_memory=True,
                        shuffle=False)

    # Add Horovod Distributed Optimizer
    backward_passes_per_step = dset.datasets[0].sample_size - 1  # e.g. 3 frames need 2 backward() calls
    self.optimizer = hvd.DistributedOptimizer(
        self.optimizer,
        named_parameters=self.model.named_parameters(),
        backward_passes_per_step=backward_passes_per_step)

    # Broadcast parameters from rank 0 to all other processes.
    hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    for epoch in range(self.epoch + 1, self.max_epochs + 1):
        self.epoch = epoch
        self.stats = ddict(AverageMeter)

        t0 = None
        runtime = AverageMeter()
        for i, batch in enumerate(loader, 1):
            t0 = time() if t0 is None else t0  # Ignore loader startup pause
            self.optimizer.zero_grad()
            stats = self.model(*batch)
            self.optimizer.step()

            runtime.update(time() - t0)
            t0 = time()

            stats['stats/lr'] = self.scheduler.get_last_lr()[0]
            self.update_stats(stats, i, len(loader), runtime, do_print=True)

        if hvd.rank() == 0:
            self.log_stats()  # tensorboard

        self.scheduler.step()
        lr_dict = hvd.broadcast_object(self.scheduler.state_dict(), 0)
        if hvd.rank() > 0:
            self.scheduler.load_state_dict(lr_dict)

        if self.epoch % self.save_interval == 0 and hvd.rank() == 0:
            self.save_checkpoint()

    print("%s done" % self.name)
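Distilled from the examples above, a minimal self-contained sketch (all names and the checkpoint layout are hypothetical) of the shared resume pattern: rank 0 reads the checkpoint, tensors go through the dedicated broadcast helpers, and picklable objects such as the scheduler state and epoch counter go through hvd.broadcast_object.

# Minimal, hypothetical resume sketch; not taken from any single project above.
import os
import torch
import horovod.torch as hvd

hvd.init()

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
start_epoch = 0

# Only rank 0 touches the filesystem ('last.ckpt' is a placeholder path).
if hvd.rank() == 0 and os.path.isfile('last.ckpt'):
    ckpt = torch.load('last.ckpt', map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])
    start_epoch = ckpt['epoch']

# Tensors use the dedicated helpers; arbitrary picklable objects use broadcast_object.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
scheduler.load_state_dict(hvd.broadcast_object(scheduler.state_dict(), root_rank=0))
start_epoch = hvd.broadcast_object(start_epoch, root_rank=0)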