def test_broadcast_state(self):
    hvd.init()

    N, D_in, H, D_out = 64, 100, 10, 10
    x = torch.autograd.Variable(torch.randn(N, D_in), requires_grad=True)
    y = torch.autograd.Variable(torch.randn(N, D_out), requires_grad=False)

    def create_model(create_opt):
        model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
        )

        optimizer = create_opt(model)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

        return model, optimizer

    def get_model_param_values(model):
        params = sorted(model.state_dict().items())
        return [(k, v.clone()) for k, v in params]

    def get_optimizer_param_values(optimizer):
        results = []
        state_dict = optimizer.state_dict()
        for group in state_dict['param_groups']:
            for param_id in group['params']:
                params = sorted(state_dict['state'][param_id].items())
                for k, v in params:
                    results.append(
                        (k, v.clone() if torch.is_tensor(v) else v))
        return results

    opt_params = dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True)

    def new_optimizer(cls):
        p = {
            k: v for k, v in opt_params.items()
            if k in inspect.getargspec(cls.__init__).args
        }
        return lambda m: cls(m.parameters(), **p)

    # L-BFGS is currently unsupported, as are sparse tensors, which are
    # required by SparseAdam optimizer
    optimizers = [
        (subclass.__name__, new_optimizer(subclass))
        for subclass in torch.optim.Optimizer.__subclasses__()
        if subclass.__module__.startswith('torch.optim') and
        subclass != torch.optim.LBFGS and
        subclass != torch.optim.SparseAdam
    ]
    optimizers.sort()

    for opt_name, create_opt in optimizers:
        model, optimizer = create_model(create_opt)
        y_pred = model(x)
        loss = F.mse_loss(y_pred, y, size_average=False)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model_param_values = get_model_param_values(model)
        for name, model_param_value in model_param_values:
            hvd.broadcast_(model_param_value, root_rank=0)

        opt_param_values_updated = []
        opt_param_values = get_optimizer_param_values(optimizer)
        for name, opt_param_value in opt_param_values:
            is_tensor = torch.is_tensor(opt_param_value)
            if not is_tensor:
                t = type(opt_param_value)
                opt_param_value = torch.Tensor([opt_param_value])
            hvd.broadcast_(opt_param_value, root_rank=0)
            if not is_tensor:
                opt_param_value = t(opt_param_value.numpy()[0])
            opt_param_values_updated.append((name, opt_param_value))
        opt_param_values = opt_param_values_updated

        if hvd.rank() == 0:
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            _, fname = tempfile.mkstemp('.pt')
            torch.save(state, fname)

        model, optimizer = create_model(create_opt)
        if hvd.rank() == 0:
            checkpoint = torch.load(fname)
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            os.remove(fname)

        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        model_param_value_after = get_model_param_values(model)
        for before, after in zip(model_param_values,
                                 model_param_value_after):
            name, model_param_value = before
            name_after, model_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(model_param_value),
                             type(model_param_value_after))
            self.assertTrue(
                (model_param_value == model_param_value_after).all())

        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        self.assertEqual(len(optimizer.state_dict()['state'].values()), 4)

        opt_param_values_after = get_optimizer_param_values(optimizer)
        for before, after in zip(opt_param_values, opt_param_values_after):
            name, opt_param_value = before
            name_after, opt_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(opt_param_value),
                             type(opt_param_value_after))
            if torch.is_tensor(opt_param_value):
                self.assertTrue(
                    (opt_param_value == opt_param_value_after).all())
            else:
                self.assertEqual(opt_param_value, opt_param_value_after)
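The pattern this test exercises (rank 0 restores a checkpoint, then `hvd.broadcast_parameters` and `hvd.broadcast_optimizer_state` push the restored state to every worker) is the same one used by the training scripts below. A minimal sketch of that restore-and-broadcast flow, with `model`, `optimizer`, and `checkpoint.pt` as hypothetical placeholders:

```python
import os
import torch
import horovod.torch as hvd


def restore_and_broadcast(model, optimizer, ckpt_path="checkpoint.pt"):
    """Load a checkpoint on rank 0 only, then synchronize all workers."""
    if hvd.rank() == 0 and os.path.isfile(ckpt_path):
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
    # Every rank must participate in the broadcasts, not just rank 0.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
```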
def train_main(args, splits):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if torch.cuda.is_available():
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    rank = hvd.rank()

    model = MyModel(annotation, use_bn=False)
    # By default, Adasum doesn't need scaling up learning rate.
    if torch.cuda.is_available():
        # Move model to GPU.
        model.cuda()

    optimizers = construct_optimizers(model)
    loss_function = huber_loss

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for opt in optimizers:
        hvd.broadcast_optimizer_state(opt, root_rank=0)

    def _train(epoch, train_dataset):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        # train_dataset.set_epoch(epoch)
        start_epoch = timeit.default_timer()
        last_batch_time = start_epoch
        batch_wait_times = []
        for batch_idx, (data, target) in enumerate(train_dataset):
            batch_wait_times.append(timeit.default_timer() - last_batch_time)
            if torch.cuda.is_available():
                data = data.cuda()
                target = target.cuda()
            for opt in optimizers:
                opt.zero_grad()

            batch = OrderedDict()
            batch["embeddings"] = OrderedDict()
            batch["one_hot"] = OrderedDict()
            for i, name in enumerate(annotation["embeddings"]):
                batch["embeddings"][name] = data[:, i : i + 1]
            batch["one_hot"]["hot0"] = data[:, -2:-1]
            batch["one_hot"]["hot1"] = data[:, -1:]

            batch_pred = model(batch)

            if batch_idx % args.log_interval == 0:
                print(
                    f"Processing batch {batch_idx} in epoch {epoch} on worker "
                    f"{rank}."
                )
            time.sleep(args.mock_train_step_time)

            loss = loss_function(batch_pred, target, delta=60)
            loss.mean().backward()
            for opt in optimizers:
                opt.step()

            last_batch_time = timeit.default_timer()

        epoch_duration = timeit.default_timer() - start_epoch
        avg_batch_wait_time = np.mean(batch_wait_times)
        std_batch_wait_time = np.std(batch_wait_times)
        max_batch_wait_time = np.max(batch_wait_times)
        min_batch_wait_time = np.min(batch_wait_times)
        print(
            f"\nEpoch {epoch}, worker {rank} stats over "
            f"{len(batch_wait_times)} steps: {epoch_duration:.3f}"
        )
        print(
            f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
            f"{std_batch_wait_time}"
        )
        print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
        print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
        return batch_wait_times

    print(f"Starting training on worker {rank}.")
    batch_wait_times = []
    for epoch, split_ds in enumerate(splits[rank].iter_epochs()):
        train_dataset = create_torch_iterator(split_ds, args.batch_size, rank)
        new_batch_times = _train(epoch, train_dataset)
        new_batch_times.pop(0)
        batch_wait_times.extend(new_batch_times)

    print(f"Done training on worker {rank}.")
    avg_batch_wait_time = np.mean(batch_wait_times)
    std_batch_wait_time = np.std(batch_wait_times)
    max_batch_wait_time = np.max(batch_wait_times)
    min_batch_wait_time = np.min(batch_wait_times)
    print(f"\nWorker {rank} training stats over {args.epochs} epochs:")
    print(
        f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
        f"{std_batch_wait_time}"
    )
    print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
    print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
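`huber_loss` is referenced above but not defined in this excerpt. A plausible stand-in that matches the call site (`loss_function(batch_pred, target, delta=60)` followed by `.mean()`) could look like the sketch below; this is an assumption for illustration, not the original implementation:

```python
import torch


def huber_loss(pred: torch.Tensor, target: torch.Tensor, delta: float = 1.0) -> torch.Tensor:
    """Element-wise Huber loss; the caller reduces with .mean() before backward()."""
    err = pred - target
    abs_err = err.abs()
    quadratic = torch.clamp(abs_err, max=delta)   # the |err| <= delta part
    linear = abs_err - quadratic                  # the |err| > delta part
    return 0.5 * quadratic ** 2 + delta * linear
```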
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath", default='', type=str, help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr", default=0.0001, type=float, help="Override the LR in the config")
    parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=1, type=int, help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float, help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=200, type=float, help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs", default=1, type=int, help="number of training epochs (default:1)")
    parser.add_argument("-global_mvn", default=False, type=bool, help="if apply global mean and variance normalization")
    parser.add_argument("-resume_from_model", type=str, help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-anneal_lr_epoch", default=2, type=int, help="start to anneal the learning rate from this epoch")
    parser.add_argument("-anneal_lr_ratio", default=0.5, type=float, help="the ratio to anneal the learning rate")
    parser.add_argument('-p', '--print-freq', default=100, type=int, metavar='N', help='print frequency (default: 100)')

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size

    with open(args.data_config) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=True,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = reader.preprocess.GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            train_dataloader,
            train_dataloader.stream_keys_for_transform,
            n_sample_to_use=2000)
        train_dataloader.transform = transform
        print("Global mean and variance transform trained successfully!")

        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:
        assert os.path.isfile(args.resume_from_model), \
            "ERROR: model file {} does not exist!".format(args.resume_from_model)
        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):
        # anneal learning rate
        if epoch > args.anneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.anneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
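The `-max_grad_norm` flag is presumably consumed inside `run_train_epoch`, which is not shown here. With a Horovod `DistributedOptimizer`, clipping has to be applied to the allreduced gradients, which means synchronizing before the clip and skipping the second synchronization in `step()`. A minimal sketch of that documented Horovod pattern, not necessarily what `run_train_epoch` does:

```python
import torch
import horovod.torch as hvd


def clipped_step(model, optimizer, loss, max_grad_norm):
    """One optimizer step with gradient clipping applied after the allreduce."""
    optimizer.zero_grad()
    loss.backward()
    optimizer.synchronize()  # finish the allreduce started during backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    with optimizer.skip_synchronize():  # avoid a second allreduce inside step()
        optimizer.step()
```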
def single_point(self, with_tqdm=True, hdf5_group='single_point'):
    """Performs a single point calculation.

    Args:
        with_tqdm (bool, optional): use tqdm for sampling. Defaults to True.
        hdf5_group (str, optional): hdf5 group where to store the data.
            Defaults to 'single_point'.

    Returns:
        SimpleNamespace: contains the local energy, positions, ...
    """
    logd(hvd.rank(), '')
    logd(
        hvd.rank(),
        '  Single Point Calculation : {nw} walkers | {ns} steps'.format(
            nw=self.sampler.nwalkers, ns=self.sampler.nstep))

    # check if we have to compute and store the grads
    grad_mode = torch.no_grad()
    if self.wf.kinetic == 'auto':
        grad_mode = torch.enable_grad()

    # distribute the calculation
    num_threads = 1
    hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0)
    torch.set_num_threads(num_threads)

    with grad_mode:
        # sample the wave function
        pos = self.sampler(self.wf.pdf)
        if self.wf.cuda and pos.device.type == 'cpu':
            pos = pos.to(self.device)

        # compute energy/variance/error
        eloc = self.wf.local_energy(pos)
        e, s, err = torch.mean(eloc), torch.var(
            eloc), self.wf.sampling_error(eloc)

        # gather all data
        eloc_all = hvd.allgather(eloc, name='local_energies')
        e, s, err = torch.mean(eloc_all), torch.var(
            eloc_all), self.wf.sampling_error(eloc_all)

        # print
        if hvd.rank() == 0:
            log.options(style='percent').info(
                '  Energy   : %f +/- %f' % (e.detach().item(), err.detach().item()))
            log.options(style='percent').info(
                '  Variance : %f' % s.detach().item())

        # dump data to hdf5
        obs = SimpleNamespace(pos=pos, local_energy=eloc_all,
                              energy=e, variance=s, error=err)

        # dump to file
        if hvd.rank() == 0:
            dump_to_hdf5(obs, self.hdf5file, root_name=hdf5_group)
            add_group_attr(self.hdf5file, hdf5_group, {'type': 'single_point'})

    return obs
def train(serialized_model, optimizer_cls, model_opt_state_serialized,
          train_rows, val_rows, avg_row_size):
    from petastorm import TransformSpec, make_reader, make_batch_reader
    from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader
    import torch
    import horovod.torch as hvd

    if random_seed is not None:
        torch.manual_seed(random_seed)

    # Deserializing objects
    model_opt_state = torch.load(model_opt_state_serialized)
    model = deserialize(serialized_model)

    if loss_fns_pre_train:
        loss_fns = loss_fns_pre_train
    if loss_constructors:
        local_vars = locals()
        loss_fns = [loss_constructor(**local_vars) for loss_constructor in loss_constructors]

    # Horovod: initialize library.
    hvd.init()

    if user_verbose:
        import horovod as _horovod
        print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")

    # If user specifies any user_shuffle_buffer_size (even 0), we should honor it.
    if user_shuffle_buffer_size is None:
        shuffle_buffer_size = \
            calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
    else:
        if user_shuffle_buffer_size < 0:
            raise ValueError("user_shuffle_buffer_size cannot be negative!")
        shuffle_buffer_size = user_shuffle_buffer_size

    if not should_use_gpu and user_verbose:
        print("Skip pinning current process to the GPU.")

    cuda_available = torch.cuda.is_available()

    if cuda_available and not should_use_gpu:
        print("GPU is available but use_gpu is set to False. "
              "Training will proceed without GPU support.")
        cuda_available = False

    # We need to check all ranks have same device type for training.
    # Horovod doesn't support heterogeneous allreduce for gradients.
    cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
    if cuda_avail_list.count(cuda_available) != hvd.size():
        raise RuntimeError("All ranks don't have same device type!")

    if cuda_available:
        # Horovod: pin GPU to local rank or the assigned GPU from spark.
        torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank()))
        # Move model to GPU.
        model.cuda()

    # Optimizer object needs to be re-instantiated. Internally, it uses memory addresses of
    # objects as their identity and therefore it cannot be serialized and then
    # deserialized. The deserialized optimizer object stores the names of the parameters
    # with their old memory addresses but in reality those are different than the
    # reconstructed deserialized object and that creates problems.
    # Learning rate is a required parameter in SGD optimizer. It will be overridden with
    # load_state_dict.
    optimizer = optimizer_cls(model.parameters(), lr=1)
    optimizer_state = model_opt_state['optimizer']

    if last_checkpoint_state is not None:
        model.load_state_dict(last_checkpoint_state['model'])
        optimizer.load_state_dict(last_checkpoint_state['optimizer'])
    else:
        # scale the learning rate with the number of horovod workers
        for i in range(len(optimizer_state['param_groups'])):
            optimizer_state['param_groups'][i]['lr'] = \
                optimizer_state['param_groups'][i]['lr'] * hvd.size()
        optimizer.load_state_dict(optimizer_state)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for group in optimizer.param_groups:
        for p in group['params']:
            if id(p) not in optimizer.state_dict()['state']:
                p.grad = p.data.new(p.size()).zero_()
    optimizer.step()
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    dist_optimizer_args = dict(optimizer=optimizer,
                               named_parameters=model.named_parameters())
    if gradient_compression:
        # Pass the compression arg only if it is specified by the user.
        dist_optimizer_args['compression'] = gradient_compression
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

    # get_optimizer_with_unscaled_lr takes the current optimizer and constructs a new optimizer
    # with the same state except with learning rate scaled down with the number of horovod
    # workers. This is important for retraining the model. The user may retrain the model with
    # a different number of workers, so we need the raw learning rate to adjust to the
    # new number of workers.

    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    schema_fields = feature_columns + label_columns
    if sample_weight_col:
        schema_fields.append(sample_weight_col)

    if train_steps_per_epoch is None:
        steps_per_epoch = int(math.floor(float(train_rows) / batch_size / hvd.size()))
    else:
        steps_per_epoch = train_steps_per_epoch

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
        log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
        ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)

        def save_checkpoint():
            model.cpu()
            optimizer_with_scaled_down_lr = \
                get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer_with_scaled_down_lr.state_dict(),
            }
            torch.save(state, ckpt_file)
            if cuda_available:
                model.cuda()

        if hvd.rank() == 0 and user_verbose:
            print(f"Training parameters: Epochs: {epochs}\n"
                  f"Train rows: {train_rows}, Train batch size: {batch_size}, "
                  f"Train_steps_per_epoch: {steps_per_epoch}\n"
                  f"Shuffle buffer size: {shuffle_buffer_size}, Random seed: {random_seed}\n"
                  f"Checkpoint file: {ckpt_file}, Logs dir: {logs_dir}\n")

        # In general, make_batch_reader is faster than make_reader for reading the dataset.
        # However, we found out that make_reader performs data transformations much faster than
        # make_batch_reader with parallel worker processes. Therefore, the default reader
        # we choose is make_batch_reader unless there are data transformations.
        reader_factory = None
        reader_factory_kwargs = dict()
        if transform_spec:
            reader_factory = make_reader
            reader_factory_kwargs['pyarrow_serialize'] = True
        else:
            reader_factory = make_batch_reader

        # Petastorm: read data from the store with the correct shard for this rank
        # setting num_epochs=None will cause an infinite iterator
        # and enables ranks to perform training and validation with
        # unequal number of samples
        with reader_factory(remote_store.train_data_path,
                            num_epochs=None,
                            cur_shard=hvd.rank(),
                            reader_pool_type=reader_pool_type,
                            workers_count=train_reader_worker_count,
                            shard_count=hvd.size(),
                            hdfs_driver=PETASTORM_HDFS_DRIVER,
                            schema_fields=schema_fields,
                            transform_spec=transform_spec,
                            storage_options=storage_options,
                            # Don't shuffle row groups if shuffling is disabled.
                            shuffle_row_groups=True if shuffle_buffer_size > 0 else False,
                            **reader_factory_kwargs) as train_reader:
            with reader_factory(remote_store.val_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type=reader_pool_type,
                                workers_count=val_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                storage_options=storage_options,
                                shuffle_row_groups=False,
                                **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                if inmemory_cache_all:
                    # Petastorm introduced InMemBatchedDataLoader class in v0.11.0
                    train_loader = InMemBatchedDataLoader(train_reader,
                                                          batch_size=batch_size,
                                                          num_epochs=epochs,
                                                          rows_capacity=steps_per_epoch * batch_size,
                                                          shuffle=True)
                else:
                    train_loader = BatchedDataLoader(train_reader,
                                                     batch_size=batch_size,
                                                     shuffling_queue_capacity=shuffle_buffer_size)
                train_loader_iter = iter(train_loader)

                def prepare_batch(row):
                    inputs = [
                        prepare_np_data(row[col].float(), col, metadata).reshape(shape)
                        for col, shape in zip(feature_columns, input_shapes)]
                    labels = [
                        prepare_np_data(row[col].float(), col, metadata)
                        for col in label_columns]

                    sample_weights = row.get(sample_weight_col, None)
                    if sample_weights is not None:
                        sample_weights = sample_weights.float()
                    if cuda_available:
                        inputs = [input.cuda() for input in inputs]
                        labels = [label.cuda() for label in labels]
                        if sample_weights is not None:
                            sample_weights = sample_weights.cuda()
                    return inputs, labels, sample_weights

                def transform_outputs(outputs, labels):
                    if not isinstance(outputs, tuple) and not isinstance(outputs, list):
                        outputs = [outputs]

                    # reshape labels to match the output shape of the model
                    if hasattr(outputs[0], 'shape'):
                        if label_shapes:
                            labels = [label.reshape(label_shape)
                                      for label, label_shape in zip(labels, label_shapes)]
                        else:
                            # If label_shapes parameter is not provided, reshape the label
                            # columns data to match the shape of the model output
                            labels = [label.reshape(output.shape) if
                                      output.shape.numel() == label.shape.numel() else label
                                      for label, output in zip(labels, outputs)]
                    return outputs, labels

                def aggregate_metrics(stage, epoch, loss, metric_value_groups):
                    all_metric_groups_values = get_metric_avgs(metric_value_groups)
                    if remote_store.saving_runs:
                        write_metrics_summary(
                            stage, epoch, loss, all_metric_groups_values, log_writer)
                    return {
                        loss.name: loss.avg.item(),
                        'all_metrics': all_metric_groups_values
                    }

                def loss_fn(outputs, labels, sample_weights):
                    loss = calculate_loss(outputs, labels, loss_weights, loss_fns, sample_weights)
                    return loss

                def print_metrics(batch_idx, loss, metric_value_groups, phase):
                    if user_verbose > 0 and hvd.rank() == 0 and \
                            batch_idx % METRIC_PRINT_FREQUENCY == 0:
                        print("{phase}\tepoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}".
                              format(phase=phase,
                                     epoch=epoch,
                                     batch_idx=batch_idx,
                                     metrics=aggregate_metrics(phase, epoch, loss,
                                                               metric_value_groups)))

                def _train(epoch):
                    model.train()
                    train_loss = metric_cls('loss', hvd)
                    metric_value_groups = construct_metric_value_holders(
                        metric_cls, metric_fn_groups, label_columns, hvd)

                    # iterate on one epoch
                    for batch_idx in range(steps_per_epoch):
                        row = next(train_loader_iter)
                        inputs, labels, sample_weights = prepare_batch(row)

                        outputs, loss = train_minibatch(model, optimizer, transform_outputs,
                                                        loss_fn, inputs, labels, sample_weights)
                        update_metrics(metric_value_groups, outputs, labels)
                        train_loss.update(loss)
                        print_metrics(batch_idx, train_loss, metric_value_groups, 'train')

                    return aggregate_metrics('train', epoch, train_loss, metric_value_groups)

                if should_validate:
                    if validation_steps_per_epoch is None:
                        validation_steps = int(math.ceil(float(val_rows) / val_batch_size / hvd.size()))
                    else:
                        validation_steps = validation_steps_per_epoch

                    if hvd.rank() == 0 and user_verbose:
                        print(f"Val rows: {val_rows}, Val batch size: {val_batch_size}, "
                              f"Val_steps_per_epoch: {validation_steps}\n")

                    if inmemory_cache_all:
                        # Petastorm introduced InMemBatchedDataLoader class in v0.11.0
                        val_loader = InMemBatchedDataLoader(val_reader,
                                                            batch_size=val_batch_size,
                                                            num_epochs=epochs,
                                                            rows_capacity=validation_steps * val_batch_size,
                                                            shuffle=False)
                    else:
                        val_loader = BatchedDataLoader(val_reader,
                                                       batch_size=val_batch_size,
                                                       shuffling_queue_capacity=0)
                    val_loader_iter = iter(val_loader)

                    def _validate(epoch):
                        model.eval()
                        val_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(validation_steps):
                            row = next(val_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)

                            outputs = model(*inputs)
                            outputs, labels = transform_outputs(outputs, labels)

                            loss = calculate_loss(
                                outputs, labels, loss_weights, loss_fns, sample_weights)
                            val_loss.update(loss)
                            update_metrics(metric_value_groups, outputs, labels)
                            print_metrics(batch_idx, val_loss, metric_value_groups, 'val')
                        return aggregate_metrics('val', epoch, val_loss, metric_value_groups)

                history = []
                for epoch in range(epochs):
                    epoch_metrics = {
                        'epoch': epoch,
                        'train': _train(epoch)
                    }

                    if should_validate:
                        epoch_metrics['validation'] = _validate(epoch)

                    if user_verbose > 0:
                        pdt_dt = datetime.now(timezone.utc)
                        pdt_time_str = pdt_dt.strftime("%Y-%b-%d %H:%M:%S UTC")
                        print(pdt_time_str, epoch_metrics)

                    history.append(epoch_metrics)
                    if hvd.rank() == 0:
                        # Save model after every epoch
                        save_checkpoint()
                        if remote_store.saving_runs:
                            remote_store.sync(run_output_dir)

        if hvd.rank() == 0:
            best_checkpoint = torch.load(ckpt_file)
            serialized_checkpoint = io.BytesIO()
            torch.save(best_checkpoint, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return history, serialized_checkpoint
def train(config, checkpoint_dir=None):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    if checkpoint_dir:
        # open in binary mode, since torch.load reads a pickled binary payload
        with open(os.path.join(checkpoint_dir, "checkpoint"), "rb") as f:
            model_state, optimizer_state, epoch = torch.load(f)

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across workers, broadcast the
    # parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(
        trainset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=4
    )

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            tune.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(
                    "[%d, %5d] loss: %.3f"
                    % (epoch + 1, i + 1, running_loss / epoch_steps)
                )

        with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
            print("this checkpoint dir: ", checkpoint_dir)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict(), epoch), path)
def main():
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root=args.dataroot, train=True, download=True,
                          transform=transform_train)
    sampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    trainloader = data.DataLoader(dataset=trainset,
                                  batch_size=args.train_batch * world_size,
                                  shuffle=False,
                                  sampler=sampler)

    testset = dataloader(root=args.dataroot, train=False, download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch * world_size,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format("vgg19"))
    model = vgg19_bn(num_classes=num_classes)
    device = torch.device('cuda', local_rank)
    model = model.to(device)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
    print('Model on cuda:%d' % local_rank)
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # Wrap the optimizer with Horovod's DistributedOptimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # Broadcast parameters from rank 0
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda)
        print(
            'Rank:{} Epoch[{}/{}]: LR: {:.3f}, Train loss: {:.5f}, Test loss: {:.5f}, '
            'Train acc: {:.2f}, Test acc: {:.2f}.'
            .format(local_rank, epoch + 1, args.epochs, state['lr'],
                    train_loss, test_loss, train_acc, test_acc))
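One thing the loop above omits is re-seeding the `DistributedSampler` each epoch: without `sampler.set_epoch(epoch)`, every epoch reuses the same shard ordering. A small self-contained sketch of the usual pattern, using a toy dataset purely for illustration:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# toy dataset just to illustrate the epoch re-seeding pattern
dataset = TensorDataset(torch.arange(8).float())
sampler = DistributedSampler(dataset, num_replicas=2, rank=0)
loader = DataLoader(dataset, batch_size=2, sampler=sampler, shuffle=False)

for epoch in range(3):
    sampler.set_epoch(epoch)  # changes the shuffling seed so each epoch sees a new order
    for batch in loader:
        pass  # training step would go here
```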
def train(serialized_model, optimizer_cls, model_opt_state_serialized,
          train_rows, val_rows, avg_row_size):
    from petastorm import make_batch_reader, TransformSpec
    from petastorm.pytorch import DataLoader
    import torch
    import horovod.torch as hvd

    # Deserializing objects
    model_opt_state = torch.load(model_opt_state_serialized)
    model = deserialize(serialized_model)

    if loss_fns_pre_train:
        loss_fns = loss_fns_pre_train
    if loss_constructors:
        local_vars = locals()
        loss_fns = [
            loss_constructor(**local_vars)
            for loss_constructor in loss_constructors
        ]

    # Horovod: initialize library.
    hvd.init()

    if not user_shuffle_buffer_size:
        shuffle_buffer_size = \
            calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
    else:
        shuffle_buffer_size = user_shuffle_buffer_size

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        # Move model to GPU.
        model.cuda()

    # Optimizer object needs to be re-instantiated. Internally, it uses memory addresses of
    # objects as their identity and therefore it cannot be serialized and then
    # deserialized. The deserialized optimizer object stores the names of the parameters
    # with their old memory addresses but in reality those are different than the
    # reconstructed deserialized object and that creates problems.
    # Learning rate is a required parameter in SGD optimizer. It will be overridden with
    # load_state_dict.
    optimizer = optimizer_cls(model.parameters(), lr=1)
    optimizer_state = model_opt_state['optimizer']

    if last_checkpoint_state is not None:
        model.load_state_dict(last_checkpoint_state['model'])
        optimizer.load_state_dict(last_checkpoint_state['optimizer'])
    else:
        # scale the learning rate with the number of horovod workers
        for i in range(len(optimizer_state['param_groups'])):
            optimizer_state['param_groups'][i]['lr'] = \
                optimizer_state['param_groups'][i]['lr'] * hvd.size()
        optimizer.load_state_dict(optimizer_state)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for group in optimizer.param_groups:
        for p in group['params']:
            if id(p) not in optimizer.state_dict()['state']:
                p.grad = p.data.new(p.size()).zero_()
    optimizer.step()
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    dist_optimizer_args = dict(optimizer=optimizer,
                               named_parameters=model.named_parameters())
    if gradient_compression:
        # Pass the compression arg only if it is specified by the user.
        dist_optimizer_args['compression'] = gradient_compression
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

    # get_optimizer_with_unscaled_lr takes the current optimizer and constructs a new optimizer
    # with the same state except with learning rate scaled down with the number of horovod
    # workers. This is important for retraining the model. The user may retrain the model with
    # a different number of workers, so we need the raw learning rate to adjust to the
    # new number of workers.

    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    schema_fields = feature_columns + label_columns
    if sample_weight_col:
        schema_fields.append(sample_weight_col)

    if train_steps_per_epoch is None:
        steps_per_epoch = int(
            math.ceil(float(train_rows) / batch_size / hvd.size()))
    else:
        steps_per_epoch = train_steps_per_epoch

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
        log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
        ckpt_file = os.path.join(run_output_dir,
                                 remote_store.checkpoint_filename)

        def save_checkpoint():
            model.cpu()
            optimizer_with_scaled_down_lr = \
                get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer_with_scaled_down_lr.state_dict(),
            }
            torch.save(state, ckpt_file)
            if cuda_available:
                model.cuda()

        # Petastorm: read data from the store with the correct shard for this rank
        # setting num_epochs=None will cause an infinite iterator
        # and enables ranks to perform training and validation with
        # unequal number of samples
        with make_batch_reader(remote_store.train_data_path,
                               num_epochs=None,
                               cur_shard=hvd.rank(),
                               shard_count=hvd.size(),
                               hdfs_driver=PETASTORM_HDFS_DRIVER,
                               schema_fields=schema_fields,
                               transform_spec=transform_spec) as train_reader:
            with make_batch_reader(remote_store.val_data_path,
                                   num_epochs=None,
                                   cur_shard=hvd.rank(),
                                   shard_count=hvd.size(),
                                   hdfs_driver=PETASTORM_HDFS_DRIVER,
                                   schema_fields=schema_fields,
                                   transform_spec=transform_spec) \
                    if should_validate else empty_batch_reader() as val_reader:

                train_loader = DataLoader(
                    train_reader,
                    batch_size=batch_size,
                    shuffling_queue_capacity=shuffle_buffer_size)
                train_loader_iter = iter(train_loader)

                def prepare_batch(row):
                    inputs = [
                        prepare_np_data(row[col].float(), col,
                                        metadata).reshape(shape)
                        for col, shape in zip(feature_columns, input_shapes)
                    ]
                    labels = [
                        prepare_np_data(row[col].float(), col, metadata)
                        for col in label_columns
                    ]

                    sample_weights = row.get(sample_weight_col, None)
                    if cuda_available:
                        inputs = [input.cuda() for input in inputs]
                        labels = [label.cuda() for label in labels]
                        if sample_weights is not None:
                            sample_weights = sample_weights.cuda()
                    return inputs, labels, sample_weights

                def transform_outputs(outputs, labels):
                    if not isinstance(outputs, (tuple, list)):
                        outputs = [outputs]

                    # reshape labels to match the output shape of the model
                    if hasattr(outputs[0], 'shape'):
                        labels = [
                            label.reshape(output.shape)
                            if output.shape.numel() == label.shape.numel() else label
                            for label, output in zip(labels, outputs)
                        ]
                    return outputs, labels

                def aggregate_metrics(stage, epoch, loss, metric_value_groups):
                    all_metric_groups_values = get_metric_avgs(
                        metric_value_groups)
                    if remote_store.saving_runs:
                        write_metrics_summary(stage, epoch, loss,
                                              all_metric_groups_values,
                                              log_writer)
                    return {
                        loss.name: loss.avg.item(),
                        'all_metrics': all_metric_groups_values
                    }

                def loss_fn(outputs, labels, sample_weights):
                    loss = calculate_loss(outputs, labels, loss_weights,
                                          loss_fns, sample_weights)
                    return loss

                def print_metrics(batch_idx, loss, metric_value_groups, phase):
                    if user_verbose > 0 and hvd.rank() == 0 and \
                            batch_idx % METRIC_PRINT_FREQUENCY == 0:
                        print(
                            "epoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}"
                            .format(epoch=epoch,
                                    batch_idx=batch_idx,
                                    metrics=aggregate_metrics(
                                        phase, epoch, loss,
                                        metric_value_groups)))

                def _train(epoch):
                    model.train()
                    train_loss = metric_cls('loss', hvd)
                    metric_value_groups = construct_metric_value_holders(
                        metric_cls, metric_fn_groups, label_columns, hvd)

                    # iterate on one epoch
                    for batch_idx in range(steps_per_epoch):
                        row = next(train_loader_iter)
                        inputs, labels, sample_weights = prepare_batch(row)

                        outputs, loss = train_minibatch(
                            model, optimizer, transform_outputs, loss_fn,
                            inputs, labels, sample_weights)
                        update_metrics(metric_value_groups, outputs, labels)
                        train_loss.update(loss)
                        print_metrics(batch_idx, train_loss,
                                      metric_value_groups, 'train')

                    return aggregate_metrics('train', epoch, train_loss,
                                             metric_value_groups)

                if should_validate:
                    val_loader = DataLoader(val_reader, batch_size=batch_size)
                    val_loader_iter = iter(val_loader)
                    if validation_steps_per_epoch is None:
                        validation_steps = int(
                            math.ceil(float(val_rows) / batch_size / hvd.size()))
                    else:
                        validation_steps = validation_steps_per_epoch

                    def _validate(epoch):
                        model.eval()
                        val_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(validation_steps):
                            row = next(val_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)

                            outputs = model(*inputs)
                            outputs, labels = transform_outputs(outputs, labels)

                            loss = calculate_loss(outputs, labels, loss_weights,
                                                  loss_fns, sample_weights)
                            val_loss.update(loss)
                            update_metrics(metric_value_groups, outputs, labels)
                            print_metrics(batch_idx, val_loss,
                                          metric_value_groups, 'val')
                        return aggregate_metrics('val', epoch, val_loss,
                                                 metric_value_groups)

                history = []
                for epoch in range(epochs):
                    epoch_metrics = {
                        'epoch': epoch,
                        'train': _train(epoch)
                    }
                    if should_validate:
                        epoch_metrics['validation'] = _validate(epoch)
                    if user_verbose > 0:
                        print(epoch_metrics)
                    history.append(epoch_metrics)
                    if hvd.rank() == 0:
                        # Save model after every epoch
                        save_checkpoint()
                        if remote_store.saving_runs:
                            remote_store.sync(run_output_dir)

        if hvd.rank() == 0:
            best_checkpoint = torch.load(ckpt_file)
            serialized_checkpoint = io.BytesIO()
            torch.save(best_checkpoint, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return history, serialized_checkpoint
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    print(torch.cuda.device_count())
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    if hvd.rank() == 0:
        logging.info('gpu device = %d' % args.gpu)
        logging.info("args = %s", args)

    best_acc_top1 = 0
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(os.path.join(args.save, 'checkpoint.pth.tar'))
        best_checkpoint = torch.load(
            os.path.join(args.save, 'model_best.pth.tar'))
        start_epoch = checkpoint['epoch']
        best_acc_top1 = best_checkpoint['best_acc_top1']
    start_epoch = hvd.broadcast(torch.tensor(start_epoch),
                                root_rank=0,
                                name='start_epoch').item()
    best_acc_top1 = hvd.broadcast(torch.tensor(best_acc_top1),
                                  root_rank=0,
                                  name='best_acc_top1').item()

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary,
                    genotype)
    if args.parallel:
        model = nn.DataParallel(model).cuda()
    else:
        model = model.cuda()
    if hvd.rank() == 0:
        logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate * hvd.size(),
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # ***************** horovod *******************
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # ***************** horovod *******************

    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_data = dset.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4,
                                   contrast=0.4,
                                   saturation=0.4,
                                   hue=0.2),
            transforms.ToTensor(),
            normalize,
        ]))
    valid_data = dset.ImageFolder(
        validdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=hvd.size(), rank=hvd.rank())
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              pin_memory=True,
                                              num_workers=args.num_workers,
                                              sampler=train_sampler)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=args.num_workers)

    if start_epoch > 0 and hvd.rank() == 0:
        checkpoint = torch.load(os.path.join(args.save, 'checkpoint.pth.tar'))
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("checkpoint {}, model, optimizer was loaded".format(start_epoch))

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    if not args.resume:
        set_lr(0, 0, len(train_queue), optimizer, args.scheduler)

    for epoch in range(start_epoch, args.epochs + args.warmup_epochs):
        if hvd.rank() == 0:
            lr = optimizer.param_groups[0]['lr']
            logging.info('epoch %d lr %e', epoch, lr)
            with open(os.path.join(args.save, 'learning_rate.txt'), mode='a') as f:
                f.write(str(lr) + '\n')

        if args.parallel:
            model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        else:
            model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

        train_acc, train_obj = train(train_queue, train_sampler, model,
                                     criterion_smooth, optimizer, epoch)
        if hvd.rank() == 0:
            logging.info('train_acc %f', train_acc)
            with open(os.path.join(args.save, "train_acc.txt"), mode='a') as f:
                f.write(str(train_acc) + '\n')
            with open(os.path.join(args.save, "train_loss.txt"), mode='a') as f:
                f.write(str(train_obj) + '\n')

        valid_acc_top1, valid_acc_top5, valid_obj = infer(
            valid_queue, model, criterion)
        if hvd.rank() == 0:
            logging.info('valid_acc_top1 %f', valid_acc_top1)
            logging.info('valid_acc_top5 %f', valid_acc_top5)
            with open(os.path.join(args.save, "test_acc_1.txt"), mode='a') as f:
                f.write(str(valid_acc_top1) + '\n')
            with open(os.path.join(args.save, "test_acc_5.txt"), mode='a') as f:
                f.write(str(valid_acc_top5) + '\n')
            with open(os.path.join(args.save, "test_loss.txt"), mode='a') as f:
                f.write(str(valid_obj) + '\n')

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True

        if hvd.rank() == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save)
def setup_models(*args) -> None:
    for model in args:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
def broadcast_model(model: torch.nn.Module) -> None:
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
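These two helpers only broadcast model parameters; optimizer state usually needs the same treatment. A small sketch of a matching helper and the typical call order, with `make_model` and `make_optimizer` as hypothetical factories:

```python
import torch
import horovod.torch as hvd


def broadcast_optimizer(optimizer: torch.optim.Optimizer) -> None:
    """Push rank 0's optimizer state (momentum buffers, step counts, ...) to all workers."""
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)


# Typical call order: construct everything on every rank, then sync from rank 0.
# model = make_model()               # hypothetical factory
# optimizer = make_optimizer(model)  # hypothetical factory
# broadcast_model(model)
# broadcast_optimizer(optimizer)
```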
def main():
    args = parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    local_rank = hvd.local_rank()
    world_size = hvd.size()

    if args.cuda:
        device = torch.device(f'cuda:{local_rank}')
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(device)
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    # Horovod: use DistributedSampler to partition the training data.
    data = prepare_datasets(args, rank=local_rank, num_workers=world_size,
                            data='mnist')
    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1
    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16 if args.fp16_allreduce
                   else hvd.Compression.none)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    loss_fn = nn.CrossEntropyLoss()
    epoch_times = []
    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        train(epoch, data['training'], rank=local_rank, model=model,
              loss_fn=loss_fn, optimizer=optimizer, args=args, scaler=None)
        if epoch > 2:
            epoch_times.append(time.time() - t0)

        if epoch % 10 == 0:
            if hvd.local_rank() == 0:
                accuracy = evaluate(model=model,
                                    test_loader=data['testing'].loader)
                logger.log('-' * 75)
                logger.log(f'Epoch: {epoch}, Accuracy: {accuracy}')
                logger.log('-' * 75)

    if local_rank == 0:
        epoch_times_str = ', '.join(str(x) for x in epoch_times)
        logger.log('Epoch times:')
        logger.log(epoch_times_str)

        outdir = os.path.join(os.getcwd(), 'results_mnist', f'size{world_size}')
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

        modeldir = os.path.join(outdir, 'saved_models')
        modelfile = os.path.join(modeldir, 'hvd_model_mnist.pth')
        if not os.path.isdir(modeldir):
            os.makedirs(modeldir)

        logger.log(f'Saving model to: {modelfile}')
        torch.save(model.state_dict(), modelfile)

        args_file = os.path.join(outdir, f'args_size{world_size}.json')
        logger.log(f'Saving args to: {args_file}.')
        with open(args_file, 'at') as f:
            json.dump(args.__dict__, f, indent=4)

        times_file = os.path.join(outdir, f'epoch_times_size{world_size}.csv')
        logger.log(f'Saving epoch times to: {times_file}')
        with open(times_file, 'a') as f:
            f.write(epoch_times_str + '\n')
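The evaluation above runs only on one rank. When every rank evaluates its own shard instead, the per-rank metric is usually averaged with an allreduce. A small sketch under the assumption that `correct` and `total` are computed locally on each worker (hypothetical names):

```python
import torch
import horovod.torch as hvd


def global_accuracy(correct: int, total: int) -> float:
    """Average a locally computed accuracy across all Horovod workers."""
    acc = torch.tensor(correct / max(total, 1))
    # hvd.allreduce averages across workers by default (op=hvd.Average)
    return hvd.allreduce(acc, name="accuracy").item()
```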
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        os.environ.setdefault("NCCL_SOCKET_IFNAME", "^lo,docker")
        print(os.environ.get("NCCL_SOCKET_IFNAME"))
        print(args.dist_backend, args.dist_url, args.world_size, args.rank)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if 'efficientnet' in args.arch:  # NEW
        if args.pretrained:
            model = EfficientNet.from_pretrained(args.arch, advprop=args.advprop)
            print("=> using pre-trained model '{}'".format(args.arch))
        else:
            print("=> creating model '{}'".format(args.arch))
            model = EfficientNet.from_name(args.arch)
    else:
        if args.pretrained:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = models.__dict__[args.arch](pretrained=True)
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.use_horovod:
        print('use horovod')
        hvd.init()
        torch.cuda.set_device(hvd.local_rank())
        model = model.cuda(args.gpu)
        args.batch_size = int(args.batch_size / ngpus_per_node)
        args.n_workers = int(args.n_workers / ngpus_per_node)
        print(args.n_workers)
        compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
        # compression = hvd.Compression.none
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            compression=compression)
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.n_workers = int(args.n_workers / ngpus_per_node)
            print(args.n_workers)
            print('use DDP')
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            print('use DP')
            model = model.cuda()
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    if args.advprop:
        normalize = transforms.Lambda(lambda img: img * 2.0 - 1.0)
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    if 'efficientnet' in args.arch:
        image_size = EfficientNet.get_image_size(args.arch)
    else:
        image_size = args.image_size

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.use_horovod:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    elif args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.n_workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_transforms = transforms.Compose([
        transforms.Resize(image_size, interpolation=PIL.Image.BICUBIC),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])
    print('Using image size', image_size)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, val_transforms),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_workers,
        pin_memory=True)

    if args.evaluate:
        res = validate(val_loader, model, criterion, args)
        with open('res.txt', 'w') as f:
            print(res, file=f)
        return

    scaler = GradScaler()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        start_time = time.time()
        train(train_loader, model, criterion, optimizer, scaler, epoch, args)
        end_time = time.time()
        with open('benchmarks_1209.txt', 'a') as f:
            f.write('\texecution time per single epoch: ' +
                    str(end_time - start_time) + '\n')
        exit()

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, scaler, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
def train(args):
    # initialize Horovod library
    hvd.init()
    # Horovod limits CPU threads to be used per worker
    torch.set_num_threads(1)

    # disable logging for processes except 0 on every node
    if hvd.local_rank() != 0:
        f = open(os.devnull, "w")
        sys.stdout = sys.stderr = f
    elif not os.path.exists(args.dir):
        # create 40 random image, mask pairs on master node for training
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose(
        [
            LoadNiftid(keys=["img", "seg"]),
            AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
            ScaleIntensityd(keys="img"),
            RandCropByPosNegLabeld(
                keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
            ),
            RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]),
            ToTensord(keys=["img", "seg"]),
        ]
    )

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler
    train_sampler = DistributedSampler(train_ds, num_replicas=hvd.size(), rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
        multiprocessing_context=multiprocessing_context,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{hvd.local_rank()}")
    model = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss_function = monai.losses.DiceLoss(sigmoid=True).to(device)
    optimizer = torch.optim.Adam(model.parameters(), 1e-3)
    # Horovod broadcasts parameters & optimizer state
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # Horovod wraps optimizer with DistributedOptimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # start a typical PyTorch training
    epoch_loss_values = list()
    for epoch in range(5):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{5}")
        model.train()
        epoch_loss = 0
        step = 0
        train_sampler.set_epoch(epoch)
        for batch_data in train_loader:
            step += 1
            inputs, labels = batch_data["img"].to(device), batch_data["seg"].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_len = len(train_ds) // train_loader.batch_size
            print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
        epoch_loss /= step
        epoch_loss_values.append(epoch_loss)
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
    print(f"train completed, epoch losses: {epoch_loss_values}")
    if hvd.rank() == 0:
        # all processes should see same parameters as they all start from same
        # random parameters and gradients are synchronized in backward passes,
        # therefore, saving it in one process is sufficient
        torch.save(model.state_dict(), "final_model.pth")
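Since only rank 0 writes `final_model.pth`, reloading it later needs no Horovod at all. A minimal sketch for loading the checkpoint on a single process, assuming the UNet constructor arguments match the training configuration above:

```python
import torch
from monai.networks.nets import UNet

# must mirror the architecture used during training above
model = UNet(
    dimensions=3,
    in_channels=1,
    out_channels=1,
    channels=(16, 32, 64, 128, 256),
    strides=(2, 2, 2, 2),
    num_res_units=2,
)
model.load_state_dict(torch.load("final_model.pth", map_location="cpu"))
model.eval()
```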
def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Module: """Helper method to adapt provided model for non-distributed and distributed configurations (supporting all available backends from :meth:`~ignite.distributed.utils.available_backends()`). Internally, we perform the following: - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device. - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1. - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available. - broadcast the initial variable states from rank 0 to all other processes if the Horovod distributed framework is used. Examples: .. code-block:: python import ignite.distributed as idist model = idist.auto_model(model) In addition, with NVidia/Apex it can be used in the following way: .. code-block:: python import ignite.distributed as idist model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) model = idist.auto_model(model) Args: model (torch.nn.Module): model to adapt. sync_bn (bool): if True, applies `torch convert_sync_batchnorm`_ to the model for native torch distributed only. Default, False. Note, if using Nvidia/Apex, batchnorm conversion should be applied before calling ``amp.initialize``. **kwargs: kwargs to model's wrapping class: `torch DistributedDataParallel`_ or `torch DataParallel`_ if applicable. Please make sure to use acceptable kwargs for the given backend. Returns: torch.nn.Module .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html ..
_torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html# torch.nn.SyncBatchNorm.convert_sync_batchnorm """ logger = setup_logger(__name__ + ".auto_model") # Put model's parameters to device if its parameters are not on the device device = idist.device() if not all([p.device == device for p in model.parameters()]): model.to(device) # distributed data parallel model if idist.get_world_size() > 1: bnd = idist.backend() if idist.has_native_dist_support and bnd == idist_native.NCCL: if sync_bn: logger.info("Convert batch norm to sync batch norm") model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if "device_ids" in kwargs: raise ValueError( f"Argument kwargs should not contain 'device_ids', but got {kwargs}" ) lrank = idist.get_local_rank() logger.info( f"Apply torch DistributedDataParallel on model, device id: {lrank}" ) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[ lrank, ], **kwargs) elif idist.has_native_dist_support and bnd == idist_native.GLOO: if sync_bn: logger.info("Convert batch norm to sync batch norm") model = nn.SyncBatchNorm.convert_sync_batchnorm(model) logger.info("Apply torch DistributedDataParallel on model") model = torch.nn.parallel.DistributedDataParallel(model, **kwargs) elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD: import horovod.torch as hvd logger.info( "Broadcast the initial variable states from rank 0 to all other processes" ) hvd.broadcast_parameters(model.state_dict(), root_rank=0) # not distributed but multiple GPUs reachable so data parallel model elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type: logger.info("Apply torch DataParallel on model") model = torch.nn.parallel.DataParallel(model, **kwargs) return model
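# For context (a hedged sketch, not part of the ignite source above): auto_model is
# usually combined with idist.auto_optim and idist.auto_dataloader inside an
# idist.Parallel launcher. The model, optimizer, and config names below are
# placeholders.
import torch.nn as nn
import torch.optim as optim
import ignite.distributed as idist

def training(local_rank, config):
    model = idist.auto_model(nn.Linear(10, 1))               # device placement + wrapping
    optimizer = idist.auto_optim(optim.SGD(model.parameters(), lr=0.1))
    # ... build loaders with idist.auto_dataloader and run the training loop ...

if __name__ == "__main__":
    # backend can be "horovod", "nccl", "gloo", or None for a non-distributed run
    with idist.Parallel(backend="horovod") as parallel:
        parallel.run(training, config={})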
def main(): torch.backends.cudnn.benchmark = True args = getArgs() torch.manual_seed(args.seed) args.cuda = torch.cuda.is_available() if args.cuda: device = torch.device('cuda') else: device = torch.device('cpu') # initialize Horovod hvd.init() torch.manual_seed(args.seed) # print the configuration used for training if hvd.rank() == 0: print("Training with configuration: ") for arg in vars(args): print("{}:\t{}".format(arg, getattr(args, arg))) if not osp.exists(args.save_model_path): os.makedirs(args.save_model_path) # save the training configuration with open(osp.join(args.save_model_path, 'train-config.json'), 'w') as f: json.dump(args.__dict__, f, indent=4) # set the random seed so that weight initialization is identical on every GPU if args.cuda: # Pin GPU to local rank torch.cuda.set_device(hvd.local_rank()) # this line may be unnecessary, but per Horovod's guidance it is kept anyway torch.cuda.manual_seed(args.seed) # data dataset_train = SpineDataset(root=args.data, transform=my_transform) # distributed training requires this sampler sampler_train = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank()) dataloader_train = DataLoader(dataset_train, batch_size=1, sampler=sampler_train, num_workers=args.num_workers, pin_memory=True) # model if args.network == 'DeepLab': if args.voc: model = gcv.models.get_deeplab_resnet101_voc(pretrained=True) elif args.ade: model = gcv.models.get_deeplab_resnet101_ade(pretrained=True) else: model = gcv.models.DeepLabV3(nclass=args.num_classes, backbone=args.backbone) model.auxlayer.conv5[-1] = nn.Conv2d(256, args.num_classes, kernel_size=1) model.head.block[-1] = nn.Conv2d(256, args.num_classes, kernel_size=1) elif args.network == 'FCN': if args.voc: model = gcv.models.get_fcn_resnet101_voc(pretrained=True) elif args.ade: model = gcv.models.get_fcn_resnet101_ade(pretrained=True) else: model = gcv.models.FCN(nclass=args.num_classes, backbone=args.backbone) model.auxlayer.conv5[-1] = nn.Conv2d(256, args.num_classes, kernel_size=1) model.head.conv5[-1] = nn.Conv2d(512, args.num_classes, kernel_size=1) elif args.network == 'PSPNet': if args.voc: model = gcv.models.get_psp_resnet101_voc(pretrained=True) elif args.ade: model = gcv.models.get_psp_resnet101_ade(pretrained=True) else: model = gcv.models.PSP(nclass=args.num_classes, backbone=args.backbone) model.auxlayer.conv5[-1] = nn.Conv2d(256, 2, kernel_size=1) model.head.conv5[-1] = nn.Conv2d(512, args.num_classes, kernel_size=1) elif args.network == 'UNet': model = UNet(n_class=args.num_classes, backbone=args.backbone, pretrained=True) model = convert_syncbn_model(model) model = model.to(device) # the optimizer must be wrapped with the hvd version # optimizer = torch.optim.Adam(model.parameters(), args.learning_rate * hvd.size()) # use different learning rates for different layers if args.network == 'UNet': optimizer = torch.optim.SGD([ { 'params': model.down_blocks.parameters(), 'lr': args.learning_rate * 0.5 }, { 'params': model.bridge.parameters() }, { 'params': model.head.parameters() }, ], lr=args.learning_rate, momentum=0.9, weight_decay=0.0001) elif args.network in ['FCN', 'PSPNet', 'DeepLab']: optimizer = optim.SGD([{ 'params': model.pretrained.parameters(), 'lr': args.learning_rate * 0.5 }, { 'params': model.auxlayer.parameters() }, { 'params': model.head.parameters() }], lr=args.learning_rate, momentum=0.9, weight_decay=0.0001) else: optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=0.0001) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) # broadcast model and optimizer parameters to every GPU hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # lr scheduler def poly_lr_scheduler(epoch,
num_epochs=args.num_epochs, power=args.power): return (1 - epoch / num_epochs)**power lr_scheduler = LambdaLR(optimizer=optimizer, lr_lambda=poly_lr_scheduler) def train(epoch): model.train() # Horovod: set epoch to sampler for shuffling. sampler_train.set_epoch(epoch) lr_scheduler.step() loss_fn = nn.CrossEntropyLoss() for batch_idx, (data, target) in enumerate(dataloader_train): data = data.to(device).squeeze() target = target.to(device).squeeze() for batch_data, batch_target in zip( torch.split(data, args.batch_size), torch.split(target, args.batch_size)): optimizer.zero_grad() output = model(batch_data) if args.network in ['FCN', 'PSPNet', 'DeepLab']: loss = loss_fn(output[0], batch_target) \ + 0.2*loss_fn(output[1], batch_target) elif args.network == 'UNet': loss = loss_fn(output, batch_target) loss.backward() optimizer.step() if hvd.rank() == 0 and batch_idx % args.log_interval == 0: print("Train loss: ", loss.item()) for epoch in range(args.num_epochs): train(epoch) if hvd.rank() == 0: print("Saving model to {}".format( osp.join(args.save_model_path, "checkpoint-{:0>3d}.pth".format(epoch)))) torch.save({'state_dict': model.state_dict()}, osp.join(args.save_model_path, "checkpoint-{:0>3d}.pth".format(epoch)))
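# A quick standalone check of the polynomial decay used by poly_lr_scheduler above:
# the factor applied to the base learning rate is (1 - epoch / num_epochs) ** power.
# num_epochs=100 and power=0.9 are example values only; the script reads them from args.
def poly_factor(epoch, num_epochs=100, power=0.9):
    return (1 - epoch / num_epochs) ** power

for e in (0, 25, 50, 75, 99):
    print(e, round(poly_factor(e), 3))   # approx. 1.0, 0.772, 0.536, 0.287, 0.016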
def main(): # Training settings parser = argparse.ArgumentParser(description='D-DNN imagenet benchmark') parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', choices=model_names, help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet50)') parser.add_argument('--lr', type=float, default=0.0125, metavar='LR', help='learning rate (default: 0.0125)') parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum (default: 0.9)') parser.add_argument('--wd', type=float, default=0.00005, help='weight decay') parser.add_argument('--warmup-epochs', type=float, default=1, help='number of warmup epochs') # Value of args.synthetic_data may seem confusing, but those values # come from bash and there 0=true and all else =false parser.add_argument('-s', '--synthetic_data', type=int, default=0, help="Use synthetic data") args = parser.parse_args() # Horovod: initialize library. hvd.init() torch.manual_seed(1) # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(1) cudnn.benchmark = True # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(cores_gpu) normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_comp = [ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ] val_comp = [ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize ] if args.synthetic_data == -1: # Load highres data traindir = datadir + '/HIGHRES/train' valdir = datadir + '/HIGHRES/val' train_comp = [transforms.ToTensor(), normalize] val_comp = [transforms.ToTensor(), normalize] elif args.synthetic_data: # Load normal data traindir = datadir + '/train' valdir = datadir + '/val' else: # Load synthetic data traindir = datadir + '/IMAGENET/train' valdir = datadir + '/IMAGENET/val' train_dataset = \ datasets.ImageFolder(traindir, transform=transforms.Compose(train_comp)) val_dataset = \ datasets.ImageFolder(valdir, transform=transforms.Compose(val_comp)) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. kwargs = {'num_workers': cores_gpu, 'pin_memory': True} train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) global train_loader train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset, num_replicas=hvd.size(), rank=hvd.rank()) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler, **kwargs) # create model print("=> creating model '{}'".format(args.arch)) model = model_names[args.arch].cuda() # Horovod: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=(args.lr * batches_per_allreduce * hvd.size()), momentum=args.momentum, weight_decay=args.wd) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), backward_passes_per_step=batches_per_allreduce, op=hvd.Average) # Horovod: broadcast parameters & optimizer state. 
hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Run program throughputs = [] elapsed_times = [] for epoch in range(0, epochs): throughput, elapsed_time = train(epoch, args, model, train_sampler, train_loader, optimizer, val_sampler, val_loader) throughputs.append(throughput) elapsed_times.append(elapsed_time) _, valid_accuracy = test(model, val_sampler, val_loader) n = len(throughputs) throughput = sum(throughputs) / n if n > 0 else 0.0 elapsed_time = sum(elapsed_times) / n if n > 0 else 0.0 print('valid accuracy: %.4f | %.3f samples/sec, %.3f sec/epoch (average)' '' % (valid_accuracy, throughput, elapsed_time))
def train(): cfg = opt.cfg data = opt.data img_size = opt.img_size epochs = 1 if opt.prebias else opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs batch_size = opt.batch_size accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 weights = opt.weights # initial training weights if 'pw' not in opt.arc: # remove BCELoss positive weights hyp['cls_pw'] = 1. hyp['obj_pw'] = 1. # Horovod: initialize library. hvd.init() # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(6) # Horovod: Pin GPU to be used to process local rank (one GPU per process) torch.cuda.set_device(hvd.local_rank()) device = torch.device('cuda:{}'.format(hvd.local_rank())) # Initialize init_seeds() multi_scale = opt.multi_scale if multi_scale: img_sz_min = round(img_size / 32 / 1.5) + 1 img_sz_max = round(img_size / 32 * 1.5) - 1 img_size = img_sz_max * 32 # initiate with maximum multi_scale size print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) # Configure run data_dict = parse_data_cfg(data) train_path = data_dict['train'] #train_path = data_dict['valid'] nc = int(data_dict['classes']) # number of classes # Remove previous results # Horovod: other node could fail on delete file if hvd.local_rank() == 0: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Initialize model model = Darknet(cfg, arc=opt.arc).to(device) # Optimizer pg0, pg1 = [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if 'Conv2d.weight' in k: pg1 += [v] # parameter group 1 (apply weight_decay) else: pg0 += [v] # parameter group 0 # Horovod: scale learning rate by the number of GPUs. if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'] * hvd.size()) # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'] * hvd.size(), momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay del pg0, pg1 # Horovod: Add Horovod Distributed Optimizer optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), backward_passes_per_step=accumulate) cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 best_fitness = float('inf') if hvd.rank() == 0: attempt_download(weights) if weights.endswith('.pt'): # pytorch format # possible weights are 'last.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc. if opt.bucket: os.system('gsutil cp gs://%s/last.pt %s' % (opt.bucket, last)) # download from bucket chkpt = torch.load(weights, map_location=device) # load model # if opt.transfer: chkpt['model'] = { k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel() } model.load_state_dict(chkpt['model'], strict=False) # else: # model.load_state_dict(chkpt['model']) # load optimizer if chkpt['optimizer'] is not None: optimizer.load_state_dict(chkpt['optimizer']) best_fitness = chkpt['best_fitness'] # load results if chkpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(chkpt['training_results']) # write results.txt start_epoch = chkpt['epoch'] + 1 del chkpt elif len(weights) > 0: # darknet format # possible weights are 'yolov3.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. cutoff = load_darknet_weights(model, weights) if opt.transfer or opt.prebias: # transfer learning edge (yolo) layers nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 
255) if opt.prebias: for p in optimizer.param_groups: # lower param count allows more aggressive training settings: i.e. SGD ~0.1 lr0, ~0.9 momentum p['lr'] *= 100 # lr gain if p.get('momentum') is not None: # for SGD but not Adam p['momentum'] *= 0.9 for p in model.parameters(): if opt.prebias and p.numel() == nf: # train (yolo biases) p.requires_grad = True elif opt.transfer and p.shape[ 0] == nf: # train (yolo biases+weights) p.requires_grad = True else: # freeze layer p.requires_grad = False # Scheduler https://github.com/ultralytics/yolov3/issues/238 # lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8) # gradual fall to 0.1*lr0 scheduler = lr_scheduler.MultiStepLR( optimizer, milestones=[round(opt.epochs * x) for x in [0.8, 0.9]], gamma=0.1) scheduler.last_epoch = start_epoch - 1 # Horovod: Broadcast parameters from rank 0 to all other processes. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) start_epoch = hvd.broadcast(torch.tensor(start_epoch), root_rank=0, name='start_epoch').item() # # Plot lr schedule # y = [] # for _ in range(epochs): # scheduler.step() # y.append(optimizer.param_groups[0]['lr']) # plt.plot(y, label='LambdaLR') # plt.xlabel('epoch') # plt.ylabel('LR') # plt.tight_layout() # plt.savefig('LR.png', dpi=300) # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Initialize distributed training # Horovod: Mark these due to conflic with Horovod ''' if torch.cuda.device_count() > 1: dist.init_process_group(backend='nccl', # 'distributed backend' init_method='tcp://127.0.0.1:9999', # distributed training init method world_size=1, # number of nodes for distributed training rank=0) # distributed training node rank model = torch.nn.parallel.DistributedDataParallel(model) model.yolo_layers = model.module.yolo_layers # move yolo layer indices to top level ''' # Dataset dataset = LoadImagesAndLabels( train_path, img_size, batch_size, augment=True, hyp=hyp, # augmentation hyperparameters rect=opt.rect, # rectangular training image_weights=opt.img_weights, cache_labels=True if epochs > 10 else False, cache_images=False if opt.prebias else opt.cache_images) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. 
train_sampler = torch.utils.data.distributed.DistributedSampler( dataset, num_replicas=hvd.size(), rank=hvd.rank()) # Dataloader dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, num_workers= 1, #Horovod: multi-worker will fail =min([os.cpu_count(), batch_size, 16]), shuffle= False, #not opt.rect, # Shuffle=True unless rectangular training is used pin_memory=True, sampler=train_sampler, #Horovod: sampler can not use with shuffle collate_fn=dataset.collate_fn) # Start training model.nc = nc # attach number of classes to model model.arc = opt.arc # attach yolo architecture model.hyp = hyp # attach hyperparameters to model # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights torch_utils.model_info(model, report='summary') # 'full' or 'summary' nb = len(dataloader) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' t0 = time.time() print('Starting %s for %g epochs...' % ('prebias' if opt.prebias else 'training', epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) freeze_backbone = False if freeze_backbone and epoch < 2: for name, p in model.named_parameters(): if int(name.split('.')[1]) < cutoff: # if layer < 75 p.requires_grad = False if epoch == 0 else True # Update image weights (optional) if dataset.image_weights: w = model.class_weights.cpu().numpy() * (1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx mloss = torch.zeros(4).to(device) # mean losses # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) if hvd.rank() == 0: pbar = tqdm(enumerate(dataloader), total=nb) # progress bar else: pbar = enumerate(dataloader) for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device) targets = targets.to(device) # Multi-Scale training if multi_scale: if ni / accumulate % 10 == 0: # adjust (67% - 150%) every 10 batches img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32 sf = img_size / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [ math.ceil(x * sf / 32.) 
* 32 for x in imgs.shape[2:] ] # new shape (stretched to 32-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Plot images with bounding boxes # Horovod: only root can do it if ni == 0 and hvd.rank() == 0: fname = 'train_batch%g.jpg' % i plot_images(imgs=imgs, targets=targets, paths=paths, fname=fname) if tb_writer: tb_writer.add_image(fname, cv2.imread(fname)[:, :, ::-1], dataformats='HWC') # Hyperparameter burn-in # n_burn = nb - 1 # min(nb // 5 + 1, 1000) # number of burn-in batches # if ni <= n_burn: # for m in model.named_modules(): # if m[0].endswith('BatchNorm2d'): # m[1].momentum = 1 - i / n_burn * 0.99 # BatchNorm2d momentum falls from 1 - 0.01 # g = (i / n_burn) ** 4 # gain rises from 0 - 1 # for x in optimizer.param_groups: # x['lr'] = hyp['lr0'] * g # x['weight_decay'] = hyp['weight_decay'] * g # Run model pred = model(imgs) # Compute loss loss, loss_items = compute_loss(pred, targets, model) if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Scale loss by nominal batch_size of 64 loss *= batch_size / 64 # Compute gradient if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Accumulate gradient for x batches before optimizing if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() # Print batch results if hvd.rank() == 0: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = torch.cuda.memory_cached( ) / 1E9 if torch.cuda.is_available() else 0 # (GB) s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size) pbar.set_description(s) # end batch ------------------------------------------------------------------------------------------------ # Update scheduler scheduler.step() # Process epoch results final_epoch = epoch + 1 == epochs if opt.prebias: print_model_biases(model) else: # Calculate mAP (always test final epoch, skip first 10 if opt.nosave) if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch: with torch.no_grad(): results, maps = test.test( cfg, data, hvd=hvd, dist_test=opt.disttest, batch_size=batch_size, img_size=opt.img_size, model=model, conf_thres=0.001 if final_epoch and epoch > 0 else 0.1, # 0.1 for speed save_json=final_epoch and epoch > 0 and 'coco.data' in data) # Write epoch results if hvd.rank() == 0: with open(results_file, 'a') as f: f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) # Write Tensorboard results if tb_writer: x = list(mloss) + list(results) titles = [ 'GIoU', 'Objectness', 'Classification', 'Train loss', 'Precision', 'Recall', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' ] for xi, title in zip(x, titles): tb_writer.add_scalar(title, xi, epoch) # Update best mAP fitness = sum(results[4:]) # total loss if fitness < best_fitness: best_fitness = fitness # Save training results save = (not opt.nosave) or (final_epoch and not opt.evolve) or opt.prebias # Horovod: save only on first rank. 
if save and hvd.rank() == 0: with open(results_file, 'r') as f: # Create checkpoint chkpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last checkpoint torch.save(chkpt, last) if opt.bucket and not opt.prebias: os.system('gsutil cp %s gs://%s' % (last, opt.bucket)) # upload to bucket # Save best checkpoint if best_fitness == fitness: torch.save(chkpt, best) # Save backup every 10 epochs (optional) if epoch > 0 and epoch % 10 == 0: torch.save(chkpt, wdir + 'backup%g.pt' % epoch) # Delete checkpoint del chkpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if len(opt.name) and hvd.rank() == 0: os.rename('results.txt', 'results_%s.txt' % opt.name) os.rename(wdir + 'best.pt', wdir + 'best_%s.pt' % opt.name) plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if torch.cuda.device_count() > 1 else None torch.cuda.empty_cache() # save to cloud # os.system(gsutil cp results.txt gs://...) # os.system(gsutil cp weights/best.pt gs://...) return results
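# The YOLOv3 loop above accumulates gradients for `accumulate` batches and sets
# backward_passes_per_step to the same value, so Horovod performs one allreduce per
# optimizer step. A stripped-down, self-contained sketch of that pairing (toy model
# and data; names here are illustrative only) follows.
import torch
import torch.nn as nn
import torch.nn.functional as F
import horovod.torch as hvd

hvd.init()
accumulate = 4                                                 # backward() calls per optimizer step
model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    backward_passes_per_step=accumulate)

optimizer.zero_grad()
for i in range(16):
    x, y = torch.randn(4, 8), torch.randn(4, 1)
    loss = F.mse_loss(model(x), y) / accumulate                # keep the gradient scale stable
    loss.backward()
    if (i + 1) % accumulate == 0:
        optimizer.step()
        optimizer.zero_grad()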
def train( model, epochs=1000, batch_size=64, train_index_path="../data_aishell/train-sort.manifest", dev_index_path="../data_aishell/dev.manifest", labels_path="../data_aishell/labels.json", learning_rate=0.6, momentum=0.8, max_grad_norm=0.2, weight_decay=0, ): hvd.init() torch.manual_seed(1024) torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(1024) # dataset loader train_dataset = data.MASRDataset(train_index_path, labels_path) batchs = (len(train_dataset) + batch_size - 1) // batch_size train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_dataloader = data.MASRDataLoader(train_dataset, batch_size=batch_size, num_workers=4, sampler=train_sampler) dev_dataset = data.MASRDataset(dev_index_path, labels_path) dev_sampler = torch.utils.data.distributed.DistributedSampler( dev_dataset, num_replicas=hvd.size(), rank=hvd.rank()) dev_dataloader = data.MASRDataLoader(dev_dataset, batch_size=batch_size, num_workers=1, sampler=dev_sampler) # optimizer parameters = model.parameters() optimizer = torch.optim.SGD( parameters, lr=learning_rate * hvd.size(), momentum=momentum, nesterov=True, weight_decay=weight_decay, ) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression) ctcloss = nn.CTCLoss() # lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.985) writer = tensorboard.SummaryWriter() gstep = 0 for epoch in range(epochs): epoch_loss = 0 # lr_sched.step() lr = get_lr(optimizer) if hvd.rank() == 0: writer.add_scalar("lr/epoch", lr, epoch) for i, (x, y, x_lens, y_lens) in enumerate(train_dataloader): x = x.to(device) out, out_lens = model(x, x_lens) out = out.transpose(0, 1).transpose(0, 2).log_softmax(2) loss = ctcloss(out, y, out_lens, y_lens) optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() epoch_loss += loss.item() if hvd.rank() == 0: writer.add_scalar("loss/step", loss.item(), gstep) gstep += 1 print("[{}/{}][{}/{}]\tLoss = {}".format( epoch + 1, epochs, i, int(batchs), loss.item())) epoch_loss = epoch_loss / batchs cer = eval(model, dev_dataloader) writer.add_scalar("loss/epoch", epoch_loss, epoch) writer.add_scalar("cer/epoch", cer, epoch) print("Epoch {}: Loss= {}, CER = {}".format(epoch, epoch_loss, cer)) torch.save(model.state_dict(), "pretrained/model_{}.pth".format(epoch))
def train_and_eval(tag, dataroot, trans_type=TRANSFORMATION.clean, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, horovod=False): print('----------------------------') print('Augments for model training') print('>>> tag:', tag) print('>>> dataroot:', dataroot) print('>>> save_path:', save_path) print('>>> eval:', only_eval) print('>>> horovod:', horovod) print('----------------------------') if horovod: import horovod.torch as hvd hvd.init() device = torch.device('cuda', hvd.local_rank()) torch.cuda.set_device(device) if not reporter: reporter = lambda **kwargs: 0 max_epoch = C.get()['epoch'] start = time.monotonic() trainsampler, trainloader, validloader, testloader_ = get_dataloaders( C.get()['dataset'], C.get()['batch'], dataroot, trans_type=trans_type, split=test_ratio, split_idx=cv_fold, horovod=horovod) trans_cost = time.monotonic() - start print('Cost for transformation:', round(trans_cost / 60., 6)) # create a model & an optimizer model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=(not horovod)) criterion = nn.CrossEntropyLoss() if C.get()['optimizer']['type'] == 'sgd': optimizer = optim.SGD(model.parameters(), lr=C.get()['lr'], momentum=C.get()['optimizer'].get( 'momentum', 0.9), weight_decay=C.get()['optimizer']['decay'], nesterov=C.get()['optimizer']['nesterov']) else: raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type']) is_master = True if horovod: optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) optimizer._requires_update = set( ) # issue : https://github.com/horovod/horovod/issues/1099 hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) if hvd.rank() != 0: is_master = False logger.debug('is_master=%s' % is_master) lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine') if lr_scheduler_type == 'cosine': scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=C.get()['epoch'], eta_min=0.) elif lr_scheduler_type == 'resnet': scheduler = adjust_learning_rate_resnet(optimizer) else: raise ValueError('invalid lr_schduler={}'.format(lr_scheduler_type)) if C.get()['lr_schedule'].get('warmup', None): scheduler = GradualWarmupScheduler( optimizer, multiplier=C.get()['lr_schedule']['warmup']['multiplier'], total_epoch=C.get()['lr_schedule']['warmup']['epoch'], after_scheduler=scheduler) if not tag or not is_master: from utils.metrics import SummaryWriterDummy as SummaryWriter logger.warning('tag not provided, no tensorboard log.') else: from tensorboardX import SummaryWriter writers = [ SummaryWriter(log_dir='./logs/{}/{}'.format(tag, x)) for x in ['train', 'valid', 'test'] ] result = OrderedDict() epoch_start = 1 if save_path and os.path.exists(save_path): logger.info('Found file [{}]. Loading...'.format(save_path)) data = torch.load(save_path) if 'model' in data or 'state_dict' in data: key = 'model' if 'model' in data else 'state_dict' logger.info('checkpoint epoch@{}'.format(data['epoch'])) if not isinstance(model, DataParallel): model.load_state_dict({ k.replace('module.', ''): v for k, v in data[key].items() }) else: model.load_state_dict({ k if 'module.' in k else 'module.' + k: v for k, v in data[key].items() }) optimizer.load_state_dict(data['optimizer']) if data['epoch'] < C.get()['epoch']: epoch_start = data['epoch'] else: only_eval = True else: model.load_state_dict({k: v for k, v in data.items()}) del data else: logger.info('[{}] file not found. 
Skip to pretrain weights...'.format( save_path)) if only_eval: logger.warning( 'model checkpoint not found. only-evaluation mode is off.') only_eval = False if only_eval: logger.info('evaluation only+') model.eval() rs = dict() rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0]) rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1]) rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2]) for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']): if setname not in rs: continue result['{}_{}'.format(key, setname)] = rs[setname][key] result['epoch'] = 0 return result # train loop best_top1 = 0 for epoch in range(epoch_start, max_epoch + 1): if horovod: trainsampler.set_epoch(epoch) model.train() rs = dict() rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler) model.eval() if math.isnan(rs['train']['loss']): raise Exception('train loss is NaN.') if epoch % 5 == 0 or epoch == max_epoch: rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=epoch, writer=writers[1], verbose=is_master) rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=epoch, writer=writers[2], verbose=is_master) if metric == 'last' or rs[metric]['top1'] > best_top1: if metric != 'last': best_top1 = rs[metric]['top1'] for key, setname in itertools.product( ['loss', 'top1', 'top5'], ['train', 'valid', 'test']): result['{}_{}'.format(key, setname)] = rs[setname][key] result['epoch'] = epoch writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch) writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch) reporter(loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'], loss_test=rs['test']['loss'], top1_test=rs['test']['top1']) # save checkpoint if is_master and save_path: logger.info('save model@%d to %s' % (epoch, save_path)) torch.save( { 'epoch': epoch, 'log': { 'train': rs['train'].get_dict(), 'valid': rs['valid'].get_dict(), 'test': rs['test'].get_dict(), }, 'optimizer': optimizer.state_dict(), 'model': model.state_dict() }, save_path) torch.save( { 'epoch': epoch, 'log': { 'train': rs['train'].get_dict(), 'valid': rs['valid'].get_dict(), 'test': rs['test'].get_dict(), }, 'optimizer': optimizer.state_dict(), 'model': model.state_dict() }, save_path.replace( '.pth', '_e%d_top1_%.3f_%.3f' % (epoch, rs['train']['top1'], rs['test']['top1']) + '.pth')) del model torch.cuda.empty_cache() result['top1_test'] = best_top1 result['trans_cost'] = trans_cost return result
def fit( self, train_loader, epoch, bert_optimizer=None, num_epochs=1, num_gpus=None, lr=2e-5, warmup_proportion=None, fp16_allreduce=False, num_train_optimization_steps=10, ): """ Method to fine-tune the bert classifier using the given training data Args: train_loader(torch.DataLoader): Torch Dataloader created from Torch Dataset epoch(int): Current epoch number of training. bert_optimizer(optimizer): optimizer can be BERTAdam for local and Dsitributed if Horovod num_epochs(int): the number of epochs to run num_gpus(int): the number of gpus. If None is specified, all available GPUs will be used. lr (float): learning rate of the adam optimizer. defaults to 2e-5. warmup_proportion (float, optional): proportion of training to perform linear learning rate warmup for. e.g., 0.1 = 10% of training. defaults to none. fp16_allreduce(bool): if true, use fp16 compression during allreduce num_train_optimization_steps: number of steps the optimizer should take. """ device, num_gpus = get_device(num_gpus) self.model = move_model_to_device(self.model, device) self.model = parallelize_model(self.model, device, num_gpus=num_gpus) if bert_optimizer is None: bert_optimizer = self.create_optimizer( num_train_optimization_steps=num_train_optimization_steps, lr=lr, warmup_proportion=warmup_proportion, fp16_allreduce=fp16_allreduce, ) if self.use_distributed: hvd.broadcast_parameters(self.model.state_dict(), root_rank=0) loss_func = nn.CrossEntropyLoss().to(device) # train self.model.train() # training mode token_type_ids_batch = None num_print = 1000 for batch_idx, data in enumerate(train_loader): x_batch = data["token_ids"] x_batch = x_batch.cuda() y_batch = data["labels"] y_batch = y_batch.cuda() mask_batch = data["input_mask"] mask_batch = mask_batch.cuda() if "token_type_ids" in data and data["token_type_ids"] is not None: token_type_ids_batch = data["token_type_ids"] token_type_ids_batch = token_type_ids_batch.cuda() bert_optimizer.zero_grad() y_h = self.model( input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None, ) loss = loss_func(y_h, y_batch).mean() loss.backward() bert_optimizer.synchronize() bert_optimizer.step() if batch_idx % num_print == 0: print( "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}". format( epoch, num_epochs, 100.0 * batch_idx / len(train_loader), batch_idx + 1, loss.item(), )) del [x_batch, y_batch, mask_batch, token_type_ids_batch] torch.cuda.empty_cache()
def start_training(cfg): set_random_seed(cfg.seed) n_gpu = hvd.size() cfg.n_gpu = n_gpu device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) if hvd.rank() != 0: LOGGER.disabled = True LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), bool(cfg.fp16))) model = setup_model(cfg, device=device) model.train() optimizer = setup_e2e_optimizer(model, cfg) # Horovod: (optional) compression algorithm.compressin compression = hvd.Compression.none optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) model, optimizer = amp.initialize(model, optimizer, enabled=cfg.fp16, opt_level='O2', keep_batchnorm_fp32=True) # prepare data tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir) train_loader, val_loader = setup_dataloaders(cfg, tokenizer) # compute the number of steps and update cfg total_n_examples = len(train_loader.dataset) * cfg.max_n_example_per_group total_train_batch_size = int(n_gpu * cfg.train_batch_size * cfg.gradient_accumulation_steps * cfg.max_n_example_per_group) cfg.num_train_steps = int( math.ceil(1. * cfg.num_train_epochs * total_n_examples / total_train_batch_size)) cfg.valid_steps = int( math.ceil(1. * cfg.num_train_steps / cfg.num_valid / cfg.min_valid_steps)) * cfg.min_valid_steps actual_num_valid = int( math.floor(1. * cfg.num_train_steps / cfg.valid_steps)) + 1 # restore restorer = TrainingRestorer(cfg, model, optimizer) global_step = restorer.global_step TB_LOGGER.global_step = global_step if hvd.rank() == 0: LOGGER.info("Saving training meta...") save_training_meta(cfg) path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml") with open(path, "w") as f: f.write(model.cnn.config_file) LOGGER.info("Saving training done...") TB_LOGGER.create(join(cfg.output_dir, 'log')) model_saver = ModelSaver(join(cfg.output_dir, "ckpt")) add_log_to_file(join(cfg.output_dir, "log", "log.txt")) pbar = tqdm(total=cfg.num_train_steps) else: LOGGER.disabled = True model_saver = NoOp() restorer = NoOp() pbar = NoOp() if global_step > 0: pbar.update(global_step) LOGGER.info(cfg) LOGGER.info("Starting training...") LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info( f" Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}") LOGGER.info(f" max_n_example_per_group = {cfg.max_n_example_per_group}") LOGGER.info(f" Accumulate steps = {cfg.gradient_accumulation_steps}") LOGGER.info( f" Total batch size = #GPUs * Single-GPU batch size * " f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}" ) LOGGER.info(f" Total #epochs = {cfg.num_train_epochs}") LOGGER.info(f" Total #steps = {cfg.num_train_steps}") LOGGER.info( f" Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times" ) # quick hack for amp delay_unscale bug with optimizer.skip_synchronize(): optimizer.zero_grad() if global_step == 0: optimizer.step() debug_step = 3 running_loss = RunningMeter('train_loss') for step, batch in enumerate(InfiniteIterator(train_loader)): # forward pass outputs, question_ids = forward_step(model, batch) loss = outputs["loss"].mean() loss = loss.float() * cfg.num_labels running_loss(loss.item()) # backward pass delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, 
delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() zero_none_grad(model) optimizer.synchronize() # optimizer if (step + 1) % cfg.gradient_accumulation_steps == 0: global_step += 1 TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step) n_epoch = int(1. * total_train_batch_size * global_step / total_n_examples) # learning rate scheduling transformer lr_this_step_transformer = get_lr_sched( global_step, cfg.decay, cfg.learning_rate, cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio, decay_epochs=cfg.step_decay_epochs, multi_step_epoch=n_epoch) # learning rate scheduling cnn lr_this_step_cnn = get_lr_sched( global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate, cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio, decay_epochs=cfg.cnn_step_decay_epochs, multi_step_epoch=n_epoch) # Hardcoded param group length assert len(optimizer.param_groups) == 8 for pg_n, param_group in enumerate(optimizer.param_groups): if pg_n in [0, 1]: param_group['lr'] = (cfg.transformer_lr_mul * lr_this_step_transformer) elif pg_n in [2, 3]: param_group['lr'] = lr_this_step_transformer elif pg_n in [4, 5]: param_group['lr'] = (cfg.cnn_lr_mul * lr_this_step_cnn) else: param_group['lr'] = lr_this_step_cnn TB_LOGGER.add_scalar("train/lr_transformer", lr_this_step_transformer, global_step) TB_LOGGER.add_scalar("train/lr_cnn", lr_this_step_cnn, global_step) # update model params if cfg.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), cfg.grad_norm) TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step) TB_LOGGER.step() # Check if there is None grad none_grads = [ p[0] for p in model.named_parameters() if p[1].requires_grad and p[1].grad is None ] assert len(none_grads) == 0, f"{none_grads}" with optimizer.skip_synchronize(): optimizer.step() optimizer.zero_grad() restorer.step() pbar.update(1) # checkpoint if global_step % cfg.valid_steps == 0: LOGGER.info(f'Step {global_step}: start validation') vqa_results = validate(model, val_loader, cfg, global_step) model_saver.save(step=global_step, model=model) if global_step >= cfg.num_train_steps: break if cfg.debug and global_step >= debug_step: break if global_step % cfg.valid_steps != 0: LOGGER.info(f'Step {global_step}: start validation') vqa_results = validate(model, val_loader, cfg, global_step) model_saver.save(step=global_step, model=model)
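# The loop above separates optimizer.synchronize() from optimizer.step() so that
# gradient clipping (and the amp unscaling) operates on fully allreduced gradients.
# Below is a minimal sketch of that documented Horovod pattern without the apex/amp
# specifics; model and data are toy placeholders.
import torch
import torch.nn as nn
import torch.nn.functional as F
import horovod.torch as hvd

hvd.init()
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

x, y = torch.randn(16, 8), torch.randint(0, 2, (16,))
loss = F.cross_entropy(model(x), y)

optimizer.zero_grad()
loss.backward()
optimizer.synchronize()                                       # allreduce the gradients now
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)    # clip the averaged gradients
with optimizer.skip_synchronize():                            # avoid a second allreduce in step()
    optimizer.step()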
def run(self, nepoch, batchsize=None, loss='energy', clip_loss=False, grad='manual', hdf5_group='wf_opt', num_threads=1, chkpt_every=None): """Run the optimization Args: nepoch (int): Number of optimization steps batchsize (int, optional): Number of samples in a mini batch. If None, all samples are used. Defaults to None. loss (str, optional): method to compute the loss: variance or energy. Defaults to 'energy'. clip_loss (bool, optional): Clip the loss values at +/- 5std. Defaults to False. grad (str, optional): method to compute the gradients: 'auto' or 'manual'. Defaults to 'manual'. hdf5_group (str, optional): name of the hdf5 group where to store the data. Defaults to 'wf_opt' num_threads (int, optional): number of threads used by torch and the data loader. Defaults to 1. chkpt_every (int, optional): save a checkpoint every chkpt_every epochs; None disables checkpointing. Defaults to None. """ logd(hvd.rank(), '') logd( hvd.rank(), ' Distributed Optimization on {num} processes'.format( num=hvd.size())) log.info(' - Process {id} using {nw} walkers'.format( id=hvd.rank(), nw=self.sampler.nwalkers)) # observable if not hasattr(self, 'observable'): self.track_observable(['local_energy']) self.evaluate_gradient = { 'auto': self.evaluate_grad_auto, 'manual': self.evaluate_grad_manual }[grad] if 'lpos_needed' not in self.opt.__dict__.keys(): self.opt.lpos_needed = False self.wf.train() hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0) torch.set_num_threads(num_threads) # get the loss self.loss = Loss(self.wf, method=loss, clip=clip_loss) self.loss.use_weight = (self.resampling_options.resample_every > 1) # orthogonalization penalty for the MO coeffs self.ortho_loss = OrthoReg() self.prepare_optimization(batchsize, chkpt_every) # log data if hvd.rank() == 0: self.log_data_opt(nepoch, 'wave function optimization') # sample the wave function if hvd.rank() == 0: pos = self.sampler(self.wf.pdf) else: pos = self.sampler(self.wf.pdf, with_tqdm=False) # required to build the distributed data container pos.requires_grad_(False) # handle the batch size if batchsize is None: batchsize = len(pos) # get the initial observable if hvd.rank() == 0: self.store_observable(pos) # change the number of steps/walker size _nstep_save = self.sampler.nstep _ntherm_save = self.sampler.ntherm _nwalker_save = self.sampler.walkers.nwalkers if self.resampling_options.mode == 'update': self.sampler.ntherm = -1 self.sampler.nstep = self.resampling_options.nstep_update self.sampler.walkers.nwalkers = pos.shape[0] self.sampler.nwalkers = pos.shape[0] # create the data loader self.dataset = DataSet(pos) if self.cuda: kwargs = {'num_workers': num_threads, 'pin_memory': True} else: kwargs = {'num_workers': num_threads} self.dataloader = DataLoader(self.dataset, batch_size=batchsize, **kwargs) min_loss = 1E3 for n in range(nepoch): tstart = time() logd(hvd.rank(), '') logd(hvd.rank(), ' epoch %d' % n) cumulative_loss = 0.
for ibatch, data in enumerate(self.dataloader): # get data lpos = data.to(self.device) lpos.requires_grad = True # get the gradient loss, eloc = self.evaluate_gradient(lpos) cumulative_loss += loss # optimize the parameters self.optimization_step(lpos) # observable if hvd.rank() == 0: self.store_observable(pos, local_energy=eloc, ibatch=ibatch) cumulative_loss = self.metric_average(cumulative_loss, 'cum_loss') if hvd.rank() == 0: if n == 0 or cumulative_loss < min_loss: self.observable.models.best = dict(self.wf.state_dict()) min_loss = cumulative_loss if self.chkpt_every is not None: if (n > 0) and (n % chkpt_every == 0): self.save_checkpoint(n, cumulative_loss) self.print_observable(cumulative_loss) # resample the data pos = self.resample(n, pos) pos.requires_grad = False # scheduler step if self.scheduler is not None: self.scheduler.step() logd(hvd.rank(), ' epoch done in %1.2f sec.' % (time() - tstart)) # restore the sampler number of step self.sampler.nstep = _nstep_save self.sampler.ntherm = _ntherm_save self.sampler.walkers.nwalkers = _nwalker_save self.sampler.nwalkers = _nwalker_save if hvd.rank() == 0: dump_to_hdf5(self.observable, self.hdf5file, hdf5_group) add_group_attr(self.hdf5file, hdf5_group, {'type': 'opt'}) return self.observable
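# self.metric_average above reduces the per-worker loss across ranks; the usual
# Horovod idiom for such reductions is hvd.allreduce, which averages by default.
# This is a standalone illustration, not the class's actual implementation.
import torch
import horovod.torch as hvd

hvd.init()

def metric_average(value, name):
    tensor = torch.tensor(value, dtype=torch.float32)
    return hvd.allreduce(tensor, name=name).item()   # mean over all workers

local_loss = 0.5 * (hvd.rank() + 1)                  # differs per rank
print(hvd.rank(), metric_average(local_loss, "cum_loss"))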
def setup(self, model): # call setup after the ddp process has connected self.trainer.call_setup_hook(model) if torch.cuda.is_available() and self.trainer.on_gpu: # Horovod: pin GPU to local rank assert self.trainer.root_gpu == hvd.local_rank() torch.cuda.set_device(self.trainer.root_gpu) model.cuda(self.trainer.root_gpu) # avoid duplicating progress bar if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # CHOOSE OPTIMIZER # allow for lr schedulers as well self.setup_optimizers(model) # Horovod: scale the learning rate by the number of workers to account for # increased total batch size for optimizer in self.trainer.optimizers: for param_group in optimizer.param_groups: param_group['lr'] *= hvd.size() # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR for scheduler in self.trainer.lr_schedulers: scheduler = scheduler['scheduler'] if isinstance(scheduler, _LRScheduler): scheduler.base_lrs = [ lr * hvd.size() for lr in scheduler.base_lrs ] # Horovod: broadcast parameters & optimizer state to ensure consistent initialization hvd.broadcast_parameters(model.state_dict(), root_rank=0) for optimizer in self.trainer.optimizers: hvd.broadcast_optimizer_state(optimizer, root_rank=0) def _filter_named_parameters(model, optimizer): opt_params = set([ p for group in optimizer.param_groups for p in group.get('params', []) ]) return [(name, p) for name, p in model.named_parameters() if p in opt_params] # Horovod: wrap optimizers to perform gradient aggregation via allreduce self.trainer.optimizers = [ hvd.DistributedOptimizer(optimizer, named_parameters=_filter_named_parameters( model, optimizer)) for optimizer in self.trainer.optimizers ] # 16-bit model = self.trainer.precision_connector.connect(model) # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.trainer.global_rank = hvd.rank() rank_zero_only.rank = self.trainer.global_rank self.trainer.model = model
def main(): parser = argparse.ArgumentParser() parser.add_argument("-config") parser.add_argument("-data", help="data yaml file") parser.add_argument("-dataPath", default='', type=str, help="path of data files") parser.add_argument("-seed_model", help="the seed neural network model") parser.add_argument("-exp_dir", help="the directory to save the outputs") parser.add_argument("-transform", help="feature transformation matrix or mvn statistics") parser.add_argument("-criterion", type=str, choices=["mmi", "mpfe", "smbr"], help="set the sequence training criterion") parser.add_argument( "-trans_model", help="the HMM transition model, used for lattice generation") parser.add_argument( "-prior_path", help="the prior for decoder, usually named as final.occs in kaldi setup" ) parser.add_argument( "-den_dir", help="the decoding graph directory to find HCLG and words.txt files") parser.add_argument("-lr", type=float, help="set the learning rate") parser.add_argument("-ce_ratio", default=0.1, type=float, help="the ratio for ce regularization") parser.add_argument("-momentum", default=0, type=float, help="set the momentum") parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config") parser.add_argument("-dropout", default=0, type=float, help="set the dropout ratio") parser.add_argument("-nheads", default=4, type=int, help="the number of attention heads") parser.add_argument("-dim_model", default=512, type=int, help="the model dimension") parser.add_argument("-ff_size", default=2048, type=int, help="the size of feed-forward layer") parser.add_argument("-nlayers", default=6, type=int, help="the number of layers") parser.add_argument("-look_ahead", default=-1, type=int, help="the number of frames to look ahead") parser.add_argument("-data_loader_threads", default=0, type=int, help="number of workers for data loading") parser.add_argument("-max_grad_norm", default=5, type=float, help="max_grad_norm for gradient clipping") parser.add_argument("-sweep_size", default=100, type=float, help="process n hours of data per sweep (default: 100)") parser.add_argument("-num_epochs", default=1, type=int, help="number of training epochs (default:1)") parser.add_argument('-print_freq', default=10, type=int, metavar='N', help='print frequency (default: 10)') parser.add_argument('-save_freq', default=1000, type=int, metavar='N', help='save model frequency (default: 1000)') args = parser.parse_args() #args.exp_dir = args.modelPath with open(args.config) as f: config = yaml.safe_load(f) config['data_path'] = args.dataPath config["sweep_size"] = args.sweep_size print("pytorch version:{}".format(th.__version__)) with open(args.data) as f: data = yaml.safe_load(f) config["source_paths"] = [j for i, j in data['clean_source'].items()] print("Experiment starts with config {}".format( json.dumps(config, sort_keys=True, indent=4))) # Initialize Horovod hvd.init() th.cuda.set_device(hvd.local_rank()) print("Run experiments with world size {}".format(hvd.size())) dataset = SpeechDataset(config) transform = None if args.transform is not None and os.path.isfile(args.transform): with open(args.transform, 'rb') as f: transform = pickle.load(f) dataset.transform = transform train_dataloader = SeqDataloader(dataset, batch_size=args.batch_size, num_workers=args.data_loader_threads, distributed=True, test_only=False) print("Data loader set up successfully!") print("Number of minibatches: {}".format(len(train_dataloader))) if not os.path.isdir(args.exp_dir): os.makedirs(args.exp_dir) # create model
model_config = config["model_config"] model = transformer.TransformerAM(model_config["feat_dim"], args.dim_model, args.nheads, args.ff_size, args.nlayers, args.dropout, model_config["label_size"]) model.cuda() # setup the optimizer optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) # Broadcast parameters and opterimizer state from rank 0 to all other processes. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Add Horovod Distributed Optimizer optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) if os.path.isfile(args.seed_model): checkpoint = th.load(args.seed_model) state_dict = checkpoint['model'] model.load_state_dict(state_dict) print("=> loaded checkpoint '{}' ".format(args.seed_model)) else: sys.stderr.write('ERROR: The model file %s does not exist!\n' % (args.seed_model)) sys.exit(0) HCLG = args.den_dir + "/HCLG.fst" words_txt = args.den_dir + "/words.txt" silence_phones = args.den_dir + "/phones/silence.csl" if not os.path.isfile(HCLG): sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG)) sys.exit(0) if not os.path.isfile(words_txt): sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' % (words_txt)) sys.exit(0) if not os.path.isfile(silence_phones): sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' % (silence_phones)) sys.exit(0) with open(silence_phones) as f: silence_ids = [int(i) for i in f.readline().strip().split(':')] f.close() if os.path.isfile(args.trans_model): trans_model = kaldi_hmm.TransitionModel() with kaldi_util.io.xopen(args.trans_model) as ki: trans_model.read(ki.stream(), ki.binary) else: sys.stderr.write('ERROR: The trans_model %s does not exist!\n' % (args.trans_model)) sys.exit(0) # now we can setup the decoder decoder_opts = LatticeFasterDecoderOptions() decoder_opts.beam = config["decoder_config"]["beam"] decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"] decoder_opts.max_active = config["decoder_config"]["max_active"] acoustic_scale = config["decoder_config"]["acoustic_scale"] decoder_opts.determinize_lattice = False #To produce raw state-level lattice instead of compact lattice asr_decoder = MappedLatticeFasterRecognizer.from_files( args.trans_model, HCLG, words_txt, acoustic_scale=acoustic_scale, decoder_opts=decoder_opts) prior = kaldi_util.io.read_matrix(args.prior_path).numpy() log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float) model.train() model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(params) for epoch in range(args.num_epochs): run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader, epoch, asr_decoder, trans_model, silence_ids, args) # save model if hvd.rank() == 0: checkpoint = {} checkpoint['model'] = model.state_dict() checkpoint['optimizer'] = optimizer.state_dict() checkpoint['epoch'] = epoch output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar' th.save(checkpoint, output_file)
def fit(self, epochs, optimizer='Adam', learning_rate=1e-3, lbfgs_finetuning=True,
        writing_cylcle=30, save_model=True, pinn_path='best_model_pinn.pt',
        hpm_path='best_model_hpm.pt'):
    """
    Optimizes the parameters of the PINN model.

    Args:
        epochs (int): number of epochs used for training
        optimizer (String, torch.optim.Optimizer): optimizer used for training.
            At the moment only 'Adam' and 'LBFGS' are supported as string
            arguments; instances of torch optimizers can also be passed directly.
        learning_rate: learning rate of the optimizer
        lbfgs_finetuning: enables LBFGS finetuning after the main training
        writing_cylcle: number of epochs between checkpoint writes
        save_model: enables or disables checkpointing
        pinn_path: path where the PINN model gets stored
        hpm_path: path where the HPM model gets stored
    """
    if isinstance(self.pde_loss, HPMLoss):
        params = list(self.model.parameters()) + list(self.pde_loss.hpm_model.parameters())
        named_parameters = chain(self.model.named_parameters(),
                                 self.pde_loss.hpm_model.named_parameters())
        if self.use_horovod and lbfgs_finetuning:
            raise ValueError("LBFGS finetuning is not possible with Horovod")
        if optimizer == 'Adam':
            optim = torch.optim.Adam(params, lr=learning_rate)
        elif optimizer == 'LBFGS':
            if self.use_horovod:
                raise TypeError("LBFGS is not supported with Horovod")
            else:
                optim = torch.optim.LBFGS(params, lr=learning_rate)
        else:
            optim = optimizer

        if lbfgs_finetuning and not self.use_horovod:
            lbfgs_optim = torch.optim.LBFGS(params, lr=0.9)

            def closure():
                lbfgs_optim.zero_grad()
                pinn_loss = self.pinn_loss(training_data)
                pinn_loss.backward()
                return pinn_loss
    else:
        named_parameters = self.model.named_parameters()
        if optimizer == 'Adam':
            optim = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        elif optimizer == 'LBFGS':
            optim = torch.optim.LBFGS(self.model.parameters(), lr=learning_rate)
        else:
            optim = optimizer

        if lbfgs_finetuning and not self.use_horovod:
            lbfgs_optim = torch.optim.LBFGS(self.model.parameters(), lr=0.9)

            def closure():
                lbfgs_optim.zero_grad()
                pinn_loss = self.pinn_loss(training_data)
                pinn_loss.backward()
                return pinn_loss

    minimum_pinn_loss = float("inf")
    if self.use_horovod:
        # Partition the dataset among workers using DistributedSampler
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            self.dataset, num_replicas=hvd.size(), rank=hvd.rank())
        data_loader = DataLoader(self.dataset, batch_size=1, sampler=train_sampler)
        optim = hvd.DistributedOptimizer(optim, named_parameters=named_parameters)
        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
        if isinstance(self.pde_loss, HPMLoss):
            hvd.broadcast_parameters(self.pinn_loss.hpm_model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optim, root_rank=0)
    else:
        data_loader = DataLoader(self.dataset, batch_size=1)

    for epoch in range(epochs):
        for training_data in data_loader:
            optim.zero_grad()
            pinn_loss = self.pinn_loss(training_data)
            pinn_loss.backward()
            if not self.rank:
                print("PINN Loss {} Epoch {} from {}".format(pinn_loss, epoch, epochs))
            optim.step()

        if (pinn_loss < minimum_pinn_loss) and not (epoch % writing_cylcle) \
                and save_model and not self.rank:
            self.save_model(pinn_path, hpm_path)
            minimum_pinn_loss = pinn_loss

    if lbfgs_finetuning and not self.use_horovod:
        lbfgs_optim.step(closure)
        print("After LBFGS-B: PINN Loss {} Epoch {} from {}".format(pinn_loss, epoch, epochs))
        if (pinn_loss < minimum_pinn_loss) and not (epoch % writing_cylcle) and save_model:
            self.save_model(pinn_path, hpm_path)
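# A minimal sketch of calling the fit() method documented above. The instance
# name `pinn` and the hyperparameter values are illustrative assumptions, not
# part of the original code; when use_horovod is enabled, lbfgs_finetuning must
# stay disabled (the method raises otherwise).
pinn.fit(epochs=1000,
         optimizer='Adam',
         learning_rate=1e-3,
         lbfgs_finetuning=False,
         writing_cylcle=30,
         save_model=True,
         pinn_path='best_model_pinn.pt',
         hpm_path='best_model_hpm.pt')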
def train(self):
    # Instantiate each configured dataset class (e.g. DAVISDataset) with its
    # parameter dict, then concatenate them into a single dataset.
    dset = ConcatDataset([eval(cls)(**params) for cls, params in self.dataset])

    # Partition the dataset among workers using DistributedSampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dset, num_replicas=hvd.size(), rank=hvd.rank())
    loader = DataLoader(dset,
                        batch_size=self.batch_size,
                        sampler=train_sampler,
                        num_workers=self.num_workers,
                        pin_memory=True,
                        shuffle=False)

    # Add Horovod Distributed Optimizer
    # e.g. a sample of 3 frames performs 2 backward() calls per step
    backward_passes_per_step = dset.datasets[0].sample_size - 1
    self.optimizer = hvd.DistributedOptimizer(
        self.optimizer,
        named_parameters=self.model.named_parameters(),
        backward_passes_per_step=backward_passes_per_step)

    # Broadcast parameters from rank 0 to all other processes.
    hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    for epoch in range(self.epoch + 1, self.max_epochs + 1):
        self.epoch = epoch
        self.stats = ddict(AverageMeter)
        t0 = None
        runtime = AverageMeter()
        for i, batch in enumerate(loader, 1):
            t0 = time() if t0 is None else t0  # ignore loader startup pause
            self.optimizer.zero_grad()
            stats = self.model(*batch)
            self.optimizer.step()
            runtime.update(time() - t0)
            t0 = time()
            stats['stats/lr'] = self.scheduler.get_last_lr()[0]
            self.update_stats(stats, i, len(loader), runtime, do_print=True)
            if hvd.rank() == 0:
                self.log_stats()  # tensorboard
        self.scheduler.step()
        lr_dict = hvd.broadcast_object(self.scheduler.state_dict(), 0)
        if hvd.rank() > 0:
            self.scheduler.load_state_dict(lr_dict)
        if self.epoch % self.save_interval == 0 and hvd.rank() == 0:
            self.save_checkpoint()
    print("%s done" % self.name)
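# The scheduler synchronization above relies on hvd.broadcast_object, which can
# broadcast any picklable Python object from one rank to the rest. A minimal
# standalone sketch of the same pattern (the model/optimizer/scheduler here are
# illustrative, not taken from the class above):
import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

scheduler.step()
# Rank 0's scheduler state is serialized, broadcast, and reloaded on all other ranks.
state = hvd.broadcast_object(scheduler.state_dict(), 0)
if hvd.rank() > 0:
    scheduler.load_state_dict(state)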
def broadcast_value(self, val, name):
    hvd.broadcast_parameters({name: val}, root_rank=0)
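# hvd.broadcast_parameters performs an in-place broadcast of every tensor in the
# given dict, which is why the helper above works for a single named value. A
# minimal sketch of the same call without the helper (the tensor name and values
# are illustrative assumptions):
import torch
import horovod.torch as hvd

hvd.init()
best_loss = torch.tensor([0.123]) if hvd.rank() == 0 else torch.tensor([float('inf')])
hvd.broadcast_parameters({'best_loss': best_loss}, root_rank=0)
# Every rank now holds rank 0's value in best_loss (the broadcast is in-place).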
model.cuda()

# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(),
                      lr=args.lr * hvd.size(),
                      momentum=args.momentum)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)


def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, hvd.size()))


def train(epoch):
    model.train()
    # Horovod: set epoch to sampler for shuffling.
    train_sampler.set_epoch(epoch)
def horovod_train(self, model):
    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)
        self._device = torch.device('cuda', self.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer,
                                 named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.proc_rank = hvd.rank()
    rank_zero_only.rank = self.proc_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)

    # Make sure all workers have finished training before returning to the user
    hvd.join()
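# Why filter_named_parameters above matters: when a model has several optimizers
# (e.g. a GAN with separate generator and discriminator optimizers), each
# hvd.DistributedOptimizer should only allreduce the parameter subset that its
# optimizer actually owns. A minimal sketch under that assumption; the module
# and variable names (gen, disc, opt_g, opt_d) are illustrative, not taken from
# the code above:
import torch
import horovod.torch as hvd

hvd.init()
gen = torch.nn.Linear(10, 10)
disc = torch.nn.Linear(10, 1)
model = torch.nn.ModuleDict({'gen': gen, 'disc': disc})

opt_g = torch.optim.Adam(gen.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(disc.parameters(), lr=1e-3)


def filter_named_parameters(model, optimizer):
    opt_params = set(p for group in optimizer.param_groups for p in group.get('params', []))
    return [(name, p) for name, p in model.named_parameters() if p in opt_params]


# Each wrapped optimizer only registers allreduce hooks for its own parameters.
opt_g = hvd.DistributedOptimizer(opt_g, named_parameters=filter_named_parameters(model, opt_g))
opt_d = hvd.DistributedOptimizer(opt_d, named_parameters=filter_named_parameters(model, opt_d))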
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


model = Net()

if args.cuda:
    # Move model to GPU.
    model.cuda()

# Horovod: broadcast parameters.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                      momentum=args.momentum)

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())


def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
def main_worker(args_):
    args_.cuda = not args_.no_cuda and torch.cuda.is_available()
    allreduce_batch_size = args_.batch_size * args_.batches_per_allreduce

    hvd.init()
    # torch.distributed.init_process_group('ddl', init_method='env://')
    if args_.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        # torch.cuda.manual_seed(args_.seed)
    # cudnn.benchmark = True

    # # If set > 0, will resume training from a given checkpoint.
    # resume_from_epoch = 0
    # for try_epoch in range(args_.epochs, 0, -1):
    #     if os.path.exists(args_.checkpoint_format.format(epoch=try_epoch)):
    #         resume_from_epoch = try_epoch
    #         break
    #
    # # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # # checkpoints) to other ranks.
    # resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
    #                                   name='resume_from_epoch').item()

    # # Horovod: print logs on the first worker.
    # verbose = 1 if hvd.rank() == 0 else 0
    #
    # # Horovod: write TensorBoard logs on first worker.
    # try:
    #     if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
    #         from torch.utils.tensorboard import SummaryWriter
    #     else:
    #         from tensorboardX import SummaryWriter
    #     os.makedirs(os.path.join(args_.model_output_dir, 'logs'), exist_ok=True)
    #     log_writer = SummaryWriter(os.path.join(args_.model_output_dir, 'logs')) if hvd.rank() == 0 else None
    # except ImportError:
    #     log_writer = None

    ### MODEL CREATION ###
    # create model
    model1 = VQ_VAE(num_inputs=1, weight_matching=0., channel_var=np.ones((1, )))
    model2 = VQ_VAE(num_inputs=1, weight_matching=0.0005, channel_var=np.ones((1, )))
    model1.cuda()
    model2.cuda()
    model1 = torch.nn.parallel.DistributedDataParallel(model1)
    model2 = torch.nn.parallel.DistributedDataParallel(model2)

    # By default, Adasum doesn't need scaling up the learning rate.
    # For sum/average with gradient accumulation: scale learning rate by batches_per_allreduce
    if args_.cuda and args_.use_adasum and hvd.nccl_built():
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        lr_scaler = args_.batches_per_allreduce * hvd.local_size()
    elif not args_.use_adasum:
        lr_scaler = args_.batches_per_allreduce * hvd.size()
    else:
        lr_scaler = 1

    # Horovod: scale learning rate by the number of GPUs.
    optimizer1 = t.optim.Adam(model1.parameters(), lr=(args_.base_lr * lr_scaler), betas=(.9, .999))
    optimizer2 = t.optim.Adam(model2.parameters(), lr=(args_.base_lr * lr_scaler), betas=(.9, .999))

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args_.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer1 = hvd.DistributedOptimizer(
        optimizer1,
        named_parameters=model1.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)
    optimizer2 = hvd.DistributedOptimizer(
        optimizer2,
        named_parameters=model2.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    # # Restore from a previous checkpoint, if initial_epoch is specified.
    # # Horovod: restore on the first worker which will broadcast weights to other workers.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    #     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    #     checkpoint = torch.load(filepath)
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    ### Settings ###
    model_output_dir = args_.model_output_dir
    project_dir = args_.project_dir

    ### Prepare Data ###
    log.info("LOADING FILES")

    # ======= load data using pytorch systems ========
    torch.set_num_threads(4)
    dataset = DatasetFolderWithPaths(root=project_dir + "/JUNE" + "/raw_patches",
                                     loader=npy_loader,
                                     extensions='.npy')
    dataset_mask = DatasetFolderWithPaths(root=project_dir + "/JUNE" + "/raw_masks",
                                          loader=npy_loader,
                                          extensions='.npy')
    relation_mat = np.load(os.path.join(project_dir, "JUNE", "raw_patches", "relation_mat.npy"),
                           allow_pickle=True)

    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_sampler_mask = torch.utils.data.distributed.DistributedSampler(
        dataset_mask, num_replicas=hvd.size(), rank=hvd.rank())
    # =========================================================

    os.makedirs(os.path.join(model_output_dir, "stage1"), exist_ok=True)
    os.makedirs(os.path.join(model_output_dir, "stage2"), exist_ok=True)

    # ====================================
    log.info("TRAINING: STARTING STAGE 1")
    kwargs = {'num_workers': 4, 'pin_memory': True} if args_.cuda else {}
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler,
                                               **kwargs)
    train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                    batch_size=allreduce_batch_size,
                                                    sampler=train_sampler_mask,
                                                    **kwargs)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model1.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer1, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage1")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")
    for epoch in range(args_.stage1_epochs):
        model1.train()
        train_sampler.set_epoch(epoch)
        mean_loss = train(model1,
                          train_loader,
                          optimizer1,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader,
                          args_=args_)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
            writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f} '.format(key, loss) for key, loss in mean_loss.items()]))
        # only the master process should save checkpoints.
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model1.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()

    # ====================================
    log.info("TRAINING: STARTING STAGE 2")
    # get the last saved epoch. on IBM, use max(). on OSX use min()
    # s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1", "/*"))
    s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1") + '/*.pt')
    last_epoch = max(s1_epochs, key=os.path.getctime)
    log.info(f"\tloading last epoch = {last_epoch}")

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler)
    train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                    batch_size=allreduce_batch_size,
                                                    sampler=train_sampler_mask)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model2.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer2, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage2")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")
    model2.load_state_dict(t.load(last_epoch))
    for epoch in range(args_.stage2_epochs):
        model2.train()
        train_sampler.set_epoch(epoch)
        mean_loss = train(model2,
                          train_loader,
                          optimizer2,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader)
        # shuffle sample ids at the end of the epoch
        # if shuffle_data:
        #     np.random.shuffle(sample_ids)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
            writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f} '.format(key, loss) for key, loss in mean_loss.items()]))
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model2.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()