def train(serialized_model):
    import horovod.torch as hvd

    if random_seed is not None:
        pl.utilities.seed.seed_everything(seed=random_seed)

    # Horovod: initialize library.
    hvd.init()

    if verbose:
        import horovod as _horovod
        print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")

    _checkpoint_callback = None
    require_checkpoint = False

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)
        os.makedirs(logs_path, exist_ok=True)
        print(f"Made directory {logs_path} for horovod rank {hvd.rank()}")
        ckpt_dir = run_output_dir
        ckpt_filename = remote_store.checkpoint_filename

        if logger is None:
            # Use the default logger if no logger is supplied.
            train_logger = TensorBoardLogger(logs_path)
            print(f"Setup logger: Using TensorBoardLogger: {train_logger}")
        elif isinstance(logger, CometLogger):
            if logger._experiment_key:
                # Use the logger passed in.
                train_logger = logger
                train_logger._save_dir = logs_path
                print(f"Setup logger: change save_dir of the logger to {logs_path}")
            elif logger_experiment_key:
                # Resume the logger experiment with the new log path if the key was
                # passed correctly from CPU.
                train_logger = CometLogger(
                    save_dir=logs_path,
                    api_key=logger.api_key,
                    experiment_key=logger_experiment_key,
                )
                print(f"Setup logger: Resume comet logger: {vars(train_logger)}")
            else:
                print(f"Failed to setup or resume comet logger. origin logger: {vars(logger)}")
        else:
            # Use the logger passed in.
            train_logger = logger
            train_logger.save_dir = logs_path
            print(f"Setup logger: Using logger passed from estimator: {train_logger}")

        # Lightning requires adding checkpoint callbacks for all ranks;
        # otherwise training can hang.
        for cb in callbacks:
            if isinstance(cb, ModelCheckpoint):
                cb.dirpath = ckpt_dir
                cb.filename = ckpt_filename
                _checkpoint_callback = cb
                require_checkpoint = True
                break

        if not _checkpoint_callback:
            # By default 'monitor'=None, which saves a checkpoint only for the last epoch.
            _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir,
                                                   filename=ckpt_filename,
                                                   verbose=True)
            callbacks.append(_checkpoint_callback)

        if remote_store.saving_runs and hvd.rank() == 0:
            # Horovod: sync checkpoint and logging files only on rank 0 to
            # prevent other ranks from corrupting them.
            class _SyncCallback(Callback):
                def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
                    remote_store.sync(run_output_dir)

            callbacks.append(_SyncCallback())

        model = deserialize(serialized_model)

        _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
            int(math.floor(float(train_rows) / batch_size / hvd.size()))

        _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
            int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

        shuffle_size = calculate_shuffle_buffer_size()
        if verbose:
            print(
                f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}, "
                f"Shuffle_size: {shuffle_size}, Random seed: {random_seed}\n"
                f"Train rows: {train_rows}, Train batch size: {batch_size}, "
                f"Train_steps_per_epoch: {_train_steps_per_epoch}\n"
                f"Val rows: {val_rows}, Val batch size: {val_batch_size}, "
                f"Val_steps_per_epoch: {_val_steps_per_epoch}\n"
                f"Checkpoint file: {remote_store.checkpoint_path}, "
                f"Logs dir: {remote_store.logs_path}\n")

        cuda_available = torch.cuda.is_available()
        # We need to check that all ranks have the same device type for training.
        # Horovod doesn't support heterogeneous allreduce for gradients.
        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
        if cuda_avail_list.count(cuda_available) != hvd.size():
            raise RuntimeError("All ranks don't have same device type!")

        if cuda_available:
            # Horovod: pin GPU to local rank or the GPU assigned by Spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move the model to GPU.
            model.cuda()

        _num_gpus = num_gpus
        if _num_gpus is None:
            _num_gpus = 1 if cuda_available else 0

        # Set the progress bar refresh rate to once per epoch; detailed loss and
        # metrics are available in the logger, so there is no need to print them
        # to the screen here. The user can still override this in trainer_args.
        progress_bar_refresh_rate = _train_steps_per_epoch

        kwargs = {
            'accelerator': 'horovod',
            'gpus': _num_gpus,
            'callbacks': callbacks,
            'max_epochs': epochs,
            'logger': train_logger,
            'log_every_n_steps': log_every_n_steps,
            'num_sanity_val_steps': 0,
            'reload_dataloaders_every_epoch': False,
            'progress_bar_refresh_rate': progress_bar_refresh_rate,
            'terminate_on_nan': terminate_on_nan,
            'profiler': profiler
        }
        if trainer_args:
            kwargs.update(trainer_args)

        if verbose and hvd.rank() == 0:
            print("Creating trainer with: \n ", kwargs)

        trainer = Trainer(**kwargs)

        if profiler != 'simple' and trainer.profiler:
            print(f"Set profiler's logs_path for {hvd.rank()} to {logs_path}")
            trainer.profiler.dirpath = logs_path
            # Filename where the profiler results will be saved instead of
            # printing to stdout. The .txt extension will be used automatically.
            trainer.profiler.filename = "profile"

        if verbose and hvd.rank() == 0:
            print(f"pytorch_lightning version={pl.__version__}")

        data_module_kwargs = {
            'train_dir': remote_store.train_data_path,
            'val_dir': remote_store.val_data_path,
            'num_train_epochs': epochs,
            'has_val': should_validate is not None,
            'train_batch_size': batch_size,
            'val_batch_size': val_batch_size,
            'shuffle_size': shuffle_size,
            'num_reader_epochs': loader_num_epochs,
            'reader_pool_type': reader_pool_type,
            'reader_worker_count': train_reader_worker_count,
            'transform_spec': transformation,
            'inmemory_cache_all': inmemory_cache_all,
            'cur_shard': hvd.rank(),
            'shard_count': hvd.size(),
            'schema_fields': schema_fields,
            'storage_options': storage_options,
            'steps_per_epoch_train': _train_steps_per_epoch,
            'steps_per_epoch_val': _val_steps_per_epoch,
            'verbose': verbose,
            'debug_data_loader': debug_data_loader,
            'train_async_data_loader_queue_size': train_async_data_loader_queue_size,
            'val_async_data_loader_queue_size': val_async_data_loader_queue_size,
        }
        if debug_data_loader and hvd.rank() == 0:
            print(f"Creating data module with args:\n {data_module_kwargs}")

        dataset = data_module(**data_module_kwargs)

        trainer.fit(model, dataset)

        if hvd.rank() == 0:
            if remote_store.saving_runs and trainer.profiler:
                # One more file sync to push the profiler result.
                remote_store.sync(logs_path)

            # Rank 0 overwrites the model with the best checkpoint and returns it.
            if require_checkpoint:
                if verbose:
                    print("load from checkpoint best model path:",
                          _checkpoint_callback.best_model_path)
                best_model = model.load_from_checkpoint(
                    _checkpoint_callback.best_model_path)
            else:
                best_model = model

            serialized_checkpoint = io.BytesIO()
            module = best_model if not is_legacy else best_model._model

            output = {
                'model': module.state_dict(),
                'logged_metrics': trainer.logged_metrics
            }

            torch.save(output, serialized_checkpoint)

            return serialized_checkpoint
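

# Illustrative sketch, not part of the original module: one way a driver-side
# caller might consume the io.BytesIO buffer returned by the train() function
# above. The helper name `load_trained_state` is hypothetical; only torch.load
# and the {'model', 'logged_metrics'} keys written by train() come from the
# code above. This train() variant does not rewind the buffer before returning,
# so the caller seeks to the start defensively before loading.
def load_trained_state(serialized_checkpoint):
    import torch
    serialized_checkpoint.seek(0)
    checkpoint = torch.load(serialized_checkpoint, map_location='cpu')
    return checkpoint['model'], checkpoint.get('logged_metrics', {})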
def train(serialized_model, optimizer_cls, model_opt_state_serialized,
          train_rows, val_rows, avg_row_size):
    from petastorm import TransformSpec, make_reader, make_batch_reader
    from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader
    import torch
    import horovod.torch as hvd

    # Deserialize objects.
    model_opt_state = torch.load(model_opt_state_serialized)
    model = deserialize(serialized_model)

    if loss_fns_pre_train:
        loss_fns = loss_fns_pre_train
    if loss_constructors:
        local_vars = locals()
        loss_fns = [loss_constructor(**local_vars)
                    for loss_constructor in loss_constructors]

    # Horovod: initialize library.
    hvd.init()

    if not user_shuffle_buffer_size:
        shuffle_buffer_size = \
            calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
    else:
        shuffle_buffer_size = user_shuffle_buffer_size

    cuda_available = torch.cuda.is_available()
    # We need to check that all ranks have the same device type for training.
    # Horovod doesn't support heterogeneous allreduce for gradients.
    cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
    if cuda_avail_list.count(cuda_available) != hvd.size():
        raise RuntimeError("All ranks don't have same device type!")

    if cuda_available:
        # Horovod: pin GPU to local rank or the GPU assigned by Spark.
        torch.cuda.set_device(
            _get_assigned_gpu_or_default(default=hvd.local_rank()))
        # Move the model to GPU.
        model.cuda()

    # The optimizer object needs to be re-instantiated. Internally, it uses memory
    # addresses of objects as their identity, so it cannot simply be serialized and
    # deserialized: the deserialized optimizer stores parameter names keyed by their
    # old memory addresses, which no longer match the reconstructed objects, and that
    # creates problems.
    # Learning rate is a required parameter for the SGD optimizer; it will be
    # overridden by load_state_dict.
    optimizer = optimizer_cls(model.parameters(), lr=1)
    optimizer_state = model_opt_state['optimizer']

    if last_checkpoint_state is not None:
        model.load_state_dict(last_checkpoint_state['model'])
        optimizer.load_state_dict(last_checkpoint_state['optimizer'])
    else:
        # Scale the learning rate with the number of horovod workers.
        for i in range(len(optimizer_state['param_groups'])):
            optimizer_state['param_groups'][i]['lr'] = \
                optimizer_state['param_groups'][i]['lr'] * hvd.size()

        optimizer.load_state_dict(optimizer_state)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for group in optimizer.param_groups:
        for p in group['params']:
            if id(p) not in optimizer.state_dict()['state']:
                p.grad = p.data.new(p.size()).zero_()
    optimizer.step()
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    dist_optimizer_args = dict(optimizer=optimizer,
                               named_parameters=model.named_parameters())
    if gradient_compression:
        # Pass the compression arg only if it is specified by the user.
        dist_optimizer_args['compression'] = gradient_compression
    # Horovod: wrap the optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

    # get_optimizer_with_unscaled_lr (used in save_checkpoint below) takes the current
    # optimizer and constructs a new optimizer with the same state, except with the
    # learning rate scaled down by the number of horovod workers. This is important
    # for retraining the model: the user may retrain with a different number of
    # workers, and we need the raw learning rate to adjust to the new worker count.

    transform_spec = None
    if transformation:
        transform_spec = TransformSpec(transformation)

    schema_fields = feature_columns + label_columns
    if sample_weight_col:
        schema_fields.append(sample_weight_col)

    if train_steps_per_epoch is None:
        steps_per_epoch = int(
            math.floor(float(train_rows) / batch_size / hvd.size()))
    else:
        steps_per_epoch = train_steps_per_epoch

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
        log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
        ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)

        def save_checkpoint():
            model.cpu()
            optimizer_with_scaled_down_lr = \
                get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer_with_scaled_down_lr.state_dict(),
            }
            torch.save(state, ckpt_file)
            if cuda_available:
                model.cuda()

        if hvd.rank() == 0 and user_verbose:
            print(f"Training parameters: Epochs: {epochs}\n"
                  f"Train rows: {train_rows}, Train batch size: {batch_size}, "
                  f"Train_steps_per_epoch: {steps_per_epoch}\n"
                  f"Checkpoint file: {ckpt_file}, Logs dir: {logs_dir}\n")

        # In general, make_batch_reader is faster than make_reader for reading the
        # dataset. However, we found that make_reader performs data transformations
        # much faster than make_batch_reader with parallel worker processes.
        # Therefore, the default reader we choose is make_batch_reader unless there
        # are data transformations.
        reader_factory = None
        reader_factory_kwargs = dict()
        if transform_spec:
            reader_factory = make_reader
            reader_factory_kwargs['pyarrow_serialize'] = True
        else:
            reader_factory = make_batch_reader

        # Petastorm: read data from the store with the correct shard for this rank.
        # Setting num_epochs=None creates an infinite iterator and enables ranks to
        # perform training and validation with unequal numbers of samples.
        with reader_factory(remote_store.train_data_path,
                            num_epochs=None,
                            cur_shard=hvd.rank(),
                            reader_pool_type=reader_pool_type,
                            workers_count=train_reader_worker_count,
                            shard_count=hvd.size(),
                            hdfs_driver=PETASTORM_HDFS_DRIVER,
                            schema_fields=schema_fields,
                            transform_spec=transform_spec,
                            storage_options=storage_options,
                            **reader_factory_kwargs) as train_reader:
            with reader_factory(remote_store.val_data_path,
                                num_epochs=None,
                                cur_shard=hvd.rank(),
                                reader_pool_type=reader_pool_type,
                                workers_count=val_reader_worker_count,
                                shard_count=hvd.size(),
                                hdfs_driver=PETASTORM_HDFS_DRIVER,
                                schema_fields=schema_fields,
                                transform_spec=transform_spec,
                                storage_options=storage_options,
                                **reader_factory_kwargs) \
                    if should_validate else empty_batch_reader() as val_reader:

                if inmemory_cache_all:
                    # Petastorm introduced the InMemBatchedDataLoader class in v0.11.0.
                    train_loader = InMemBatchedDataLoader(
                        train_reader,
                        batch_size=batch_size,
                        num_epochs=epochs,
                        rows_capacity=steps_per_epoch * batch_size,
                        shuffle=True)
                else:
                    train_loader = BatchedDataLoader(
                        train_reader,
                        batch_size=batch_size,
                        shuffling_queue_capacity=shuffle_buffer_size)
                train_loader_iter = iter(train_loader)

                def prepare_batch(row):
                    inputs = [
                        prepare_np_data(row[col].float(), col, metadata).reshape(shape)
                        for col, shape in zip(feature_columns, input_shapes)
                    ]
                    labels = [
                        prepare_np_data(row[col].float(), col, metadata)
                        for col in label_columns
                    ]

                    sample_weights = row.get(sample_weight_col, None)
                    if sample_weights is not None:
                        sample_weights = sample_weights.float()
                    if cuda_available:
                        inputs = [input.cuda() for input in inputs]
                        labels = [label.cuda() for label in labels]
                        if sample_weights is not None:
                            sample_weights = sample_weights.cuda()
                    return inputs, labels, sample_weights

                def transform_outputs(outputs, labels):
                    if not isinstance(outputs, tuple) and not isinstance(outputs, list):
                        outputs = [outputs]

                    # Reshape labels to match the output shape of the model.
                    if hasattr(outputs[0], 'shape'):
                        if label_shapes:
                            labels = [
                                label.reshape(label_shape)
                                for label, label_shape in zip(labels, label_shapes)
                            ]
                        else:
                            # If the label_shapes parameter is not provided, reshape
                            # the label columns to match the shape of the model output.
                            labels = [
                                label.reshape(output.shape)
                                if output.shape.numel() == label.shape.numel() else label
                                for label, output in zip(labels, outputs)
                            ]
                    return outputs, labels

                def aggregate_metrics(stage, epoch, loss, metric_value_groups):
                    all_metric_groups_values = get_metric_avgs(metric_value_groups)
                    if remote_store.saving_runs:
                        write_metrics_summary(stage, epoch, loss,
                                              all_metric_groups_values, log_writer)
                    return {
                        loss.name: loss.avg.item(),
                        'all_metrics': all_metric_groups_values
                    }

                def loss_fn(outputs, labels, sample_weights):
                    loss = calculate_loss(outputs, labels, loss_weights, loss_fns,
                                          sample_weights)
                    return loss

                def print_metrics(batch_idx, loss, metric_value_groups, phase):
                    if user_verbose > 0 and hvd.rank() == 0 and \
                            batch_idx % METRIC_PRINT_FREQUENCY == 0:
                        print("{phase}\tepoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}"
                              .format(phase=phase,
                                      epoch=epoch,
                                      batch_idx=batch_idx,
                                      metrics=aggregate_metrics(
                                          phase, epoch, loss, metric_value_groups)))

                def _train(epoch):
                    model.train()
                    train_loss = metric_cls('loss', hvd)
                    metric_value_groups = construct_metric_value_holders(
                        metric_cls, metric_fn_groups, label_columns, hvd)

                    # Iterate over one epoch.
                    for batch_idx in range(steps_per_epoch):
                        row = next(train_loader_iter)
                        inputs, labels, sample_weights = prepare_batch(row)

                        outputs, loss = train_minibatch(model, optimizer,
                                                        transform_outputs, loss_fn,
                                                        inputs, labels, sample_weights)
                        update_metrics(metric_value_groups, outputs, labels)
                        train_loss.update(loss)
                        print_metrics(batch_idx, train_loss, metric_value_groups, 'train')

                    return aggregate_metrics('train', epoch, train_loss,
                                             metric_value_groups)

                if should_validate:
                    if validation_steps_per_epoch is None:
                        validation_steps = int(
                            math.ceil(float(val_rows) / val_batch_size / hvd.size()))
                    else:
                        validation_steps = validation_steps_per_epoch

                    if hvd.rank() == 0 and user_verbose:
                        print(f"Val rows: {val_rows}, Val batch size: {val_batch_size}, "
                              f"Val_steps_per_epoch: {validation_steps}\n")

                    if inmemory_cache_all:
                        # Petastorm introduced the InMemBatchedDataLoader class in v0.11.0.
                        val_loader = InMemBatchedDataLoader(
                            val_reader,
                            batch_size=val_batch_size,
                            num_epochs=epochs,
                            rows_capacity=validation_steps * val_batch_size,
                            shuffle=False)
                    else:
                        val_loader = BatchedDataLoader(
                            val_reader,
                            batch_size=val_batch_size,
                            shuffling_queue_capacity=0)
                    val_loader_iter = iter(val_loader)

                    def _validate(epoch):
                        model.eval()
                        val_loss = metric_cls('loss', hvd)

                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # Iterate over one epoch.
                        for batch_idx in range(validation_steps):
                            row = next(val_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)

                            outputs = model(*inputs)
                            outputs, labels = transform_outputs(outputs, labels)

                            loss = calculate_loss(outputs, labels, loss_weights,
                                                  loss_fns, sample_weights)
                            val_loss.update(loss)
                            update_metrics(metric_value_groups, outputs, labels)
                            print_metrics(batch_idx, val_loss, metric_value_groups, 'val')

                        return aggregate_metrics('val', epoch, val_loss,
                                                 metric_value_groups)

                history = []
                for epoch in range(epochs):
                    epoch_metrics = {
                        'epoch': epoch,
                        'train': _train(epoch)
                    }

                    if should_validate:
                        epoch_metrics['validation'] = _validate(epoch)

                    if user_verbose > 0:
                        pdt_dt = datetime.now(timezone.utc)
                        pdt_time_str = pdt_dt.strftime("%Y-%b-%d %H:%M:%S UTC")
                        print(pdt_time_str, epoch_metrics)

                    history.append(epoch_metrics)
                    if hvd.rank() == 0:
                        # Save the model after every epoch.
                        save_checkpoint()
                        if remote_store.saving_runs:
                            remote_store.sync(run_output_dir)

        if hvd.rank() == 0:
            best_checkpoint = torch.load(ckpt_file)
            serialized_checkpoint = io.BytesIO()
            torch.save(best_checkpoint, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return history, serialized_checkpoint
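

# Illustrative sketch (an assumption, not the library's actual helper): roughly
# how get_optimizer_with_unscaled_lr, referenced in save_checkpoint() above,
# could behave. It rebuilds an optimizer of the same class with every learning
# rate divided by hvd.size(), so the checkpoint stores the "raw" learning rate
# and a later retraining run can rescale it for its own worker count.
def get_optimizer_with_unscaled_lr_sketch(hvd, current_optimizer, optimizer_cls, model):
    optimizer_state = current_optimizer.state_dict()
    for param_group in optimizer_state['param_groups']:
        # Undo the hvd.size() scaling applied when the optimizer state was loaded.
        param_group['lr'] = param_group['lr'] / hvd.size()
    # lr=1 is a placeholder; load_state_dict overrides it.
    unscaled_optimizer = optimizer_cls(model.parameters(), lr=1)
    unscaled_optimizer.load_state_dict(optimizer_state)
    return unscaled_optimizer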
def train(serialized_model):
    import horovod.torch as hvd

    # Horovod: initialize library.
    hvd.init()

    with tempfile.TemporaryDirectory() as last_ckpt_dir, \
            remote_store.get_local_output_dir() as run_output_dir:
        last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
        if ckpt_bytes:
            with open(last_ckpt_file, 'wb') as f:
                f.write(ckpt_bytes)

        # TODO: Pass the logger from the estimator constructor.
        logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)

        # Use the default logger if no logger is supplied.
        train_logger = logger
        if train_logger is None:
            train_logger = TensorBoardLogger(logs_path)

        # TODO: Find a way to use a ckpt_path created from the remote store, while
        # ingesting all other parameters from the estimator config.
        # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename)
        # os.makedirs(ckpt_path, exist_ok=True)
        # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
        # callbacks.append(model_checkpoint_callback)

        is_model_checkpoint_callback_exist = False
        if callbacks is not None:
            for cb in callbacks:
                if isinstance(cb, ModelCheckpoint):
                    is_model_checkpoint_callback_exist = True
                    break

        model = deserialize(serialized_model)

        _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
            int(math.floor(float(train_rows) / batch_size / hvd.size()))
        _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
            int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

        print(f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, "
              f"batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}.")
        print(f"Validation data of rank[{hvd.local_rank()}]: val_rows:{val_rows}, "
              f"val_batch_size:{val_batch_size}, _val_steps_per_epoch:{_val_steps_per_epoch}, "
              f"should_validate:{should_validate}")

        cuda_available = torch.cuda.is_available()
        # We need to check that all ranks have the same device type for training.
        # Horovod doesn't support heterogeneous allreduce for gradients.
        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
        if cuda_avail_list.count(cuda_available) != hvd.size():
            raise RuntimeError("All ranks don't have same device type!")

        if cuda_available:
            # Horovod: pin GPU to local rank or the GPU assigned by Spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move the model to GPU.
            model.cuda()

        _num_gpus = num_gpus
        if _num_gpus is None:
            _num_gpus = 1 if cuda_available else 0

        kwargs = {
            'accelerator': 'horovod',
            'gpus': _num_gpus,
            'callbacks': callbacks,
            'max_epochs': epochs,
            'logger': train_logger,
            'log_every_n_steps': log_every_n_steps,
            'resume_from_checkpoint': (last_ckpt_file if ckpt_bytes else None),
            'checkpoint_callback': is_model_checkpoint_callback_exist,
            'num_sanity_val_steps': 0,
            'reload_dataloaders_every_epoch': False,
            'progress_bar_refresh_rate': _train_steps_per_epoch // 10
        }
        print("Creating trainer with: \n ", kwargs)

        trainer = Trainer(**kwargs)

        print(f"pytorch_lightning version={pl.__version__}")

        # Print row groups for debugging:
        # pq_file = pq.ParquetFile(remote_store.train_data_path)
        # for rowgroup in range(pq_file.metadata.num_row_groups):
        #     row_group = pq_file.metadata.row_group(rowgroup)
        #     print(row_group)

        with set_data_loader(model, remote_store.train_data_path, 'train_dataloader',
                             train_reader_worker_count, reader_pool_type,
                             calculate_shuffle_buffer_size(),
                             name="train_dataloader",
                             limit_step_per_epoch=_train_steps_per_epoch), \
                set_data_loader(model, remote_store.val_data_path, 'val_dataloader',
                                val_reader_worker_count, reader_pool_type, 0,
                                should_validate, name="val_dataloader",
                                limit_step_per_epoch=_val_steps_per_epoch):
            trainer.fit(model)

        serialized_checkpoint = io.BytesIO()
        module = model if not is_legacy else model._model

        # TODO: find a way to pass trainer.logged_metrics out.
        output = {'model': module.state_dict()}

        torch.save(output, serialized_checkpoint)
        serialized_checkpoint.seek(0)
        return serialized_checkpoint
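

# Illustrative sketch (hypothetical, heavily simplified): how a set_data_loader
# context manager like the one used above could work in principle. The real
# helper also builds a Petastorm reader from the worker count, pool type,
# shuffle size, and step limit passed above; this sketch only shows the
# attach-and-restore pattern that lets trainer.fit(model) find its dataloaders.
import contextlib

@contextlib.contextmanager
def set_data_loader_sketch(model, data_path, dataloader_attr, make_dataloader):
    original = getattr(model, dataloader_attr, None)
    # Temporarily attach a *_dataloader method that reads from data_path.
    setattr(model, dataloader_attr, lambda: make_dataloader(data_path))
    try:
        yield
    finally:
        # Restore whatever was there before (or remove the temporary attribute).
        if original is not None:
            setattr(model, dataloader_attr, original)
        else:
            delattr(model, dataloader_attr)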
def train(serialized_model):
    import horovod.torch as hvd

    # Horovod: initialize library.
    hvd.init()

    with tempfile.TemporaryDirectory() as last_ckpt_dir, \
            remote_store.get_local_output_dir() as run_output_dir:
        last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt')
        if ckpt_bytes:
            with open(last_ckpt_file, 'wb') as f:
                f.write(ckpt_bytes)

        # TODO: Pass the logger from the estimator constructor.
        logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)

        # Use the default logger if no logger is supplied.
        train_logger = logger
        if train_logger is None:
            train_logger = TensorBoardLogger(logs_path)
        elif isinstance(train_logger, CometLogger) and train_logger._save_dir is None:
            # Setting the CometLogger's save_dir allows us to sync checkpoints and
            # profiler output.
            train_logger._save_dir = logs_path

        # TODO: Find a way to use a ckpt_path created from the remote store, while
        # ingesting all other parameters from the estimator config.
        # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename)
        # os.makedirs(ckpt_path, exist_ok=True)
        # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)
        # callbacks.append(model_checkpoint_callback)

        is_model_checkpoint_callback_exist = False
        for cb in callbacks:
            if isinstance(cb, ModelCheckpoint):
                is_model_checkpoint_callback_exist = True
                break

        if remote_store.saving_runs and hvd.rank() == 0:
            class _SyncCallback(Callback):
                def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
                    print("Syncing to remote_store.")
                    remote_store.sync(logs_path)

            callbacks.append(_SyncCallback())

        model = deserialize(serialized_model)

        _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
            int(math.floor(float(train_rows) / batch_size / hvd.size()))
        _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
            int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

        print(f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, "
              f"batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}.")

        cuda_available = torch.cuda.is_available()
        # We need to check that all ranks have the same device type for training.
        # Horovod doesn't support heterogeneous allreduce for gradients.
        cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
        if cuda_avail_list.count(cuda_available) != hvd.size():
            raise RuntimeError("All ranks don't have same device type!")

        if cuda_available:
            # Horovod: pin GPU to local rank or the GPU assigned by Spark.
            torch.cuda.set_device(
                _get_assigned_gpu_or_default(default=hvd.local_rank()))
            # Move the model to GPU.
            model.cuda()

        _num_gpus = num_gpus
        if _num_gpus is None:
            _num_gpus = 1 if cuda_available else 0

        kwargs = {
            'accelerator': 'horovod',
            'gpus': _num_gpus,
            'callbacks': callbacks,
            'max_epochs': epochs,
            'logger': train_logger,
            'log_every_n_steps': log_every_n_steps,
            'resume_from_checkpoint': (last_ckpt_file if ckpt_bytes else None),
            'checkpoint_callback': is_model_checkpoint_callback_exist,
            'num_sanity_val_steps': 0,
            'reload_dataloaders_every_epoch': False,
            'progress_bar_refresh_rate': _train_steps_per_epoch // 10,
            'terminate_on_nan': terminate_on_nan,
            'profiler': estimator.getProfiler()
        }
        print("Creating trainer with: \n ", kwargs)

        trainer = Trainer(**kwargs)

        print(f"pytorch_lightning version={pl.__version__}")

        dataset = data_module(train_dir=remote_store.train_data_path,
                              val_dir=remote_store.val_data_path,
                              num_train_epochs=epochs,
                              has_val=should_validate is not None,
                              train_batch_size=batch_size,
                              val_batch_size=val_batch_size,
                              shuffle_size=calculate_shuffle_buffer_size(),
                              num_reader_epochs=loader_num_epochs,
                              reader_pool_type=reader_pool_type,
                              reader_worker_count=train_reader_worker_count,
                              transform_spec=transformation,
                              inmemory_cache_all=inmemory_cache_all,
                              cur_shard=hvd.rank(),
                              shard_count=hvd.size(),
                              schema_fields=schema_fields,
                              storage_options=storage_options,
                              steps_per_epoch_train=_train_steps_per_epoch,
                              steps_per_epoch_val=_val_steps_per_epoch,
                              verbose=verbose)

        trainer.fit(model, dataset)

        serialized_checkpoint = io.BytesIO()
        module = model if not is_legacy else model._model

        # TODO: find a way to pass trainer.logged_metrics out.
        output = {'model': module.state_dict()}

        torch.save(output, serialized_checkpoint)
        serialized_checkpoint.seek(0)
        return serialized_checkpoint
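

# Illustrative sketch (hypothetical driver-side usage): unlike the first train()
# variant in this file, the two functions above return only {'model': state_dict}
# with the buffer already rewound via seek(0). `build_module` stands in for
# however the caller reconstructs the LightningModule that was serialized into
# the estimator; it is not defined in the code above.
def restore_module_from_checkpoint(serialized_checkpoint, build_module):
    import torch
    checkpoint = torch.load(serialized_checkpoint, map_location='cpu')
    module = build_module()
    module.load_state_dict(checkpoint['model'])
    return module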