def load_batch_into_buffer(
        self,
        batch: SampleBatch,
        buffer_index: int = 0,
) -> int:
    """Load `batch` into the multi-GPU buffer selected by `buffer_index`.

    The batch is always flagged as a training batch first. When the only
    device is "/cpu:0", the batch is simply kept in
    `self._loaded_single_cpu_batch`; otherwise its loss inputs are passed
    on to the tower stack at `buffer_index`.

    Args:
        batch: The batch to load.
        buffer_index: Index into `self.multi_gpu_tower_stacks`. Asserted
            to be 0 on the single-CPU path.

    Returns:
        `len(batch)` on the single-CPU path, otherwise the return value
        of the tower stack's `load_data()` call.
    """
    batch.is_training = True

    # Single-CPU shortcut: nothing is copied anywhere, the batch is just
    # remembered on the policy.
    only_cpu = len(self.devices) == 1 and self.devices[0] == "/cpu:0"
    if only_cpu:
        assert buffer_index == 0
        self._loaded_single_cpu_batch = batch
        return len(batch)

    feed = self._get_loss_inputs_dict(batch, shuffle=False)

    # Keys into `feed`: the non-RNN loss inputs, plus (only if the policy
    # has state inputs) the state inputs followed by the seq-lens entry.
    data_keys = list(self._loss_input_dict_no_rnn.values())
    state_keys = (
        self._state_inputs + [self._seq_lens] if self._state_inputs else [])

    data_values = [feed[key] for key in data_keys]
    state_values = [feed[key] for key in state_keys]

    tower_stack = self.multi_gpu_tower_stacks[buffer_index]
    return tower_stack.load_data(
        sess=self.get_session(),
        inputs=data_values,
        state_inputs=state_values,
    )
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    """Compute gradients for `postprocessed_batch` on the single device.

    The batch is zero-padded in place if it is not flagged as
    `zero_padded` yet, marked as a training batch and converted via
    `self._lazy_tensor_dict()` for `self.devices[0]`.

    Args:
        postprocessed_batch: The batch to compute gradients for.

    Returns:
        A tuple `(all_grads, info)`. `info` consists of the extra
        compute-grad fetches plus the gradient stats stored under
        `LEARNER_STATS_KEY`.
    """
    # This code path handles exactly one device.
    assert len(self.devices) == 1

    # Pad only if nobody has done so before.
    if not postprocessed_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            batch=postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    postprocessed_batch.is_training = True
    target_device = self.devices[0]
    self._lazy_tensor_dict(postprocessed_batch, device=target_device)

    # One batch in -> one tower output out.
    outputs = self._multi_gpu_parallel_grad_calc([postprocessed_batch])
    all_grads, grad_info = outputs[0]

    # Turn the latency stat into a per-optimizer value.
    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(postprocessed_batch))

    info = dict(self.extra_compute_grad_fetches())
    info[LEARNER_STATS_KEY] = grad_info
    return all_grads, info
def load_batch_into_buffer(
        self,
        batch: SampleBatch,
        buffer_index: int = 0,
) -> int:
    """Pad `batch` and store it, split per device, in a loaded-batch slot.

    With a single device of type "cpu", the whole batch is padded and
    kept as the only entry of `self._loaded_batches[0]`. Otherwise the
    batch is cut into one time slice per device, each slice is padded and
    moved to its device, and the resulting list is stored under
    `self._loaded_batches[buffer_index]`.

    Args:
        batch: The batch to load. Flagged as a training batch.
        buffer_index: Slot of `self._loaded_batches` to fill. Asserted to
            be 0 on the single-CPU path.

    Returns:
        The number of samples loaded per device: `len(batch)` on the
        single-CPU path, otherwise the length of the first slice.
    """
    batch.is_training = True

    # Shared padding settings for both code paths below.
    pad_settings = dict(
        max_seq_len=self.max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )

    # 1-CPU shortcut: no slicing, the batch itself becomes the one and
    # only loaded batch.
    if len(self.devices) == 1 and self.devices[0].type == "cpu":
        assert buffer_index == 0
        pad_batch_to_sequences_of_same_size(batch=batch, **pad_settings)
        self._lazy_tensor_dict(batch)
        self._loaded_batches[0] = [batch]
        return len(batch)

    # Worked example for a batch with len=28, seq-lens=[4, 7, 4, 10, 3]:
    #   0123 0123456 0123 0123456789ABC
    #
    # Step 1: split into n per-device sub-batches (here n=2):
    #   [0123 0123456] [012] [3 0123456789 ABC]
    #   (len=14, 14 seq-lens=[4, 7, 3] [1, 10, 3])
    per_device = batch.timeslices(num_slices=len(self.devices))

    # Step 2: zero-pad every sub-batch (max-seq-len=10):
    #   - [0123000000 0123456000 0120000000]
    #   - [3000000000 0123456789 ABC0000000]
    for sub_batch in per_device:
        pad_batch_to_sequences_of_same_size(batch=sub_batch, **pad_settings)

    # Step 3: move sub-batch i to device i and remember the result in the
    # requested buffer.
    per_device = [
        sub_batch.to_device(self.devices[idx])
        for idx, sub_batch in enumerate(per_device)
    ]
    self._loaded_batches[buffer_index] = per_device

    # Samples loaded per device.
    return len(per_device[0])
def _get_loss_inputs_dict(self, train_batch: SampleBatch, shuffle: bool):
    """Build the feed dict for the loss from a train batch.

    Args:
        train_batch (SampleBatch): Batch of data to derive inputs from.
            It is padded here unless it is a SampleBatch already flagged
            as `zero_padded`.
        shuffle (bool): Whether to shuffle batch sequences while padding.
            Shuffling may happen in-place and only makes sense if
            minibatch SGD is applied to the outputs afterwards.

    Returns:
        Feed dict mapping loss-input placeholders to batch data.
    """
    already_padded = (isinstance(train_batch, SampleBatch)
                      and train_batch.zero_padded)
    if already_padded:
        # No padding needed; only expose the seq-lens as a batch column.
        train_batch["seq_lens"] = train_batch.seq_lens
    else:
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            train_batch,
            max_seq_len=self._max_seq_len,
            shuffle=shuffle,
            batch_divisibility_req=self._batch_divisibility_req,
            feature_keys=list(self._loss_input_dict_no_rnn.keys()),
            view_requirements=self.view_requirements,
        )

    # Flag the batch so the Model can tell it is used for training.
    train_batch.is_training = True

    # One entry per loss input: placeholder -> batch column.
    feed_dict = {
        placeholder: train_batch[key]
        for key, placeholder in self._loss_input_dict.items()
    }

    # State inputs (and, if there are any, the seq-lens) are fed as well.
    num_states = len(self._state_inputs)
    state_keys = ["state_in_{}".format(idx) for idx in range(num_states)]
    for state_key in state_keys:
        feed_dict[self._loss_input_dict[state_key]] = train_batch[state_key]
    if state_keys:
        feed_dict[self._seq_lens] = train_batch["seq_lens"]

    return feed_dict
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    """Compute gradients for `postprocessed_batch` on the single device.

    The batch is marked as a training batch and converted via
    `self._lazy_tensor_dict()` for `self.devices[0]`; no padding is done
    here.

    Args:
        postprocessed_batch: The batch to compute gradients for.

    Returns:
        A tuple `(all_grads, info)`. `info` consists of the extra
        compute-grad fetches plus the gradient stats stored under
        `LEARNER_STATS_KEY`.
    """
    # This code path handles exactly one device.
    assert len(self.devices) == 1

    postprocessed_batch.is_training = True
    target_device = self.devices[0]
    self._lazy_tensor_dict(postprocessed_batch, device=target_device)

    # One batch in -> one tower output out.
    outputs = self._multi_gpu_parallel_grad_calc([postprocessed_batch])
    all_grads, grad_info = outputs[0]

    # Turn the latency stat into a per-optimizer value.
    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(postprocessed_batch))

    info = dict(self.extra_compute_grad_fetches())
    info[LEARNER_STATS_KEY] = grad_info
    return all_grads, info
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    """Compute gradients for a batch on one or on several devices.

    The batch is padded unless it is a SampleBatch already flagged as
    `zero_padded`, then marked as a training batch. With one device the
    batch is used as-is; otherwise it is cut into one shard per device,
    the main model's weights are loaded into every tower, and the
    per-tower gradients are mean-reduced and written to the main model's
    parameters.

    Args:
        postprocessed_batch: The batch to compute gradients for.

    Returns:
        A tuple `(all_grads, info)`. `info` consists of the extra
        compute-grad fetches plus the gradient stats stored under
        `LEARNER_STATS_KEY`.
    """
    already_padded = (isinstance(postprocessed_batch, SampleBatch)
                      and postprocessed_batch.zero_padded)
    if not already_padded:
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    # Flag the batch so the Model can tell it is used for training.
    postprocessed_batch.is_training = True

    num_devices = len(self.devices)

    if num_devices == 1:
        # Single device: no slicing needed.
        device_batches = [self._lazy_tensor_dict(postprocessed_batch)]
    else:
        # Several devices: hand out n (roughly) equal shards. Dividing
        # what is left by the number of devices still waiting spreads any
        # remainder over the later shards.
        device_batches = []
        remaining = len(postprocessed_batch)
        offset = 0
        for dev_idx, device in enumerate(self.devices):
            shard_len = remaining // (num_devices - dev_idx)
            shard = postprocessed_batch.slice(offset, offset + shard_len)
            device_batches.append(
                self._lazy_tensor_dict(shard, device=device))
            remaining -= shard_len
            offset += shard_len

        # Sync every tower with the main model's current weights.
        main_weights = self.model.state_dict()
        for tower in self.model_gpu_towers:
            tower.load_state_dict(main_weights)

    # The (possibly parallelized) gradient calculation itself.
    tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches)

    if num_devices > 1:
        # Average each gradient over all towers. Whether a gradient
        # exists at all is decided by looking at the first tower only.
        all_grads = []
        for grad_idx, first_tower_grad in enumerate(tower_outputs[0][0]):
            if first_tower_grad is None:
                all_grads.append(None)
                continue
            stacked = torch.stack([
                output[0][grad_idx].to(self.device)
                for output in tower_outputs
            ])
            all_grads.append(torch.mean(stacked, dim=0))

        # Hand the averaged gradients to the main model's parameters.
        for param_idx, param in enumerate(self.model.parameters()):
            param.grad = all_grads[param_idx]

        # The per-tower stats get reduced, too.
        from ray.rllib.execution.train_ops import all_tower_reduce
        grad_info = tree.map_structure_with_path(
            all_tower_reduce, *[output[1] for output in tower_outputs])
    else:
        # Single device: the first tower's output is the result.
        all_grads, grad_info = tower_outputs[0]

    # Turn the latency stat into a per-optimizer value.
    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(postprocessed_batch))

    info = dict(self.extra_compute_grad_fetches())
    info[LEARNER_STATS_KEY] = grad_info
    return all_grads, info
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    """Compute the loss gradients for every optimizer of this policy.

    The batch is padded unless it is a SampleBatch already flagged as
    `zero_padded`, then marked as a training batch. One loss per
    optimizer is computed and back-propagated; if
    `self.distributed_world_size` is set, the gradients are additionally
    sum-all-reduced and divided by the world size.

    Args:
        postprocessed_batch: The batch to compute gradients for.

    Returns:
        A tuple `(all_grads, info)`. `all_grads` holds one numpy array
        (or None where a parameter has no gradient) per parameter of
        every optimizer. `info` consists of the extra compute-grad
        fetches plus the gradient stats stored under `LEARNER_STATS_KEY`.
    """
    already_padded = (isinstance(postprocessed_batch, SampleBatch)
                      and postprocessed_batch.zero_padded)
    if already_padded:
        # No padding needed; only expose the seq-lens as a batch column.
        postprocessed_batch["seq_lens"] = postprocessed_batch.seq_lens
    else:
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    # Flag the batch so the Model can tell it is used for training.
    postprocessed_batch.is_training = True
    train_batch = self._lazy_tensor_dict(postprocessed_batch)

    # Policy loss first, ...
    losses = force_list(
        self._loss(self, self.model, self.dist_class, train_batch))
    # ... then the Model may alter it through its custom loss, ...
    if self.model:
        losses = self.model.custom_loss(losses, train_batch)
    # ... and finally the Exploration component may change it or add its
    # own terms.
    if hasattr(self, "exploration"):
        losses = self.exploration.get_exploration_loss(losses, train_batch)

    # Exactly one loss term per optimizer is required.
    assert len(losses) == len(self._optimizers)

    fetches = self.extra_compute_grad_fetches()

    grad_info = {"allreduce_latency": 0.0}
    all_grads = []
    for opt_idx, optimizer in enumerate(self._optimizers):
        # Start from clean gradients for this optimizer's variables.
        optimizer.zero_grad()

        # Back-propagate this optimizer's loss. The graph is retained for
        # all but the last optimizer.
        loss = losses[opt_idx]
        loss.backward(retain_graph=(opt_idx < len(self._optimizers) - 1))
        grad_info.update(self.extra_grad_process(optimizer, loss))

        # Collect the gradients. The tensors gathered in `local_grads`
        # are references only: calling zero_grad would modify them.
        local_grads = []
        for group in optimizer.param_groups:
            for param in group["params"]:
                if param.grad is None:
                    all_grads.append(None)
                    continue
                local_grads.append(param.grad)
                all_grads.append(param.grad.data.cpu().numpy())

        # Everything below only applies to distributed training.
        if not self.distributed_world_size:
            continue

        reduce_start = time.time()
        if torch.cuda.is_available():
            # Sadly, allreduce_coalesced does not work with CUDA yet.
            for grad in local_grads:
                torch.distributed.all_reduce(
                    grad, op=torch.distributed.ReduceOp.SUM)
        else:
            torch.distributed.all_reduce_coalesced(
                local_grads, op=torch.distributed.ReduceOp.SUM)

        # Sum -> mean over all workers.
        for group in optimizer.param_groups:
            for param in group["params"]:
                if param.grad is not None:
                    param.grad /= self.distributed_world_size

        grad_info["allreduce_latency"] += time.time() - reduce_start

    # Turn the accumulated latency into a per-optimizer value.
    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(train_batch))

    info = dict(fetches)
    info[LEARNER_STATS_KEY] = grad_info
    return all_grads, info