def gradient_descent_step(w: torch.Tensor, L: torch.Tensor, lr: float = 1e-3) -> None:
    """Perform a single step of gradient descent, updating ``w`` in place.

    Any gradient already stored on ``w`` is zeroed, ``L`` is back-propagated,
    and ``w`` is overwritten with ``w - lr * w.grad``. Nothing is returned.

    Args:
        w: input variable; modified in place.
        L: loss to back-propagate.
        lr (optional): learning rate/step size. Defaults to 1e-3.
    """
    # Manually zero out the gradient so this step does not accumulate onto a
    # gradient left over from a previous backward pass.
    if w.grad is not None:
        w.grad.zero_()

    # retain_graph=True keeps the autograd graph of L alive after this
    # backward pass instead of letting PyTorch free it.
    L.backward(retain_graph=True)

    # Apply the update with autograd disabled so the step is not recorded.
    with torch.no_grad():
        w -= lr * w.grad
def optimization_step(self, network: torch.nn.Module, optimizer: torch.optim.Optimizer, loss: torch.Tensor) -> float:
    """Perform an optimization step of the given network with respect to the given loss.

    This should be manually called in self.training_step(). The network is
    appended to the trainer's ``grad_clip_buffer`` and the trainer's
    ``_after_backward`` hook is invoked after the backward pass, which is
    where gradient clipping runs if such a callback is set on the trainer.

    Args:
        network: The network to update (needed for any callbacks which need
            the network's parameters).
        optimizer: The optimizer whose params should match the given
            network's params.
        loss: Loss for updating the network.

    Returns:
        The loss value, read via ``loss.item()`` before the backward pass.
    """
    trainer = self.trainer

    optimizer.zero_grad()
    trainer.loss = loss
    loss_value = loss.item()
    trainer.grad_clip_buffer.append(network)

    if trainer.use_amp:
        # Mixed precision: back-propagate the scaled loss.
        trainer.scaler.scale(loss).backward()
        # Gradients are unscaled up front only when a GradClipping callback
        # is registered, so that the hook below sees unscaled gradients.
        if any(isinstance(cb, GradClipping) for cb in trainer.cbs):
            trainer.scaler.unscale_(optimizer)
        trainer._after_backward()  # Will run grad_clipping if necessary
        trainer.scaler.step(optimizer)
        trainer.scaler.update()
    else:
        loss.backward()
        trainer._after_backward()  # Will run grad_clipping if necessary
        optimizer.step()

    return loss_value
def update_weights(self, p: torch.Tensor, p_next: torch.Tensor):
    """Update every parameter using eligibility traces and return the error.

    For each parameter ``i`` the trace is first decayed and accumulated,
    ``trace_i = gamma * lambd * trace_i + grad_i``, and the weights are then
    moved by ``lr * delta * trace_i`` where ``delta = p_next - p``.

    Args:
        p: current prediction; ``p.backward()`` is called on it, so it must
            be something autograd can back-propagate from directly.
        p_next: the next prediction, used as the target for ``p``.

    Returns:
        ``delta = p_next - p``, still attached to the autograd graph.
    """
    delta = p_next - p

    self.zero_grad()
    # The graph is retained rather than freed after this backward pass.
    p.backward(retain_graph=True)

    decay = self.gamma * self.lambd
    # Detach so the weight update does not build an autograd graph of its own.
    step = self.lr * delta.detach()

    for i, param in enumerate(self.parameters()):
        trace = decay * self.eligibility_traces[i]
        # A parameter that did not contribute to ``p`` has no gradient after
        # zero_grad()/backward(); treat it as zero so its trace only decays
        # instead of failing on ``None``.
        if param.grad is not None:
            trace = trace + param.grad.detach()
        self.eligibility_traces[i] = trace
        # Rebind .data (rather than updating in place) as the original did.
        param.data = (param.data + step * trace).detach()

    return delta
def _compute_gradients(loss: tensor) -> None:
    """Back-propagate ``loss`` by calling its ``backward()`` method.

    Args:
        loss: the value to differentiate; it only needs to expose
            ``backward()`` (presumably a torch tensor - confirm against the
            file's imports).
    """
    loss.backward()