def compute_importances(self, model, criterion, optimizer, dataset, device,
                        batch_size):
    """
    Compute EWC importance matrix for each parameter
    """
    model.train()

    # list of (name, tensor) pairs
    importances = zerolike_params_dict(model)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    for i, (x, y, task_labels) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = avalanche_forward(model, x, task_labels)
        loss = criterion(out, y)
        loss.backward()

        for (k1, p), (k2, imp) in zip(model.named_parameters(), importances):
            assert k1 == k2
            if p.grad is not None:
                imp += p.grad.data.clone().pow(2)

    # average over the number of mini-batches
    for _, imp in importances:
        imp /= float(len(dataloader))

    return importances
def _update_grad(self, strategy):
    model = strategy.model
    batch = strategy.mbatch

    model.eval()

    # Set RNN-like modules on GPU to training mode to avoid CUDA error
    if strategy.device == "cuda":
        for module in model.modules():
            if isinstance(module, torch.nn.RNNBase):
                warnings.warn(
                    "RNN-like modules do not support "
                    "backward calls while in `eval` mode on CUDA "
                    "devices. Setting all `RNNBase` modules to "
                    "`train` mode. May produce inconsistent "
                    "output if such modules have `dropout` > 0."
                )
                module.train()

    x, y, task_labels = batch[0], batch[1], batch[-1]

    strategy.optimizer.zero_grad()
    out = avalanche_forward(model, x, task_labels)
    loss = strategy._criterion(out, y)  # noqa
    loss.backward()

    self.iter_grad = copy_params_dict(model, copy_grad=True)
def _get_importance(self, strategy: BaseSGDTemplate):
    # Initialize importance matrix
    importance = dict(zerolike_params_dict(strategy.model))

    if not strategy.experience:
        raise ValueError("Current experience is not available")

    if strategy.experience.dataset is None:
        raise ValueError("Current dataset is not available")

    # Do forward and backward pass to accumulate L2-loss gradients
    strategy.model.train()
    dataloader = DataLoader(
        strategy.experience.dataset,
        batch_size=strategy.train_mb_size,
    )  # type: ignore

    # Progress bar
    if self.verbose:
        print("Computing importance")
        dataloader = tqdm(dataloader)

    for _, batch in enumerate(dataloader):
        # Get batch
        if len(batch) == 2 or len(batch) == 3:
            x, _, t = batch[0], batch[1], batch[-1]
        else:
            raise ValueError("Batch size is not valid")

        # Move batch to device
        x = x.to(strategy.device)

        # Forward pass
        strategy.model.zero_grad()
        out = avalanche_forward(strategy.model, x, t)

        # Average L2-Norm of the output
        loss = torch.norm(out, p="fro", dim=1).mean()
        loss.backward()

        # Accumulate importance
        for name, param in strategy.model.named_parameters():
            if param.requires_grad:
                # In multi-head architectures, the gradient is going
                # to be None for all the heads different from the
                # current one.
                if param.grad is not None:
                    importance[name] += param.grad.abs() * len(batch)

    # Normalize importance
    importance = {
        name: importance[name] / len(dataloader)
        for name in importance.keys()
    }

    return importance
def compute_importances(
    self, model, criterion, optimizer, dataset, device, batch_size
):
    """
    Compute EWC importance matrix for each parameter
    """
    model.eval()

    # Set RNN-like modules on GPU to training mode to avoid CUDA error
    if device == "cuda":
        for module in model.modules():
            if isinstance(module, torch.nn.RNNBase):
                warnings.warn(
                    "RNN-like modules do not support "
                    "backward calls while in `eval` mode on CUDA "
                    "devices. Setting all `RNNBase` modules to "
                    "`train` mode. May produce inconsistent "
                    "output if such modules have `dropout` > 0."
                )
                module.train()

    # list of (name, tensor) pairs
    importances = zerolike_params_dict(model)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    for i, batch in enumerate(dataloader):
        # get only input, target and task_id from the batch
        x, y, task_labels = batch[0], batch[1], batch[-1]
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = avalanche_forward(model, x, task_labels)
        loss = criterion(out, y)
        loss.backward()

        for (k1, p), (k2, imp) in zip(
            model.named_parameters(), importances
        ):
            assert k1 == k2
            if p.grad is not None:
                imp += p.grad.data.clone().pow(2)

    # average over the number of mini-batches
    for _, imp in importances:
        imp /= float(len(dataloader))

    return importances
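# A minimal sketch (not part of the snippets above) of how the importances
# returned by compute_importances could be consumed in an EWC penalty term.
# `saved_params` (a snapshot of the previous parameters in the same
# (name, tensor) list format) and `ewc_lambda` are illustrative names, not
# taken from the code above.
def ewc_penalty(model, saved_params, importances, ewc_lambda):
    penalty = 0.0
    for (name, param), (k_old, old_param), (k_imp, imp) in zip(
        model.named_parameters(), saved_params, importances
    ):
        assert name == k_old == k_imp
        # Quadratic pull toward the old parameters, weighted by importance
        penalty += (imp * (param - old_param).pow(2)).sum()
    return ewc_lambda / 2 * penalty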
def inner_update(self, fast_model, x, y, t):
    """Update fast weights using current samples and return the updated
    fast model.
    """
    logits = avalanche_forward(fast_model, x, t)
    loss = self._criterion(logits, y)

    # Compute gradient with respect to the current fast weights
    grads = list(
        torch.autograd.grad(
            loss,
            fast_model.fast_params,
            create_graph=self.second_order,
            retain_graph=self.second_order,
            allow_unused=True,
        )
    )

    # Clip grad norms
    grads = [
        torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm)
        if g is not None
        else g
        for g in grads
    ]

    # New fast parameters
    new_fast_params = [
        param - alpha * grad if grad is not None else param
        for (param, alpha, grad) in zip(
            fast_model.fast_params, self.alpha_params.parameters(), grads
        )
    ]

    # Update fast model's weights
    fast_model.update_params(new_fast_params)
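# A hypothetical sketch of how the per-parameter learning rates
# (`alpha_params`) used by inner_update could be constructed: one learnable
# tensor per model parameter, initialised to a constant inner-loop learning
# rate. The helper name, the shape choice, and the init value are
# assumptions for illustration, not taken from the snippets above.
import torch
import torch.nn as nn

def make_alpha_params(model, init_lr=0.1):
    return nn.ParameterList(
        nn.Parameter(torch.full_like(p, init_lr)) for p in model.parameters()
    )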
def forward(self):
    return avalanche_forward(self.model, self.mb_x, self.mb_task_id)
def train_batch(self):
    # Create a stateless copy of the model for inner-updates
    fast_model = higher.patch.monkeypatch(
        self.model,
        copy_initial_weights=True,
        track_higher_grads=self.second_order,
    )
    if self.clock.train_exp_counter > 0:
        batch_x = self.mb_x[: self.train_mb_size]
        batch_y = self.mb_y[: self.train_mb_size]
        batch_t = self.mb_task_id[: self.train_mb_size]
    else:
        batch_x, batch_y, batch_t = self.mb_x, self.mb_y, self.mb_task_id

    bsize_data = batch_x.shape[0]
    rough_sz = math.ceil(bsize_data / self.n_inner_updates)
    meta_losses = [0 for _ in range(self.n_inner_updates)]

    for i in range(self.n_inner_updates):
        batch_x_i = batch_x[i * rough_sz : (i + 1) * rough_sz]
        batch_y_i = batch_y[i * rough_sz : (i + 1) * rough_sz]
        batch_t_i = batch_t[i * rough_sz : (i + 1) * rough_sz]

        # We assume that samples for inner update are from the same task
        self.inner_update(fast_model, batch_x_i, batch_y_i, batch_t_i)

        # Compute meta-loss with the combination of batch and buffer samples
        logits_meta = avalanche_forward(fast_model, self.mb_x, self.mb_task_id)
        meta_loss = self._criterion(logits_meta, self.mb_y)
        meta_losses[i] = meta_loss

    # Compute meta-gradient for the main model
    meta_loss = sum(meta_losses) / len(meta_losses)
    meta_grad_model = torch.autograd.grad(
        meta_loss,
        fast_model.parameters(time=0),
        retain_graph=True,
        allow_unused=True,
    )
    self.model.zero_grad()
    self.apply_grad(self.model, meta_grad_model)

    # Clip gradients
    torch.nn.utils.clip_grad_norm_(
        self.model.parameters(), self.grad_clip_norm
    )

    if self.learn_lr:
        # Compute meta-gradient for alpha-lr parameters
        meta_grad_alpha = torch.autograd.grad(
            meta_loss, self.alpha_params.parameters(), allow_unused=True
        )
        self.alpha_params.zero_grad()
        self.apply_grad(self.alpha_params, meta_grad_alpha)

        torch.nn.utils.clip_grad_norm_(
            self.alpha_params.parameters(), self.grad_clip_norm
        )
        self.optimizer_alpha.step()

    # If sync-update: update with self.optimizer
    # o.w: use the learned LRs to update the model
    if self.sync_update:
        self.optimizer.step()
    else:
        for p, alpha in zip(
            self.model.parameters(), self.alpha_params.parameters()
        ):
            # Use relu on updated LRs to avoid negative values
            p.data = p.data - p.grad * F.relu(alpha)

    self.loss = meta_loss
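# train_batch relies on an apply_grad helper to copy the meta-gradients onto
# a module's .grad fields before the optimizer step. Below is a minimal
# sketch of what such a helper could look like, written as a standalone
# function; it is an assumption for illustration, not the actual
# implementation used above.
def apply_grad(module, grads):
    for param, grad in zip(module.parameters(), grads):
        # allow_unused=True may yield None gradients (e.g. unused heads)
        if grad is None:
            continue
        if param.grad is None:
            param.grad = grad.detach().clone()
        else:
            param.grad += grad.detach()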
def forward(self): """Compute the model's output given the current mini-batch.""" return avalanche_forward(self.model, self.mb_x, self.mb_task_id)
def environment_to_experience(self, env, setting):
    all_observations: List[Observations] = []
    all_rewards: List[Rewards] = []

    for batch in tqdm.tqdm(
        env, desc="Converting environment into TensorDataset"
    ):
        observations: Observations
        rewards: Optional[Rewards]

        if isinstance(batch, Observations):
            observations = batch
            rewards = None
        else:
            assert isinstance(batch, tuple) and len(batch) == 2
            observations, rewards = batch

        if rewards is None:
            # Need to send actions to the env before we can actually get the
            # associated Reward. Here there are (at least) three options to
            # choose from:

            # Option 1: Select action at random:
            # action = env.action_space.sample()
            # if observations.batch_size != action.shape[0]:
            #     action = action[: observations.batch_size]
            # rewards: Rewards = env.send(action)

            # Option 2: Use the current model, in 'inference' mode:
            # action = self.get_actions(
            #     observations, action_space=env.action_space)
            # rewards: Rewards = env.send(action)

            # Option 3: Train an online model:
            # NOTE: You might have to change this for your strategy. For
            # instance, this currently does not take any plugins into
            # consideration.
            self.cl_strategy.optimizer.zero_grad()
            x = observations.x.to(self.cl_strategy.device)
            task_labels = observations.task_labels
            logits = avalanche_forward(self.model, x=x, task_labels=task_labels)
            y_pred = logits.argmax(-1)
            action = self.target_setting.Actions(y_pred=y_pred)

            rewards: Rewards = env.send(action)
            y = rewards.y.to(self.cl_strategy.device)

            # Train the model:
            loss = self.cl_strategy.criterion(logits, y)
            loss.backward()
            self.cl_strategy.optimizer.step()

        all_observations.append(observations)
        all_rewards.append(rewards)

    # Stack all the observations into a single `Observations` object:
    stacked_observations: Observations = Observations.concatenate(
        all_observations
    )
    x = stacked_observations.x
    task_labels = stacked_observations.task_labels
    stacked_rewards: Rewards = Rewards.concatenate(all_rewards)
    y = stacked_rewards.y
    return SequoiaExperience(
        env=env, setting=setting, x=x, y=y, task_labels=task_labels
    )