def train_policy_on_episode(optimizer: Optimizer, training_info: TrainingInfo, episode_number: int,
                            entropy_decay: float = 0.99):
    """Trains both the actor and the critic using all transitions of the latest episode.

    The critic's loss is the smooth-L1 error between V(state) and the discounted
    return. The actor / policy is trained by maximizing log-probability * advantage,
    and an entropy term is added to encourage exploration. The entropy weight is
    decayed at each new episode by ``entropy_decay``.

    Args:
        optimizer: optimizer over both actor and critic parameters.
        training_info: per-episode buffer exposing log_probs, state_values,
            entropies and (after computation) discounted_rewards.
        episode_number: index of the current episode, used to decay the entropy bonus.
        entropy_decay: per-episode multiplicative decay of the entropy bonus
            (previously hard-coded to 0.99; default preserves old behavior).
    """
    training_info.compute_discounted_rewards()

    # Compute the loss of the policy and the critic at each time step
    policy_losses = []  # Policy errors
    value_losses = []  # Critic errors
    for log_prob, discounted_reward, state_value, entropy in zip(
            training_info.log_probs,
            training_info.discounted_rewards,
            training_info.state_values,
            training_info.entropies):
        # .item() detaches the baseline, so the policy loss does not
        # backpropagate into the critic.
        advantage = discounted_reward - state_value.item()
        policy_losses.append(-(log_prob + entropy_decay ** episode_number * entropy) * advantage)
        value_losses.append(
            F.smooth_l1_loss(state_value.squeeze(0), torch.tensor([discounted_reward])))

    # Optimize the policy and the critic jointly
    optimizer.zero_grad()
    total_loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    total_loss.backward()
    optimizer.step()

    # Reset the state of the episode
    training_info.reset()
def __init__(self, model_params, is_ml=True, lr=0.1, noise_r=0, noise_T=-1,
             noise_eps=0, momentum=0, NCE=False, NCE_s=0, NCE_gamma=0,
             is_verbose=False):
    """Initializes the optimizer wrapper and stores its hyperparameters.

    Args:
        model_params: parameters (or param groups) to optimize.
        is_ml: when True, delegate state bookkeeping to the torch Optimizer
            base class; otherwise keep a plain per-parameter state dict.
        lr: learning rate.
        noise_r, noise_T, noise_eps: noise-injection hyperparameters
            (semantics defined by the step logic elsewhere in this class).
        momentum: momentum coefficient.
        NCE, NCE_s, NCE_gamma: NCE-related settings; see note below.
        is_verbose: enables verbose output.
    """
    self.is_ml = is_ml
    if is_ml:
        # Delegate parameter/state management to the torch Optimizer base.
        Optimizer.__init__(self, model_params, dict())
    else:
        self.state = defaultdict(dict)
    self.lr = lr
    self.noise_r = noise_r
    self.noise_T = noise_T
    self.noise_eps = noise_eps
    self.momentum = momentum
    self.NCE = NCE
    self.NCE_s = NCE_s
    # NOTE(review): the NCE_gamma argument is ignored -- the attribute is
    # always recomputed as (1 - momentum)^2 / lr. Confirm whether callers are
    # expected to be able to override it.
    self.NCE_gamma = pow((1 - momentum), 2) / lr
    self.is_verbose = is_verbose
def train_policy_on_step(critic: SimpleCritic, optimizer: Optimizer, reward: float, state: np.ndarray,
                         next_state: np.ndarray, gamma: float, log_prob: float, entropy: float,
                         episode_number: int, run_params: RunParams):
    """Trains both the actor and the critic using the given transition.

    The critic's loss is the MSE between V(state) and the bootstrapped target
    reward + gamma * V(next_state). The actor / policy is trained by maximizing
    log-probability * td-error, with an entropy bonus whose weight is
    run_params.entropy_coeff decayed by run_params.entropy_decay per episode.
    """
    # Inspired from https://gym.openai.com/evaluations/eval_gUhDnmlbTKG1qW0jS6HSg/
    state, next_state = prepare_state(state), prepare_state(next_state)

    # BUG FIX: the bootstrap target is detached so the critic regresses towards
    # a fixed value and gradients do not flow through V(next_state).
    state_value_target = (reward + gamma * critic.forward(next_state)).detach()
    state_value_prediction = critic.forward(state)
    td_error = state_value_target - state_value_prediction

    # Update policy: the td-error is detached so it acts as a constant
    # coefficient of the policy-gradient term instead of feeding critic grads.
    optimizer.zero_grad()
    entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay ** episode_number
    loss = -(log_prob + entropy_coeff * entropy) * td_error.detach()
    # Critic regression towards the (fixed) bootstrapped target.
    loss += F.mse_loss(state_value_prediction, state_value_target)
    loss.backward()
    optimizer.step()
def train_steps(self, optimizer: Optimizer,
                triplet_dataset: FederatedTripletsDataset) -> TrainStepResults:
    """Runs one pass of triplet-loss training over the whole dataset.

    Args:
        optimizer: optimizer over the embedding model's parameters.
        triplet_dataset: dataset yielding anchor/positive/negative triplets.

    Returns:
        TrainStepResults with the mean batch loss (0.0 when the dataset is
        empty) and the number of local steps performed.
    """
    losses: List[float] = []
    local_step: int = 0
    triplet_loader = DataLoader(triplet_dataset,
                                batch_size=self.settings.batch_size,
                                shuffle=True)
    for triplets in triplet_loader:
        # Embed the triplet and calculate the triplet loss
        triplet_loss = self.loss_fn(
            anchor=self.model(triplets["anchor"].cuda()),
            positive=self.model(triplets["positive"].cuda()),
            negative=self.model(triplets["negative"].cuda()),
        ).cuda()

        # Backward pass
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

        self.global_step += 1
        local_step += 1
        losses.append(triplet_loss.item())

    # BUG FIX: guard against an empty dataset -- the original divided by zero.
    loss_mean = sum(losses) / len(losses) if losses else 0.0
    return TrainStepResults(loss_mean, local_step)
def train_epoch(model: nn.Module, loader: DataLoader, optimizer: Optimizer,
                epoch: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Trains the model for one epoch, sampling data-matrix traces.

    Roughly every 10% of the epoch, logs progress and records the batch index
    plus the batch's data-matrix traces.

    Returns:
        (steps, traces): tensor of sampled 1-based batch indices, and the
        per-sample trace tensors stacked along dim=1.
    """
    # BUG FIX: max(1, ...) prevents a modulo-by-zero when len(loader) < 10.
    log_interval = max(1, len(loader) // 10)
    device = next(model.parameters()).device
    model.train()
    steps = []
    traces = []
    for batch_idx, (data, target) in enumerate(loader, start=1):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        if batch_idx % log_interval == 0:
            # BUG FIX: the original referenced an undefined `train_loader`
            # instead of the `loader` parameter (NameError at runtime).
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch,
                batch_idx * len(data),
                len(loader.dataset),
                100.0 * batch_idx / len(loader),
                loss.item(),
            ))
            steps.append(batch_idx)
            batch_traces = batch_data_matrix_trace(model, data)
            traces.append(batch_traces)
        optimizer.step()
    steps = torch.tensor(steps)
    traces = torch.stack(traces, dim=1)
    return steps, traces
def load(self, save_path, model: torch.nn.Module = None, optimizer: Optimizer = None):
    """Restores a saved checkpoint from disk.

    Args:
        save_path: Path to saved model (.pth). If a directory is provided
            instead, model-best.pth inside it is used.
        model: Torch model to restore weights to (skipped when None).
        optimizer: Optimizer whose state is restored (skipped when None).

    Returns:
        The training step recorded in the checkpoint, or 0 when absent.
    """
    if os.path.isdir(save_path):
        save_path = os.path.join(save_path, 'model-best.pth')
    checkpoint = torch.load(save_path)

    step = checkpoint['step'] if 'step' in checkpoint else 0

    if model is not None and 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    self._logger.info('Loaded models from {}'.format(save_path))
    return step
def optimizer_step(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: Optimizer,
    optimizer_idx: int,
    optimizer_closure: typing.Optional[typing.Callable] = None,
    on_tpu: bool = False,
    using_native_amp: bool = False,
    using_lbfgs: bool = False,
) -> None:
    """Applies the warm-up/decay LR schedule by hand, then steps the optimizer.

    The schedule is applied here because LARSWrapper is not an Optimizer
    subclass, so a regular LR scheduler cannot be attached to it; the inner
    optimizer lives at ``optimizer.optim`` when wrapped.
    """
    new_learning_rate = self._get_latest_lr()
    param_groups = (optimizer.optim.param_groups
                    if self.lars_wrapper else optimizer.param_groups)
    for group in param_groups:
        group["lr"] = new_learning_rate

    if self.trainer.amp_backend == AMPType.APEX:
        # Apex AMP: evaluate the closure first, then step without it.
        optimizer_closure()
        optimizer.step()
    else:
        optimizer.step(closure=optimizer_closure)
def unfreeze_and_add_param_group(
    modules: Union[Module, Iterable[Union[Module, Iterable]]],
    optimizer: Optimizer,
    lr: Optional[float] = None,
    initial_denom_lr: float = 10.0,
    train_bn: bool = True,
) -> None:
    """Unfreezes a module and adds its parameters to an optimizer.

    Args:
        modules: A module or iterable of modules to unfreeze. Their parameters
            will be added to an optimizer as a new param group.
        optimizer: The provided optimizer will receive new parameters and will
            add them to `add_param_group`
        lr: Learning rate for the new param group.
        initial_denom_lr: If no lr is provided, the learning from the first
            param group will be used and divided by `initial_denom_lr`.
        train_bn: Whether to train the BatchNormalization layers.
    """
    BaseFinetuning.make_trainable(modules)
    # Without an explicit lr, reuse the first group's lr scaled down by the
    # denominator; with one, take it verbatim.
    if lr is None:
        params_lr = optimizer.param_groups[0]["lr"]
        denom_lr = initial_denom_lr
    else:
        params_lr = float(lr)
        denom_lr = 1.0
    trainable_params = BaseFinetuning.filter_params(modules, train_bn=train_bn,
                                                    requires_grad=True)
    trainable_params = BaseFinetuning.filter_on_optimizer(optimizer, trainable_params)
    if trainable_params:
        optimizer.add_param_group({
            "params": trainable_params,
            "lr": params_lr / denom_lr
        })
def unfreeze_and_add_param_group(module: torch.nn.Module,
                                 optimizer: Optimizer,
                                 lr: Optional[float] = None,
                                 unfreeze_end: Optional[str] = None,
                                 unfreeze_start: Optional[str] = None,
                                 train_bn: bool = True):
    """Unfreezes a module and adds its parameters to an optimizer.

    When ``unfreeze_start`` / ``unfreeze_end`` are given, only the slice of
    named child modules between those names is unfrozen and added.

    Args:
        module: module whose (sub)modules are unfrozen.
        optimizer: optimizer that receives the new param group.
        lr: learning rate for the new group; defaults to the lr of the
            optimizer's first param group.
        unfreeze_start: name of the child module AFTER which unfreezing starts.
            NOTE(review): the append happens before the name check, so the
            module named ``unfreeze_start`` itself is NOT included — confirm
            this exclusive-start behaviour is intended.
        unfreeze_end: name of the last child module to include (inclusive).
        train_bn: whether BatchNorm layers are included as trainable params.
    """
    if (unfreeze_start is not None) or (unfreeze_end is not None):
        unfreeze_modules = []
        unfreeze_flag = True if unfreeze_start is None else False
        # the reason for [1:] is because the named_modules return the full model
        # as an unnamed nn.Sequential module as 1st member
        # https://discuss.pytorch.org/t/module-children-vs-module-modules/4551/4
        for name, _module in named_child_modules(module):
            if unfreeze_flag:
                unfreeze_modules.append(_module)
            if unfreeze_start is not None and name == unfreeze_start:
                unfreeze_flag = True
            if unfreeze_end is not None and unfreeze_end == name:
                break
        # Re-wrap the selected slice so the helpers below see a single module.
        module = torch.nn.Sequential(*unfreeze_modules)
    _make_trainable(module)
    params_lr = optimizer.param_groups[0]['lr'] if lr is None else float(lr)
    optimizer.add_param_group({
        'params': filter_params(module=module, train_bn=train_bn),
        'lr': params_lr
    })
def optimizer_step(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: Optimizer,
    optimizer_idx: int,
    optimizer_closure: typing.Optional[typing.Callable] = None,
    on_tpu: bool = False,
    using_native_amp: bool = False,
    using_lbfgs: bool = False,
) -> None:
    """Applies the manual LR schedule per param group, then steps every N batches.

    The warm-up/decay schedule lives here because LARSWrapper is not an
    Optimizer subclass. The "predictor" param group keeps its fixed base
    learning rate; all other groups follow the precomputed schedule. Stepping
    only happens on accumulation boundaries to emulate gradient accumulation.
    """
    scheduled_lr = self.lr_schedule[self.trainer.global_step]
    if self.lars_wrapper:
        # The real optimizer is wrapped; reach through to its param groups.
        for group in optimizer.optim.param_groups:
            group["lr"] = scheduled_lr
    else:
        for group in optimizer.param_groups:
            # The predictor head keeps its base LR; everything else is scheduled.
            group["lr"] = (self.learning_rate
                           if group["name"] == "predictor" else scheduled_lr)

    # Only step once per `accumulate_grad_batches_custom` batches.
    if ((batch_idx + 1) % self.accumulate_grad_batches_custom) == 0:
        if self.trainer.amp_backend == AMPType.APEX:
            optimizer_closure()
            optimizer.step()
        else:
            optimizer.step(closure=optimizer_closure)
def train_loop(data_loader: DataLoader, model: nn.Module, optimizer: Optimizer,
               device: torch.device) -> List[float]:
    """Train loop.

    Iterates the model over input batches, computes the loss and runs
    back-propagation.

    :param data_loader: Pytorch DataLoader containing word2vec model inputs.
    :param model: Word2Vec pytorch model.
    :param optimizer: Pytorch Optimizer.
    :param device: Device to move the batch tensors to.
    :return: List of loss of each training step.
    """
    step_losses = []
    model.train()
    for _, batch in enumerate(data_loader):
        center_id = batch["center_id"].to(device, dtype=torch.long)
        context_id = batch["context_id"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(center_id=center_id)
        # `loss_fn` is a module-level loss defined elsewhere in this file.
        loss = loss_fn(outputs, context_id)
        step_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    return step_losses
def optimizer_step(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: Optimizer,
    optimizer_idx: int,
    optimizer_closure: Optional[Callable] = None,
    on_tpu: bool = False,
    using_native_amp: bool = False,
    using_lbfgs: bool = False,
) -> None:
    """Applies the manual warm-up/decay LR schedule, logs it, then steps.

    The schedule is applied by hand because LARSWrapper is not an Optimizer
    subclass, so LR schedulers (and the LearningRateLogger callback) cannot be
    attached to it directly.
    """
    scheduled_lr = self.lr_schedule[self.trainer.global_step]
    target = optimizer.optim if self.lars_wrapper else optimizer
    for group in target.param_groups:
        group["lr"] = scheduled_lr

    # log LR (LearningRateLogger callback doesn't work with LARSWrapper)
    self.logger.log_metrics({"learning_rate": scheduled_lr},
                            self.current_epoch * batch_idx)

    # Mirrors lightning's handling of the different AMP backends.
    if self.trainer.amp_backend == AMPType.NATIVE:
        optimizer_closure()
        self.trainer.scaler.step(optimizer)
    elif self.trainer.amp_backend == AMPType.APEX:
        optimizer_closure()
        optimizer.step()
    else:
        optimizer.step(closure=optimizer_closure)
def train_step(self, x: torch.Tensor, y: torch.Tensor, support_set, optimizer: Optimizer):
    """Runs one optimization step on a single batch.

    Computes the loss (and predictions) via ``compute_loss``, backpropagates,
    and applies the optimizer update.

    Returns:
        (loss, predictions) exactly as produced by ``compute_loss``.
    """
    optimizer.zero_grad()
    loss, predictions = self.compute_loss(x, y, support_set)
    loss.backward()
    optimizer.step()
    return loss, predictions
def fnTrain(
    loader: DataLoader,
    device: str,
    model: nn.Module,
    optimizer: Optimizer,
    fnLoss,
    scaler: GradScaler,
) -> float:
    """Trains for one epoch under mixed precision.

    Returns:
        The mean batch loss over the epoch.
    """
    totalLoss = 0.0
    for _, (data, targets) in enumerate(loader):
        data = data.to(device=device)
        # BCE-style targets: float with a trailing channel dimension.
        targets = targets.float().unsqueeze(1).to(device=device)

        # Forward pass under autocast for mixed-precision speed/memory wins.
        with torch.cuda.amp.autocast():
            predictions = model(data)
            loss = fnLoss(predictions, targets)

        # Scaled backward + step guards against fp16 gradient underflow.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # print(f"batch {idxBatch+ 1} loss {loss.item()}")
        totalLoss += loss.item()
    return totalLoss / len(loader)
def _train_segmenter_epoch(
        model: torch.nn.Module, optimizer: Optimizer,
        train_dataloader: DataLoader,
        val_dataloader: DataLoader) -> Tuple[List[Any], List[Any]]:
    """Runs one training epoch plus a validation pass for the segmenter.

    Returns:
        (train_losses, val_losses): per-batch binary cross-entropy values.
    """
    train_losses, val_losses = [], []

    model.train()
    for inputs, labels in tqdm(train_dataloader):
        optimizer.zero_grad()
        predictions = model(inputs)
        # Labels get a channel dimension to match the prediction shape.
        batch_loss = F.binary_cross_entropy(predictions, labels.unsqueeze(1))
        batch_loss.backward()
        optimizer.step()
        train_losses.append(batch_loss.item())

    with torch.no_grad():
        model.eval()
        for val_inputs, val_labels in tqdm(val_dataloader):
            val_predictions = model(val_inputs)
            val_batch_loss = F.binary_cross_entropy(val_predictions,
                                                    val_labels.unsqueeze(1))
            val_losses.append(val_batch_loss.item())

    print(f'Train loss: {np.mean(t_losses)}, Val loss: {np.mean(v_losses)}')
    return train_losses, val_losses
def after_loss_fn_new(_input: torch.Tensor, _label: torch.Tensor,
                      _output: torch.Tensor, loss: torch.Tensor,
                      optimizer: Optimizer,
                      loss_fn: Callable[..., torch.Tensor] = None,
                      amp: bool = False,
                      scaler: torch.cuda.amp.GradScaler = None,
                      **kwargs):
    """Adversarial-training hook run after the normal loss computation.

    For each PGD iteration it applies the pending optimizer step (through the
    GradScaler when AMP is on), crafts an adversarial example against the
    updated model, recomputes the loss on it, chains to the previously
    registered hook (if any), and backpropagates the adversarial loss so the
    next iteration's step has fresh gradients.

    NOTE(review): this is a closure — ``after_loss_fn_old``,
    ``adv_train_epsilon`` and ``self`` (providing ``pgd``, ``adv_loss``,
    ``eval``/``train``) are captured from the enclosing scope, not passed as
    arguments; it cannot be called standalone.
    """
    noise = torch.zeros_like(_input)
    adv_loss_fn = functools.partial(self.adv_loss, _label=_label)
    for m in range(self.pgd.iteration):
        # Apply the gradients accumulated so far before crafting the next
        # adversarial example (step-then-attack, once per PGD iteration).
        if amp:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        self.eval()  # attack in eval mode (e.g. fixed dropout / batchnorm)
        adv_x, _ = self.pgd.optimize(_input=_input, noise=noise,
                                     loss_fn=adv_loss_fn, iteration=1,
                                     epsilon=adv_train_epsilon)
        self.train()
        loss = loss_fn(adv_x, _label)
        # Chain to the pre-existing hook so its behavior is preserved.
        if callable(after_loss_fn_old):
            after_loss_fn_old(_input=_input, _label=_label, _output=_output,
                              loss=loss, optimizer=optimizer, loss_fn=loss_fn,
                              amp=amp, scaler=scaler, **kwargs)
        if amp:
            scaler.scale(loss).backward()
        else:
            loss.backward()
def train(loader: DataLoader, network: nn.Module, optimizer: Optimizer,
          epoch: int, log_interval: int, state: TrainingState) -> None:
    """Trains the network for one epoch, logging progress and updating state.

    Args:
        loader: training data loader.
        network: model to optimize.
        optimizer: optimizer over the network parameters.
        epoch: 1-based epoch number (used for logging and the sample counter).
        log_interval: print progress every this-many batches.
        state: training-state tracker; updated after every batch with the
            current loss and the cumulative number of samples seen.
    """
    network.train()
    for batch_idx, (data, target) in enumerate(loader):
        # manually set all gradients to zero
        optimizer.zero_grad()
        # produce the network's output (forward pass)
        output = network(data)
        # compute negative log-likelihood loss between
        # the output and the ground truth label
        loss = F.nll_loss(output, target)
        # collect a new set of gradients and
        # backprogpagate to network parameters
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(loader.dataset),
                100.0 * batch_idx / len(loader), loss.item()))
        # BUG FIX: the sample counter previously hard-coded a batch size of 64;
        # derive it from the loader (falling back to the current batch size).
        batch_size = loader.batch_size or len(data)
        count = (batch_idx * batch_size) + ((epoch - 1) * len(loader.dataset))
        state.update(network, optimizer, loss.item(), count)
def train_batch(policy: SimplePolicyContinuous, states: List[torch.Tensor], actions: List[torch.Tensor],
                discounted_rewards: List[torch.Tensor], optimizer: Optimizer, episode_number: int,
                run_params: RunParams):
    """Trains the policy with the policy-gradient method on one mini-batch of transitions.

    Entropy is also taken into account: each new episode diminishes its
    importance by ``run_params.entropy_decay``, so the agent explores at the
    beginning and tends to explore less and less over time.
    """
    optimizer.zero_grad()

    policy_losses = []
    for state, action, discounted_reward in zip(states, actions, discounted_rewards):
        state = state.float().unsqueeze(0)
        # Build the action distribution from the policy output.
        if run_params.continuous_actions:
            mu, sigma = policy.forward(state)
            distribution = Normal(mu, sigma)
        else:
            probs = policy.forward(state)
            distribution = Categorical(probs)
        # BUG FIX: honor run_params.entropy_coeff / entropy_decay instead of a
        # hard-coded 0.99 decay, matching the docstring and the sibling
        # train_policy() in this file.
        entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay ** episode_number
        policy_losses.append(
            -(distribution.log_prob(action) + entropy_coeff * distribution.entropy()) * discounted_reward)

    total_policy_loss = torch.cat(policy_losses).sum()
    total_policy_loss.backward()
    optimizer.step()
def train_policy(optimizer: Optimizer, training_info: TrainingInfo, run_params: RunParams):
    """Trains the policy with the policy-gradient method on the latest episode's
    discounted rewards.

    An entropy bonus (whose weight decays by ``run_params.entropy_decay`` each
    episode) encourages early exploration. The agent trains once over all
    transitions of the episode instead of iterating over mini-batches.
    """
    training_info.compute_discounted_rewards()

    # Entropy weight shrinks as episodes progress (loop-invariant).
    entropy_coeff = (run_params.entropy_coeff *
                     run_params.entropy_decay ** training_info.episode_number)

    # One loss term per time step: maximize log-prob (plus entropy bonus)
    # weighted by the discounted return.
    policy_losses = [
        -(log_prob + entropy_coeff * entropy) * discounted_reward
        for log_prob, discounted_reward, entropy in zip(
            training_info.log_probs,
            training_info.discounted_rewards,
            training_info.entropies)
    ]

    # Optimize the policy on the summed loss.
    optimizer.zero_grad()
    torch.cat(policy_losses).sum().backward()
    optimizer.step()

    # Clear the per-episode buffers.
    training_info.reset()
def __init__(
    self,
    params,
    lr: float = 1e-2,
    momentum: float = 0.9,
    weight_decay: float = 0,
    eps: float = 1e-6,
):
    """Validates hyperparameters and initializes the optimizer state.

    Args:
        params: iterable of parameters (or param groups) to optimize.
        lr: positive learning rate.
        momentum: momentum factor in the half-open range [0, 1).
        weight_decay: non-negative weight-decay coefficient.
        eps: non-negative numerical-stability term.

    Raises:
        ValueError: if any hyperparameter is out of its valid range.
    """
    if momentum < 0 or momentum >= 1:
        # BUG FIX: the check rejects momentum == 1, so the message now states
        # the half-open range instead of the closed "[0,1]".
        raise ValueError(f"Momentum {momentum} must be in the range [0, 1)")
    if lr <= 0:
        raise ValueError(f"Learning rate {lr} must be positive")
    if weight_decay < 0:
        raise ValueError(
            f"Weight decay {weight_decay} must be non-negative")
    if eps < 0:
        # BUG FIX: include the offending value, consistent with the other
        # validation messages above.
        raise ValueError(f"Eps {eps} must be non-negative")
    defaults = {
        "lr": lr,
        "eps": eps,
        "momentum": momentum,
        "weight_decay": weight_decay,
        "k": 0,  # step counter carried through the param-group defaults
    }
    self.momentum = momentum
    PT_Optimizer.__init__(self, params, defaults)
    self.initialize_state()
def NOLAGattack(self, X_nat, y, delta, Optimizer):
    """Runs one NOLAG attack step for a batch; it gets v and delta for the batch as inputs.

    Crafts a perturbed batch from the natural inputs and the current
    perturbation ``delta``, backpropagates the model loss through it, and
    returns an updated, eps-clamped perturbation.

    Args:
        X_nat: batch of natural (clean) inputs.
        y: ground-truth labels for the batch.
        delta: current adversarial perturbation; must carry gradients
            (``delta.grad`` is read after the backward pass).
        Optimizer: optimizer whose gradients are cleared before the backward
            pass (note: the name shadows the usual class; only zero_grad is used).

    Returns:
        (detached perturbed batch, applied perturbation, updated delta on CPU).
    """
    if self.dataname in ['MNIST', 'FashionMNIST']:
        # Random start inside the eps-ball, add delta, then project back into
        # the L-inf ball around X_nat.
        # NOTE(review): unlike the other branch, this one does not clamp the
        # result to the valid [0, 1] image range — confirm intentional.
        rand_i = torch.from_numpy(
            np.random.uniform(low=-self.eps, high=self.eps,
                              size=X_nat.size())).to(device)
        rand_i = rand_i.float()
        pert = X_nat + rand_i
        pert = pert + delta
        pert = torch.where(pert > X_nat + self.eps, X_nat + self.eps, pert)
        pert = torch.where(pert < X_nat - self.eps, X_nat - self.eps, pert)
    else:
        pert = X_nat + delta
        pert = torch.clamp(pert, 0, 1)
    randpert = pert - X_nat  # NOTE(review): unused — looks like dead code; confirm
    Optimizer.zero_grad()
    outputs = self.model(pert)
    loss = self.criterion(outputs, y)
    loss.backward()
    grad = delta.grad.data.clone().detach()
    ########## Update Delta ############
    # Gradient-ascent step on delta, then project into the eps-ball.
    new_delta = (delta + self.stepsize * (grad)).detach().cpu()
    new_delta = torch.clamp(new_delta, -self.eps, self.eps)
    return pert.detach(), pert - X_nat.detach(), new_delta
def train(args, model: SentimentAnalysisModel, train_loader: DataLoader,
          optimizer: Optimizer, epoch: int, device_: device):
    """Trains the sentiment model for one epoch, optionally under DP-SGD.

    When ``args.eps_threshold`` is set, the optimizer is assumed to be wrapped
    by a privacy engine (Opacus-style): virtual steps accumulate per-sample
    gradients until a full virtual batch is reached, and training is flagged to
    stop (via the module-level ``eps_threshold_hit``) once the spent privacy
    budget epsilon reaches the threshold.

    Args:
        args: run configuration; only ``eps_threshold`` is read here.
        model: sequence-classification model taking encoder/decoder inputs.
        train_loader: batched training data.
        optimizer: plain optimizer, or a DP optimizer exposing
            ``virtual_step`` and ``privacy_engine``.
        epoch: current epoch number (for logging only).
        device_: device to run on.
    """
    global eps_threshold_hit
    model = model.train().to(device_)
    criterion = nn.CrossEntropyLoss()
    losses = []
    accuracies = []
    # Number of real batches that make up one "virtual" (logical) DP batch.
    virtual_batch_rate = VIRTUAL_BATCH_SIZE / BATCH_SIZE
    for idx, batch in enumerate(tqdm(train_loader)):
        ids = batch['input_ids'].to(device_, dtype=torch.long)
        mask = batch['attention_mask'].to(device_, dtype=torch.long)
        # token_type_ids = batch['token_type_ids'].to(device_, dtype = torch.long)
        targets = batch['label'].to(device_, dtype=torch.long)
        decoder_input_ids = batch['decoder_input_ids'].to(device_,
                                                          dtype=torch.long)

        optimizer.zero_grad()
        predictions = model(input_ids=ids, attention_mask=mask,
                            decoder_input_ids=decoder_input_ids)
        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)
        loss.backward()

        if args.eps_threshold is not None:
            # do virtual stepping to improve performance: take a real optimizer
            # step only on virtual-batch boundaries (or the final batch).
            if (idx + 1
                ) % virtual_batch_rate == 0 or idx == len(train_loader) - 1:
                optimizer.step()
                optimizer.zero_grad()
            else:
                optimizer.virtual_step()
        else:
            optimizer.step()

        losses.append(loss.item())
        accuracies.append(acc.item())

    if args.eps_threshold is not None:
        epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent()
        # NOTE(review): delta is printed as a hard-coded 1e-06 — confirm it
        # matches the delta the privacy engine was configured with.
        print(f"Train Epoch: {epoch} \t"
              f"Train Loss: {np.mean(losses):.6f} "
              f"Train Accuracy: {np.mean(accuracies):.6f} "
              f"(ε = {epsilon:.2f}, δ = {1e-06}) for α = {best_alpha}")
        # stop training if eps >= eps_threshold
        eps_threshold_hit = epsilon >= args.eps_threshold
        if eps_threshold_hit:
            print('Hit epsilon threshold, stopping training.')
    else:
        print(
            f'Train epoch: {epoch} \t Avg Loss: {np.mean(losses)} \t Avg Accuracy: {np.mean(accuracies)}'
        )
def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
    """Zeroes the gradients; on the epoch's final batch, first dumps a
    gradient-flow figure to TensorBoard (gradients are still populated here,
    right before being cleared).
    """
    is_last_batch = batch_idx == len(self.trainer.train_dataloader) - 1
    if is_last_batch:
        fig = plot_grad_flow(self.named_parameters())
        self.logger.experiment.add_figure("Gradient Flow", fig, self.current_epoch)
    optimizer.zero_grad()
def train_step(dataloader: DataLoader, netD: nn.Module, netG: nn.Module,
               optimizerD: Optimizer, optimizerG: Optimizer,
               generator_criterion_loss, idx_epoch, num_epochs,
               num_print: int = 5) -> dict:
    """Runs one SRGAN-style training epoch.

    Alternates discriminator and generator updates over the loader.

    Returns:
        Dict with epoch-averaged 'd_loss', 'g_loss', 'd_score' (D(real)) and
        'g_score' (D(G(z))), weighted by batch size.
    """
    netG.train()
    netD.train()
    results = {'d_loss': 0, 'g_loss': 0, 'd_score': 0, 'g_score': 0}
    batch_sizes = 0
    num_samples = len(dataloader)
    step_ = int(math.ceil(num_samples / num_print))
    t1 = time.time()
    for idx_train, data_train in enumerate(dataloader):
        # (0) get lr/hr data
        data_lr, data_hr_target = data_train['lr'], data_train['hr']
        batch_size = data_lr.size(0)
        batch_sizes += batch_size

        # (1) Update D network: maximize D(x)-1-D(G(z))
        z = x_preprocess(data_lr, to_device=to_device)
        real_img = x_preprocess(data_hr_target, to_device=to_device)
        fake_img = netG(z)

        # BUG FIX: discriminator gradients were never cleared (the zero_grad
        # call was commented out), so they accumulated across iterations.
        netD.zero_grad()
        real_out = netD(real_img).mean()
        fake_out = netD(fake_img).mean()
        d_loss = 1 - real_out + fake_out
        d_loss.backward(retain_graph=True)
        optimizerD.step()

        # (2) Update G network: minimize 1-D(G(z)) + Perception Loss + Image Loss + TV Loss
        netG.zero_grad()
        g_loss = generator_criterion_loss(fake_out, fake_img, real_img)
        g_loss.backward()
        optimizerG.step()

        # (3) Re-evaluate the post-update generator for logging/statistics
        fake_img = netG(z)
        fake_out = netD(fake_img).mean()
        g_loss = generator_criterion_loss(fake_out, fake_img, real_img)
        results['g_loss'] += float(g_loss) * batch_size
        d_loss = 1 - real_out + fake_out
        results['d_loss'] += float(d_loss) * batch_size
        results['d_score'] += float(real_out) * batch_size
        results['g_score'] += float(fake_out) * batch_size

        if (idx_train % step_) == 0:
            str_desc = ' * Loss_D: {:0.4f} Loss_G: {:0.4f} D(x): {:0.4f} D(G(z)): {:0.4f}'\
                .format(results['d_loss'], results['g_loss'],
                        results['d_score'], results['g_score'])
            print('(TRN) [{}/{}] [{}/{}] -> {}'.format(idx_epoch, num_epochs,
                                                       idx_train, num_samples,
                                                       str_desc))
    dt = time.time() - t1
    results = {k: v / batch_sizes for k, v in results.items()}
    tmp_ = ', '.join(['{}: {:0.2f}'.format(k, v) for k, v in results.items()])
    print(' (TRAIN) ({}/{}) dt ~{:0.2f} (s), {}'.format(
        idx_epoch, num_epochs, dt, tmp_))
    return results
def run_epoch(model: torch.nn.Module, loader: DataLoader, criterion: nn.modules.loss._Loss,
              gt_former: GroundTruthFormer, epoch: int, mode: str = 'train',
              writer: SummaryWriter = None, optimizer: Optimizer = None,
              n_dumps_per_epoch: int = 10, train_loader_size: int = None,
              device: Union[torch.device, str] = torch.device('cpu')) -> Optional[Tuple[float, float]]:
    """
    Run one epoch for model. Can be used for both training and validation.
    :param model: pytorch model to be trained or validated
    :param loader: data loader to run model on batches
    :param criterion: callable class to calculate loss
    :param gt_former: callable class to form ground truth data to compute loss
    :param epoch: number of current epoch
    :param mode: `train` or `val', controls model parameters update need
    :param writer: tensorboard writer
    :param optimizer: pytorch model parameters optimizer; required in 'train' mode
    :param n_dumps_per_epoch: how many times per epoch to dump images to tensorboard (not implemented yet)
    :param train_loader_size: number of objects in the train loader, needed for plots scaling in val mode
    :param device: device to be used for model related computations
    :return: values for cumulative loss and score (only in 'val' mode)
    """
    if mode == 'train':
        model.train()
    elif mode == 'val':
        model.eval()
        cumulative_loss, cumulative_score = 0, 0
    else:
        raise ValueError(f'Unknown mode: {mode}')
    # NOTE(review): in 'val' mode the forward pass still builds the autograd
    # graph (no torch.no_grad() wrapper) — confirm whether that is intended.
    for i, (frames, bboxes) in enumerate(tqdm(loader, desc="Batch", leave=False)):
        frames = frames.to(device)
        bboxes = [bbox.to(device) for bbox in bboxes]
        preds = model(frames)
        gt_data = gt_former.form_gt(bboxes)
        loss = criterion(preds, gt_data)
        score = pr_auc(gt_data[0], preds[0])
        if mode == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if writer is not None:
                # Per-batch logging in train mode.
                writer.add_scalar('Loss', loss.item(), epoch * len(loader) + i)
                writer.add_scalar('Score', score, epoch * len(loader) + i)
        else:
            # Validation accumulates; logged once per epoch below.
            cumulative_loss += loss.item()
            cumulative_score += score
    if mode == 'val':
        if train_loader_size is not None:
            # scales val data to train data on the plots
            iterations = epoch * train_loader_size + loader.batch_size
        else:
            iterations = epoch * len(loader) + loader.batch_size
        cumulative_loss /= len(loader)
        cumulative_score /= len(loader)
        if writer is not None:
            writer.add_scalar('Loss', cumulative_loss, iterations)
            writer.add_scalar('Score', cumulative_score, iterations)
        return cumulative_loss, cumulative_score
def _update_params(self, docs: Sequence[Doc], optimizer: Optimizer, verbose: bool = False):
    """Backpropagates the accumulated loss over *docs* and steps the optimizer.

    When *verbose* is set, the scalar loss value is logged at INFO level.
    """
    total_loss = get_loss_from_docs(docs)
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    if verbose:
        logger.info(f"Loss: {total_loss.detach().item()}")
def step(self, optimizer: Optimizer):
    """Performs one optimizer step, then clears the gradients.

    In distributed mode, gradients are first averaged across workers.
    Under fp16, the step goes through the GradScaler.
    """
    if self.is_distributed:
        # TODO: Maybe we dont need to average every step ?
        self.average_gradients(self._model)
    if self.fp16:
        self._scaler.step(optimizer)
        self._scaler.update()
    else:
        optimizer.step()
    optimizer.zero_grad()
def train_one_epoch(
    model: Module,
    optimizer: Optimizer,
    data_loader: DataLoader,
    device: device,
    epoch: int,
    print_freq: int,
) -> MetricLogger:
    """Trains Faster R-CNN for one epoch on the data loader.

    Parameters
    ----------
    model : Module
        Model to train.
    optimizer : Optimizer
        Selected optimizer which updates weights of the model
    data_loader : DataLoader
        Train data.
    device : device
        Device on which is the model.
    epoch : int
        The number of the training epoch.
    print_freq : int
        The printing frequency during the training.

    Returns
    -------
    MetricLogger:
        Statistics about the training epoch.
    """
    model.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = [image.to(device) for image in images]
        targets = [{key: value.to(device) for key, value in target.items()}
                   for target in targets]

        # In train mode the model returns a dict of losses; optimize their sum.
        loss_dict = model(images, targets)
        total_loss = sum(loss_dict.values())

        # Cross-process-reduced losses are used only for logging.
        loss_dict_reduced = reduce_dict(loss_dict)
        total_loss_reduced = sum(loss_dict_reduced.values())

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        metric_logger.update(loss=total_loss_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    return metric_logger
def train_function(
    config: Any,
    engine: Engine,
    batch: Any,
    model: torch.nn.Module,
    loss_fn: torch.nn.Module,
    optimizer: Optimizer,
    device: torch.device,
):
    """Model training step.

    Parameters
    ----------
    config
        config object
    engine
        Engine instance
    batch
        batch in current iteration
    model
        nn.Module model
    loss_fn
        nn.Module loss
    optimizer
        torch optimizer
    device
        device to use for training

    Returns
    -------
    float
        scalar training loss of this batch (also stored in engine.state.metrics)
    """
    model.train()
    samples = batch[0].to(device, non_blocking=True)
    targets = batch[1].to(device, non_blocking=True)
    # Forward under autocast when AMP is enabled.
    # NOTE(review): no GradScaler is used — with use_amp on, fp16 gradients can
    # underflow; confirm loss scaling is handled elsewhere.
    with autocast(enabled=config.use_amp):
        outputs = model(samples)
        loss = loss_fn(outputs, targets)
    loss.backward()
    # Custom events let handlers hook in right after backward / step.
    engine.state.backward_completed += 1
    engine.fire_event(TrainEvents.BACKWARD_COMPLETED)
    optimizer.step()
    engine.state.optim_step_completed += 1
    engine.fire_event(TrainEvents.OPTIM_STEP_COMPLETED)
    optimizer.zero_grad()
    loss_value = loss.item()
    engine.state.metrics = {"epoch": engine.state.epoch, "train_loss": loss_value}
    return loss_value
def train(train_loader: DataLoader, model: nn.Module, criterion: nn.Module,
          optimizer: Optimizer, epoch: int, world_size: int):
    """Trains for one epoch, averaging logged metrics across processes.

    Args:
        train_loader: training data loader.
        model: model to train (assumed already on GPU).
        criterion: classification loss.
        optimizer: optimizer over the model parameters.
        epoch: current epoch number (unused here beyond the interface).
        world_size: number of distributed processes, used to average metrics.

    Returns:
        (avg loss, avg top-1 accuracy, avg top-5 accuracy, total batch time).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Create non_blocking tensors for distributed training
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        logits = model(input)
        loss = criterion(logits, target)

        # compute gradients in a backward pass
        optimizer.zero_grad()
        loss.backward()

        # Call step of optimizer to update model params
        optimizer.step()

        # Measure accuracy
        prec1, prec5 = accuracy(logits.data, target.data, topk=(1, 5))

        # Average loss and accuracy across processes for logging
        reduced_loss = reduce_tensor(loss.data, world_size)
        prec1 = reduce_tensor(prec1, world_size)
        prec5 = reduce_tensor(prec5, world_size)

        # BUG FIX: `input[0].size(0)` is the first sample's channel count, not
        # the batch size; use the batch dimension of the full tensor instead.
        batch_size = input.size(0)
        # to_python_float incurs a host<->device sync
        losses.update(to_python_float(reduced_loss), batch_size)
        top1.update(to_python_float(prec1), batch_size)
        top5.update(to_python_float(prec5), batch_size)

        torch.cuda.synchronize()
        batch_time.update((time.time() - end))
        end = time.time()

    return losses.avg, top1.avg, top5.avg, batch_time.sum