def train(self, epoch):
    """Run one training epoch with the NLL criterion, then evaluate, log,
    checkpoint, and step the LR scheduler.

    Supports two tasks: "classification" (per-sample loss/accuracy) and
    "segmentation" (per-pixel accuracy, per-image loss normalization).

    Args:
        epoch (int): current epoch index, used for progress display,
            logging and the checkpoint filename.
    """
    self.model.train()
    metrics = Metrics(self.metrics, self.prod_train_len, self.num_classes)
    loss, len_steps, len_data = 0, 0, 0

    # Training loop
    loop = tqdm(self.train_loader)
    for batch_id, (data, target) in enumerate(loop):
        data, target = data.to(self.device), target.to(self.device)
        self.optimizer.zero_grad()
        output = self.model(data)
        if self.task == "classification":
            current_loss = self.criterion(output, target)
        elif self.task == "segmentation":
            # Segmentation targets carry a singleton channel dim; the
            # criterion expects (N, H, W) class-index maps.
            current_loss = self.criterion(output, target.squeeze(dim=1))
        current_loss.backward()
        # FIX: detach before accumulating. Summing the live loss tensor
        # keeps every batch's autograd graph alive for the whole epoch,
        # steadily leaking GPU memory. `loss` is only used for logging.
        loss += current_loss.detach()
        self.optimizer.step()
        if self.task == "classification":
            len_steps += len(data)
            len_data = len_steps
        elif self.task == "segmentation":
            # One "step" per pixel (for accuracy), one "data" entry per
            # image (for loss normalization).
            len_steps += len(data) * np.prod(data.shape[-2:])
            len_data += len(data)

        # Update metrics: the max softmax probability is the confidence.
        confidence, pred = F.softmax(output, dim=1).max(dim=1, keepdim=True)
        metrics.update(pred, target, confidence)

        # Update the progress bar with running averages.
        loop.set_description(f"Epoch {epoch}/{self.nb_epochs}")
        loop.set_postfix(
            OrderedDict({
                "loss_nll": f"{(loss / len_data):05.4e}",
                "acc": f"{(metrics.accuracy / len_steps):05.2%}",
            }))
        loop.update()

    # Eval on epoch end
    scores = metrics.get_scores(split="train")
    logs_dict = OrderedDict({
        "epoch": {
            "value": epoch,
            "string": f"{epoch:03}"
        },
        "lr": {
            "value": self.optimizer.param_groups[0]["lr"],
            "string": f"{self.optimizer.param_groups[0]['lr']:05.1e}",
        },
        "train/loss_nll": {
            "value": loss / len_data,
            "string": f"{(loss / len_data):05.4e}",
        },
    })
    for s in scores:
        logs_dict[s] = scores[s]

    # Val scores (a validation loader may not be configured)
    if self.val_loader is not None:
        val_losses, scores_val = self.evaluate(self.val_loader,
                                               self.prod_val_len,
                                               split="val")
        logs_dict["val/loss_nll"] = {
            "value": val_losses["loss_nll"].item() / self.nsamples_val,
            "string": f"{(val_losses['loss_nll'].item() / self.nsamples_val):05.4e}",
        }
        for sv in scores_val:
            logs_dict[sv] = scores_val[sv]

    # Test scores
    test_losses, scores_test = self.evaluate(self.test_loader,
                                             self.prod_test_len,
                                             split="test")
    logs_dict["test/loss_nll"] = {
        "value": test_losses["loss_nll"].item() / self.nsamples_test,
        "string": f"{(test_losses['loss_nll'].item() / self.nsamples_test):05.4e}",
    }
    for st in scores_test:
        logs_dict[st] = scores_test[st]

    # Print metrics
    misc.print_dict(logs_dict)

    # Save the model checkpoint
    self.save_checkpoint(epoch)

    # CSV logging
    misc.csv_writter(path=self.output_folder / "logs.csv",
                     dic=OrderedDict(logs_dict))

    # Tensorboard logging
    self.save_tb(logs_dict)

    # Scheduler step
    if self.scheduler:
        self.scheduler.step()
def train(self, epoch):
    """Run one training epoch of a BERT-style classifier with a confidence
    head (output[0] = class logits, output[1] = confidence logit), with
    optional gradient accumulation; then evaluate, log, checkpoint, and
    step the LR scheduler.

    Args:
        epoch (int): current epoch index, used for progress display,
            logging and the checkpoint filename.
    """
    self.model.train()
    self.disable_bn()
    if self.config_args["model"].get("uncertainty", None):
        self.disable_dropout()
    metrics = Metrics(self.metrics, self.prod_train_len, self.num_classes)
    loss = 0
    len_steps, len_data = 0, 0
    # Hoist loop-invariant lookups out of the batch loop.
    accum_steps = self.config_args['training']['gradient_accumulation_steps']
    n_gpu = torch.cuda.device_count()

    # Training loop
    # FIX: the original wrapped the tqdm bar in a second tqdm
    # (`tqdm(loop, desc="Iteration")`), producing two nested progress
    # bars; one bar iterated directly is enough.
    loop = tqdm(self.train_loader, desc="Iteration")
    for step, batch in enumerate(loop):
        batch = tuple(t.to(self.device) for t in batch)
        idx_ids, input_ids, input_mask, segment_ids, label_ids = batch
        output = self.model(input_ids, segment_ids, input_mask, labels=None)
        current_loss = self.criterion(output, label_ids)

        # Average across GPUs under DataParallel, then scale the loss so
        # accumulated gradients match a single full-batch update.
        if n_gpu > 1:
            current_loss = current_loss.mean()
        if accum_steps > 1:
            current_loss = current_loss / accum_steps
        current_loss.backward()
        # FIX: detach before accumulating. Summing the live loss tensor
        # keeps every batch's autograd graph alive for the whole epoch,
        # steadily leaking GPU memory. `loss` is only used for logging.
        loss += current_loss.detach()
        len_steps += len(input_ids)
        len_data = len_steps

        # Update metrics.
        # FIX: removed leftover per-batch debug prints (raw outputs,
        # predictions, labels, indices, confidences, "optimizer step")
        # and the unused detached copies they were printed from.
        pred = output[0].argmax(dim=1, keepdim=True)
        confidence = torch.sigmoid(output[1])
        metrics.update(idx_ids, pred, label_ids, confidence)

        # Only step the optimizer every `accum_steps` batches.
        if (step + 1) % accum_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()

        # Update the progress bar with running averages.
        loop.set_description(f"Epoch {epoch}/{self.nb_epochs}")
        loop.set_postfix(
            OrderedDict({
                "loss_confid": f"{(loss / len_data):05.3e}",
                "acc": f"{(metrics.accuracy / len_steps):05.2%}",
            }))

    # Eval on epoch end
    scores = metrics.get_scores(split="train")
    logs_dict = OrderedDict({
        "epoch": {
            "value": epoch,
            "string": f"{epoch:03}"
        },
        "lr": {
            "value": self.optimizer.param_groups[0]["lr"],
            "string": f"{self.optimizer.param_groups[0]['lr']:05.1e}",
        },
        "train/loss_confid": {
            "value": loss / len_data,
            "string": f"{(loss / len_data):05.4e}",
        },
    })
    for s in scores:
        logs_dict[s] = scores[s]

    # Val scores
    val_losses, scores_val = self.evaluate(self.val_loader,
                                           self.prod_val_len,
                                           split="val")
    logs_dict["val/loss_confid"] = {
        "value": val_losses["loss_confid"].item() / self.nsamples_val,
        "string": f"{(val_losses['loss_confid'].item() / self.nsamples_val):05.4e}",
    }
    for sv in scores_val:
        logs_dict[sv] = scores_val[sv]

    # Test scores
    test_losses, scores_test = self.evaluate(self.test_loader,
                                             self.prod_test_len,
                                             split="test")
    logs_dict["test/loss_confid"] = {
        "value": test_losses["loss_confid"].item() / self.nsamples_test,
        "string": f"{(test_losses['loss_confid'].item() / self.nsamples_test):05.4e}",
    }
    for st in scores_test:
        logs_dict[st] = scores_test[st]

    # Print metrics
    misc.print_dict(logs_dict)

    # Save the model checkpoint
    self.save_checkpoint(epoch)

    # CSV logging
    misc.csv_writter(path=self.output_folder / "logs.csv",
                     dic=OrderedDict(logs_dict))

    # Tensorboard logging
    self.save_tb(logs_dict)

    # Scheduler step
    if self.scheduler:
        self.scheduler.step()
def train(self, epoch):
    """Run one training epoch of a model with a confidence head
    (output[0] = class logits, output[1] = confidence logit), then
    evaluate, log, and checkpoint.

    Supports two tasks: "classification" (per-sample loss/accuracy) and
    "segmentation" (per-pixel accuracy, per-image loss normalization).

    Args:
        epoch (int): current epoch index, used for progress display,
            logging and the checkpoint filename.
    """
    self.model.train()
    metrics = Metrics(self.metrics, self.prod_train_len, self.num_classes)
    loss = 0
    len_steps, len_data = 0, 0

    # Training loop
    loop = tqdm(self.train_loader)
    for batch_id, (data, target) in enumerate(loop):
        data, target = data.to(self.device), target.to(self.device)
        self.optimizer.zero_grad()
        output = self.model(data)
        if self.task == "classification":
            current_loss = self.criterion(output, target)
        elif self.task == "segmentation":
            # Segmentation targets carry a singleton channel dim; the
            # criterion expects (N, H, W) class-index maps.
            current_loss = self.criterion(output, target.squeeze(dim=1))
        current_loss.backward()
        # FIX: detach before accumulating. Summing the live loss tensor
        # keeps every batch's autograd graph alive for the whole epoch,
        # steadily leaking GPU memory. `loss` is only used for logging.
        loss += current_loss.detach()
        self.optimizer.step()
        if self.task == "classification":
            len_steps += len(data)
            len_data = len_steps
        elif self.task == "segmentation":
            # One "step" per pixel (for accuracy), one "data" entry per
            # image (for loss normalization).
            len_steps += len(data) * np.prod(data.shape[-2:])
            len_data += len(data)

        # Update metrics: the sigmoid of the confidence logit is the
        # confidence score.
        pred = output[0].argmax(dim=1, keepdim=True)
        confidence = torch.sigmoid(output[1])
        metrics.update(pred, target, confidence)

        # Update the progress bar with running averages.
        loop.set_description(f"Epoch {epoch}/{self.nb_epochs}")
        loop.set_postfix(
            OrderedDict({
                "loss_confid": f"{(loss / len_data):05.3e}",
                "acc": f"{(metrics.accuracy / len_steps):05.2%}",
            }))
        loop.update()

    # Eval on epoch end
    scores = metrics.get_scores(split="train")
    logs_dict = OrderedDict({
        "epoch": {
            "value": epoch,
            "string": f"{epoch:03}"
        },
        "train/loss_confid": {
            "value": loss / len_data,
            "string": f"{(loss / len_data):05.4e}",
        },
    })
    for s in scores:
        logs_dict[s] = scores[s]

    # Val scores
    val_losses, scores_val, _ = self.evaluate(self.val_loader,
                                              self.prod_val_len,
                                              split="val")
    logs_dict["val/loss_confid"] = {
        "value": val_losses["loss_confid"].item() / self.nsamples_val,
        "string": f"{(val_losses['loss_confid'].item() / self.nsamples_val):05.4e}",
    }
    for sv in scores_val:
        logs_dict[sv] = scores_val[sv]

    # Test scores
    test_losses, scores_test, _ = self.evaluate(self.test_loader,
                                                self.prod_test_len,
                                                split="test")
    logs_dict["test/loss_confid"] = {
        "value": test_losses["loss_confid"].item() / self.nsamples_test,
        "string": f"{(test_losses['loss_confid'].item() / self.nsamples_test):05.4e}",
    }
    for st in scores_test:
        logs_dict[st] = scores_test[st]

    # Print metrics
    misc.print_dict(logs_dict)

    # Save the model checkpoint
    self.save_checkpoint(epoch)

    # CSV logging
    misc.csv_writter(path=self.output_folder / "logs.csv",
                     dic=OrderedDict(logs_dict))

    # Tensorboard logging
    self.save_tb(logs_dict)