class LivelossCallback(AvgStatsCallback):
    def __init__(self, metrics):
        super().__init__(metrics)
        self.liveloss = PlotLosses(skip_first=0)
        self.metricnames = [m.__name__ for m in metrics]
        self.logs = {}

    def begin_epoch(self):
        super().begin_epoch()
        self.logs = {}
        self.iteration = 0

    def after_loss(self):
        super().after_loss()
        if self.in_train:
            self.iteration += 1
            print(
                "\r[%d, %5d] Train_loss: %.3f"
                % (self.epoch + 1, self.iteration, self.loss),
                end="",
            )

    def after_epoch(self):
        super().after_epoch()
        self.logs["loss"] = self.train_stats.avg_stats[0]
        self.logs["val_loss"] = self.valid_stats.avg_stats[0]
        for i, metric in enumerate(self.metricnames):
            self.logs[metric] = self.train_stats.avg_stats[i + 1].item()
            self.logs["val_" + metric] = self.valid_stats.avg_stats[i + 1].item()
        self.liveloss.update(self.logs)
        self.liveloss.draw()
def train_vae(self, epochs=10, hidden_size=2, lr=0.0005, recon_loss_method='mse'):
    """
    Handles the training of the VAE model.

    Parameters
    ----------
    epochs : int
        Number of complete passes over the whole training set.
    hidden_size : int
        Size of the latent space of the VAE.
    lr : float
        Learning rate for the VAE model training.
    recon_loss_method : str
        Method for the reconstruction loss calculation.

    Returns
    -------
    None
    """
    set_seed(42)  # Set the random seed
    self.model = VAE(hidden_size, self.input.shape)  # Initialise model
    # Create optimizer
    optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=(0.9, 0.999))

    if self.plot_loss:
        liveloss = PlotLosses()
        liveloss.skip_first = 0
        liveloss.figsize = (16, 10)

    # Start training loop
    for epoch in range(1, epochs + 1):
        # Train model on the train dataset
        tl = train(epoch, self.model, optimizer, self.train_loader,
                   recon_loss_method=recon_loss_method)
        testl = test(epoch, self.model, self.test_loader,
                     recon_loss_method=recon_loss_method)
        if self.plot_loss:
            # Log train and test losses for the dynamic plot
            logs = {}
            logs['ELBO'] = tl
            logs['val_' + 'ELBO'] = testl
            liveloss.update(logs)
            liveloss.draw()
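# `set_seed` above (and in later snippets) is assumed to be a repo-level
# helper; a plausible minimal sketch follows. Assumption: the real version
# may also configure cuDNN determinism or dataloader-worker seeding.
import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    """Seed all common RNGs for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)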
def train(model, patch_train_loader, patch_val_loader, EPOCHS, learning_rate):
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # , weight_decay=0.99)
    liveloss = PlotLosses()
    lr2_tr_loss = []
    lr2_val_loss = []
    model_losses, valid_losses = [], []

    for epoch in range(EPOCHS):
        print("epoch{}".format(epoch))
        model_losses, valid_losses = [], []
        logs = {}
        prefix = ''

        # With train data
        model.train()
        for idx, (data, target) in enumerate(patch_train_loader):
            data = torch.autograd.Variable(data).to(device=device, dtype=torch.float)
            print(data.shape)
            optimizer.zero_grad()
            pred = model(data)
            print(pred.shape)
            loss = loss_func(pred, data)
            # Backpropagation
            loss.backward()
            # Update
            optimizer.step()
            # Save the loss
            model_losses.append(loss.cpu().data.item())
            logs[prefix + 'MSE loss'] = loss.item()
            print(idx, "complete")

        # With validation data (defect-free patches only)
        model.eval()
        for idx, (data, target) in enumerate(patch_val_loader):
            data = torch.autograd.Variable(data).to(device=device, dtype=torch.float)
            pred = model(data)
            loss = loss_func(pred, data)
            valid_losses.append(loss.item())
            prefix = 'val_'
            logs[prefix + 'MSE loss'] = loss.item()

        lr2_tr_loss.append(np.mean(model_losses))
        lr2_val_loss.append(np.mean(valid_losses))
        liveloss.update(logs)
        liveloss.draw()
        print("Epoch:", epoch + 1,
              " Training Loss: ", np.mean(model_losses),
              " Valid Loss: ", np.mean(valid_losses))

        # Save the model after every epoch, so that if it overfits we can fall
        # back to an earlier epoch's weights and use those as the AE model.
        path = os.path.join(
            "/content/drive/Shared drives/data/nocrop/model/hs/model{}".format(str(model)[11:12]),
            str(model)[:12] + '_epoch{}.pth'.format(epoch))
        torch.save(model.state_dict(), path)

        # Save the model from the last epoch (epoch 19) as the AE model.
        if epoch == EPOCHS - 1:
            path = os.path.join("/content/drive/Shared drives/data/nocrop/model/hs",
                                str(model)[:12] + '.pth')
            torch.save(model.state_dict(), path)

    return lr2_tr_loss, lr2_val_loss
def execute(model, n_epochs, trn_ldr, val_ldr, opti, crit, plot):
    '''
    This routine is responsible for the entire training process, and handles
    in-training plotting.

    Arguments:
        model    : the model to be trained // nn.Module
        n_epochs : the number of epochs the model should be trained for // integer
        trn_ldr  : the training dataloader // dataloader
        val_ldr  : the validation dataloader // dataloader
        opti     : the optimiser object // optim
        crit     : the criterion (loss) function // nn loss function
        plot     : a flag denoting whether in-training plotting should occur // boolean

    Parameters:
        liveloss : responsible for in-training plotting, activated by plot // PlotLosses() object
        epoch    : the current epoch number // integer
        logs     : holds the log data for the current epoch // dict
        trn_los  : the training loss for the current epoch // float
        trn_acc  : the training accuracy for the current epoch // float
        val_los  : the validation loss for the current epoch // float
        val_acc  : the validation accuracy for the current epoch // float

    Returns:
        model    : the final, trained model // nn.Module
    '''
    if plot:
        liveloss = PlotLosses()  # initialise liveloss if plotting flag true

    for epoch in range(n_epochs):
        logs = {}
        trn_los, trn_acc = trn(model, opti, crit, trn_ldr)  # run the training cycle
        logs['log loss'] = trn_los.item()
        logs['accuracy'] = trn_acc.item()  # update the logs

        val_los, val_acc = val(model, crit, val_ldr)  # run the validation cycle
        logs['val_log loss'] = val_los.item()
        logs['val_accuracy'] = val_acc.item()  # update the logs

        if plot:
            liveloss.update(logs)
            liveloss.draw()  # print the plots if flag is true
        if not plot:
            # if not plotting, print epoch number for tracking
            print("Epoch: " + str(epoch))

    return model  # return finished trained model
class LiveLossPlotListener(DojoListener):
    """
    DojoListener implementation which renders a livelossplot after finishing a dan.
    """

    def __init__(self):
        self.liveloss = None

    def training_started(self, aikidoka: Aikidoka, kata: Kata, kun: DojoKun):
        self.liveloss = PlotLosses()

    def dan_finished(self, aikidoka: Aikidoka, run: (int, int), metrics: (float, float)):
        (loss, acc) = metrics
        self.liveloss.update({"loss": loss, "train_acc": acc})
        self.liveloss.draw()
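# A minimal smoke test of the listener outside a real dojo. Assumption: the
# aikidoka/kata/kun arguments are only passed through by these two hooks, so
# None can stand in for them here; the real framework supplies live objects.
listener = LiveLossPlotListener()
listener.training_started(None, None, None)
for step in range(3):
    listener.dan_finished(None, (step, 3), (1.0 / (step + 1), 0.5 + 0.1 * step))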
def train_model_gener(model, criterion, optimizer, dataloaders, num_epochs=10):
    liveloss = PlotLosses()
    model = model.to(device)
    for epoch in range(num_epochs):
        logs = {}
        for phase in ['train', 'validation']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs_full, labels_class in dataloaders[phase]:
                # here are changes: predict each next token from the previous ones
                inputs = inputs_full[:, :-1].to(device)
                labels = inputs_full[:, 1:].to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.detach() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.float() / len(dataloaders[phase].dataset)

            prefix = ''
            if phase == 'validation':
                prefix = 'val_'

            logs[prefix + 'log loss'] = epoch_loss.item()
            logs[prefix + 'accuracy'] = epoch_acc.item()

        liveloss.update(logs)
        liveloss.draw()
def train(self, train_ds, valid_ds, plot_loss=True):
    # Initialize plotting
    if plot_loss:
        liveloss = PlotLosses()

    # Initialize DataLoaders
    tdl = DataLoader(train_ds, batch_size=self.batch_size, pin_memory=True)
    vdl = DataLoader(valid_ds, batch_size=self.batch_size, shuffle=False,
                     pin_memory=True)

    # Lists for losses
    train_losses, valid_losses = [], []
    # Lists for accuracies
    train_accs, valid_accs = [], []

    # Iterate over epochs
    for epoch in range(self.max_epochs):
        # Logs for livelossplot
        logs = {}
        batch_losses = []
        batch_count_goods = []

        # Iterate over batches
        for idx_batch, batch in enumerate(tdl):
            x = batch[0].to(DEVICE)
            y = batch[1].to(device=DEVICE, dtype=torch.long)
            pred = self.model(x)
            loss = self.loss_fn(pred, y)
            batch_losses.append(loss.item())
            # Accuracy
            with torch.no_grad():
                batch_count_goods.append(self.count_goods(pred, y))
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

        # Save train loss and accuracy for the epoch
        train_losses.append(sum(batch_losses) / len(train_ds))
        train_accs.append(sum(batch_count_goods) / len(train_ds))

        # Compute and save validation loss and accuracy for the epoch
        with torch.no_grad():
            v_batch_losses, v_batch_count_goods = [], []
            for idx_batch, batch in enumerate(vdl):
                x = batch[0].to(DEVICE)
                y = batch[1].to(device=DEVICE, dtype=torch.long)
                pred = self.model(x)
                loss = self.loss_fn(pred, y)
                v_batch_losses.append(loss.item())
                v_batch_count_goods.append(self.count_goods(pred, y))
            valid_losses.append(sum(v_batch_losses) / len(valid_ds))
            valid_accs.append(sum(v_batch_count_goods) / len(valid_ds))

        if plot_loss:
            logs['log loss'] = train_losses[epoch]
            logs['val_log loss'] = valid_losses[epoch]
            logs['accuracy'] = train_accs[epoch]
            logs['val_accuracy'] = valid_accs[epoch]
            liveloss.update(logs)
            liveloss.draw()
# TO START:
#   pip install livelossplot
#   pip install neptune-cli
#   neptune account login
#   neptune run minimal-neptune.py
# enjoy results

from time import sleep

import numpy as np

from livelossplot import PlotLosses

liveplot = PlotLosses(target='neptune')

for i in range(20):
    liveplot.update({
        'accuracy': 1 - np.random.rand() / (i + 2.),
        'val_accuracy': 1 - np.random.rand() / (i + 0.5),
        'mse': 1. / (i + 2.),
        'val_mse': 1. / (i + 0.5)
    })
    liveplot.draw()
    sleep(.5)
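# Note: the script above targets the pre-0.5 livelossplot API
# (PlotLosses(target=...) plus .update()/.draw()). In livelossplot >= 0.5 the
# equivalent, to the best of my reading of its docs, routes through output
# plugins and .send(); treat the plugin name below as an assumption to verify
# against your installed version.
from livelossplot import PlotLosses
from livelossplot.outputs import NeptuneLogger  # assumed plugin name in >= 0.5

liveplot = PlotLosses(outputs=[NeptuneLogger()])
liveplot.update({'accuracy': 0.9, 'val_accuracy': 0.8})
liveplot.send()  # replaces .draw() in the >= 0.5 API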
def train(model, criterion, optimizer, train_dl, test_dl, num_epochs=40):
    liveloss = PlotLosses()
    for epoch in range(num_epochs):
        train_loss, valid_loss = [], []
        logs = {}
        prefix = ''

        # Training part
        model.train()
        for i, data in enumerate(train_dl, 0):
            # Get the inputs (autoencoder: target equals input)
            inputs = labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            inputs = inputs.float()
            labels = labels.float()

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = model(inputs)
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            ## -> Dense Output Re-feeding <- ##
            # Zero the gradients
            optimizer.zero_grad()
            # Important: detach() the output to avoid constructing a second
            # computation graph through the first forward pass
            outputs = model(outputs.detach())
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            logs[prefix + 'MMSE loss'] = loss.item()

        for i, data in enumerate(test_dl, 0):
            model.eval()
            inputs = labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            inputs = inputs.float()
            labels = labels.float()
            outputs = model(inputs)
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            valid_loss.append(loss.item())
            prefix = 'val_'
            logs[prefix + 'MMSE loss'] = loss.item()

        print()
        liveloss.update(logs)
        liveloss.draw()
        print("Epoch:", epoch + 1,
              " Training Loss: ", np.mean(train_loss),
              " Valid Loss: ", np.mean(valid_loss))
def train_classifier(
    self,
    train_loader,
    test_loader,
    params: dict = None,
    livelossplot=False,
    save_checkpoint_each=None,
):
    """
    Method to train the model.

    Arguments:
    ----------
    - train_loader : DatasetLoader for the training set
    - test_loader : DatasetLoader for the test set
    - params (dict) : if you need to update some parameters, such as epochs,
      without rebuilding the entire class, put the updated parameters here
    - livelossplot (bool=False) : use livelossplot to plot running loss and error_rate
    - save_checkpoint_each (list) : list of epochs at which to save the model
    """
    # Update parameters if given
    if save_checkpoint_each is None:
        save_checkpoint_each = [self.params_classifier["epochs"]]
    if params:
        for param, value in params.items():
            self.params_classifier[param] = value

    # Define liveloss and time of training start
    if livelossplot:
        liveloss = PlotLosses()
    since = time.time()

    # Show which device is used
    print("Using device {}".format(self.device))
    self.model.to(self.device)

    loader_dict = {"train": train_loader, "validation": test_loader}

    for e in range(self.params_classifier["epochs"]):
        self.logs = {}
        if not livelossplot:
            print("Epoch {}/{} :".format(e, self.params_classifier["epochs"]))
            print("--------------")

        # Alternate between train and validation phase
        for phase in ["train", "validation"]:
            if phase == "train":
                self.model.train()
            else:
                self.model.eval()

            # Track loss and incorrect predictions
            running_loss = 0.0
            running_uncorrects = 0

            # Loop over loader
            for images, labels in iter(loader_dict[phase]):
                images = images.to(self.device)
                labels = torch.tensor(labels, dtype=torch.long, device=self.device)

                # Compute forward
                output = self.model.forward(images)
                loss = self.loss(output, labels)

                # Backpropagate if in train phase
                if phase == "train":
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                # Compute prediction
                _, predicted = torch.max(output, 1)
                running_loss += loss.detach() * images.size(0)
                running_uncorrects += torch.sum(predicted != labels.data.detach())

            # Compute loss and error_rate
            size_loader = len(loader_dict[phase].dataset)
            epoch_loss = running_loss / size_loader
            epoch_error_rate = running_uncorrects.float() / size_loader

            # Set the prefix for logs
            prefix = ""
            if phase == "validation":
                prefix = "val_"

            # Update logs
            self.logs[prefix + "log loss"] = epoch_loss.item()
            self.logs[prefix + "error_rate"] = epoch_error_rate.item()

        # Use liveloss to plot loss and error rate
        if livelossplot:
            liveloss.update(self.logs)
            liveloss.draw()
        else:
            string_print = """
            Training:        | Validation:
            log loss = {}    | val_log loss = {}
            error_rate = {}  | val_error_rate = {}
            """.format(
                self.logs["log loss"],
                self.logs["val_log loss"],
                self.logs["error_rate"],
                self.logs["val_error_rate"],
            )
            print(string_print)

        # Save checkpoint
        if (e + 1) in save_checkpoint_each:
            save_checkpoint(self.model,
                            model_name="AlexNet_checkpoint_e{}.pth".format(e))

    # Print training time
    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(
        time_elapsed // 60, time_elapsed % 60))
def train_model_it(model, dataloaders, dataset_sizes, criterion, optimizer,
                   batch_size, num_epochs=10, scheduler=None):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        running_loss = 0.0
        running_corrects = 0

        # Iteration
        for i, (inputs, labels) in enumerate(dataloaders['train']):
            if scheduler is not None:
                scheduler.step()
            model.train()
            # Note: these are reset every iteration, so the logged training
            # loss/accuracy reflect the most recent batch only.
            running_loss = 0.0
            running_corrects = 0

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

            print("\rTraining Iteration: {}/{}, Loss: {}.".format(
                i + 1, len(dataloaders['train']),
                loss.item() * inputs.size(0) / batch_size),
                end="")
            sys.stdout.flush()

            # Validate every 100 iterations
            if (i + 1) % 100 == 0:
                it_loss = running_loss / batch_size
                it_acc = running_corrects.double() / batch_size

                model.eval()
                val_loss = 0
                val_corr = 0
                for j, (inputs, labels) in enumerate(dataloaders['val']):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad()

                    with torch.set_grad_enabled(False):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                    val_loss += loss.item() * inputs.size(0)
                    val_corr += torch.sum(preds == labels.data)
                    print("\rValidation Iteration: {}/{}, Loss: {}.".format(
                        j + 1, len(dataloaders['val']),
                        loss.item() * inputs.size(0) / batch_size),
                        end="")
                    sys.stdout.flush()

                valid_loss = val_loss / dataset_sizes['val']
                valid_acc = val_corr.double() / dataset_sizes['val']

                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                # statistics
                liveloss.update({
                    'log loss': it_loss,
                    'val_log loss': valid_loss,
                    'accuracy': it_acc,
                    'val_accuracy': valid_acc
                })
                liveloss.draw()
                print('validation loss: {}, validation accuracy: {}'.format(
                    valid_loss, valid_acc))
                print('Best Accuracy: {}'.format(best_acc))
                torch.save(model.state_dict(),
                           "./models/acc_{}_loss_{}.pt".format(best_acc, valid_loss))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
def train_loop(model, device, optimizer, train_loader, test_loader,
               lr_scheduler=reduce_lr_scheduler, criterion=cross_entropy,
               epoch_value=10, plot_loss=False):
    lr_policy = lr_scheduler(optimizer)
    start = time.time()
    liveloss = PlotLosses()

    for epoch_ind in range(epoch_value):
        try:
            logs = {}
            model.train()
            train_loss = 0
            for ind, (input_s, target_s) in enumerate(train_loader):
                input_s = input_s.to(device)
                target_s = target_s.to(device)
                pred = model(input_s)
                loss = criterion(pred, target_s)
                model.zero_grad()
                loss.backward()
                optimizer.step()
                # accumulate as a float so the autograd graph is not kept alive
                train_loss += loss.item()
                del input_s, target_s
                gc.collect()
                torch.cuda.empty_cache()
            train_loss /= (ind + 1)

            test_loss = 0
            model.eval()
            with torch.no_grad():
                for ind, (input_s, target_s) in enumerate(test_loader):
                    input_s = input_s.to(device)
                    target_s = target_s.to(device)
                    pred = model(input_s)
                    loss = criterion(pred, target_s)
                    test_loss += loss.item()
                    del input_s, target_s
                    gc.collect()
                    torch.cuda.empty_cache()
                test_loss /= (ind + 1)

            lr_policy.step(test_loss)
            train_time = time_since(start)

            if plot_loss:
                # train loss goes under 'loss', test loss under 'val_loss'
                logs['loss'] = train_loss
                logs['val_' + 'loss'] = test_loss
                liveloss.update(logs)
                liveloss.draw()
            else:
                callback(train_loss, test_loss, train_time, epoch_value,
                         epoch_ind + 1)
        except KeyboardInterrupt:
            print(f"Early stopping | Epoch: {epoch_ind + 1}")
            break

    return model
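# Hypothetical sketch of the textual fallback `callback` used above when
# plot_loss=False; the real helper lives elsewhere in the repo and may format
# differently. `time_since(start)` is assumed to return a printable duration.
def callback(train_loss, test_loss, train_time, epochs_total, epoch_no):
    print(f"Epoch {epoch_no}/{epochs_total} | "
          f"train loss: {train_loss:.4f} | test loss: {test_loss:.4f} | "
          f"elapsed: {train_time}")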
def train_cross_validation(model_cls, dataset, dropout=0.0, lr=1e-3,
                           weight_decay=1e-2, num_epochs=200, n_splits=10,
                           use_gpu=True, dp=False, ddp=False, comment='',
                           tb_service_loc='192.168.192.57:6007', batch_size=1,
                           num_workers=0, pin_memory=False, cuda_device=None,
                           tb_dir='runs', model_save_dir='saved_models',
                           res_save_dir='res', fold_no=None,
                           saved_model_path=None, device_ids=None, patience=20,
                           seed=None, fold_seed=None, save_model=False,
                           is_reg=True, live_loss=True, domain_cls=True,
                           final_cls=True):
    """
    :type fold_seed: int
    :param live_loss: bool
    :param is_reg: bool
    :param save_model: bool
    :param seed:
    :param patience: for early stopping
    :param device_ids: for ddp
    :param saved_model_path:
    :param fold_no: int
    :param ddp: DDP
    :param cuda_device: list of int
    :param pin_memory: bool, DataLoader args
    :param num_workers: int, DataLoader args
    :param model_cls: pytorch Module cls
    :param dataset: instance
    :param dropout: float
    :param lr: float
    :param weight_decay:
    :param num_epochs:
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset args not DataLoader
    :return:
    """
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and dp:
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device

    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and dp) else device_count
    batch_size = batch_size * device_count

    # TensorBoard
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".format(
            log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    # model
    criterion = nn.NLLLoss()

    print("Training {0} {1} models for cross validation...".format(n_splits, model_name))
    # 1
    # folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    # 2
    # folds = GroupKFold(n_splits=n_splits)
    # iter = folds.split(np.zeros(len(dataset)), groups=dataset.data.site_id)
    # 4
    # folds = StratifiedKFold(n_splits=n_splits, random_state=fold_seed,
    #                         shuffle=True if fold_seed else False)
    # iter = folds.split(np.zeros(len(dataset)), dataset.data.y.numpy(),
    #                    groups=dataset.data.subject_id)
    # 5
    fold = 0
    iter = multi_site_cv_split(dataset.data.y, dataset.data.site_id,
                               dataset.data.subject_id, n_splits,
                               random_state=fold_seed,
                               shuffle=True if fold_seed else False)

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        liveloss = PlotLosses() if live_loss else None

        # for a specific fold
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base + str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader for fold {}".format(fold))
        train_dataset, val_dataset = norm_train_val(dataset, train_idx, val_idx)

        model = model_cls(writer)

        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=batch_size,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset,
                                    shuffle=False,
                                    batch_size=batch_size,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        if fold == 1 or fold_no is not None:
            print(model)
            writer.add_text('model_summary', model.__repr__())
            writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=lr,
                                      betas=(0.9, 0.999),
                                      eps=1e-08,
                                      weight_decay=weight_decay,
                                      amsgrad=False)
        # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(
        #     optimizer, patience=5, factor=0.5)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=5)
        # scheduler = scheduler_reduce
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

        if dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        best_map, patience_counter, best_score = 0.0, 0, np.inf

        for epoch in tqdm_notebook(range(1, num_epochs + 1), desc='Epoch', leave=False):
            logs = {}
            # scheduler.step(epoch=epoch, metrics=best_score)

            for phase in ['train', 'validation']:
                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor([])
                logging_hist = True if phase == 'train' else False  # once per epoch

                for data_list in tqdm_notebook(dataloader, desc=phase, leave=False):
                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(
                            data_list,
                            (device_ids[0] if device_ids is not None else 'cuda'))

                    y_hat, domain_yhat, reg = model(data_list)

                    y = torch.tensor([], dtype=dataset.data.y.dtype, device=device)
                    domain_y = torch.tensor([], dtype=dataset.data.site_id.dtype,
                                            device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                        domain_y = torch.cat(
                            [domain_y, data.site_id.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    domain_loss = criterion(domain_yhat, domain_y)
                    # domain_loss = -1e-7 * domain_loss
                    if domain_cls:
                        total_loss = domain_loss
                        _, predicted = torch.max(domain_yhat, 1)
                        label = domain_y
                    if final_cls:
                        total_loss = loss
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if domain_cls and final_cls:
                        total_loss = (loss + domain_loss).sum()
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if is_reg:
                        total_loss += reg.sum()

                    if phase == 'train':
                        optimizer.zero_grad()
                        total_loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label, label.detach().float().view(-1).cpu()])
                    epoch_predicted = torch.cat(
                        [epoch_predicted, predicted.detach().float().view(-1).cpu()])

                accuracy = sklearn.metrics.accuracy_score(epoch_label, epoch_predicted)

                epoch_total_loss = running_total_loss / dataloader.__len__()
                epoch_nll_loss = running_nll_loss / dataloader.__len__()
                epoch_reg_loss = running_reg_loss / dataloader.__len__()

                writer.add_scalars('nll_loss',
                                   {'{}_nll_loss'.format(phase): epoch_nll_loss},
                                   epoch)
                writer.add_scalars('accuracy',
                                   {'{}_accuracy'.format(phase): accuracy},
                                   epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars('reg_loss',
                                       {'{}_reg_loss'.format(phase): epoch_reg_loss},
                                       epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    # best score
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    score = epoch_nll_loss
                    if score < best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    # skip first 10 epoch
                    # best_score = best_score if epoch > 10 else -np.inf

                    if save_model:
                        for th, pfix in zip(
                                [0.8, 0.75, 0.7, 0.5, 0.0],
                                ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break
                        torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars('best_val_accuracy',
                                       {'{}_accuracy'.format(phase): best_map},
                                       epoch)
                    writer.add_scalars('best_nll_loss',
                                       {'{}_nll_loss'.format(phase): best_score},
                                       epoch)
                    writer.add_scalars(
                        'learning_rate',
                        {'learning_rate': scheduler.optimizer.param_groups[0]['lr']},
                        epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return

                if live_loss:
                    prefix = ''
                    if phase == 'validation':
                        prefix = 'val_'
                    logs[prefix + 'log loss'] = epoch_nll_loss
                    logs[prefix + 'accuracy'] = accuracy

            if live_loss:
                liveloss.update(logs)
                liveloss.draw()

    print("Done !")
class LiveLossPlot(Callback):
    """
    Callback to write metrics to `LiveLossPlot <https://github.com/stared/livelossplot>`_,
    a library for visualisation in notebooks

    Example: ::

        >>> import torch.nn
        >>> from torchbearer import Trial
        >>> from torchbearer.callbacks import LiveLossPlot

        # Example Trial which plots its metrics with LiveLossPlot.
        >>> model = torch.nn.Linear(1, 1)
        >>> live_loss_plot = LiveLossPlot()
        >>> trial = Trial(model, callbacks=[live_loss_plot], metrics=['acc'])

    Args:
        on_batch (bool): If True, batch metrics will be logged. Else batch metrics will not be logged
        batch_step_size (int): The number of batches between logging metrics
        on_epoch (bool): If True, epoch metrics will be logged every epoch. Else epoch metrics will not be logged
        draw_once (bool): If True, draw the plot only at the end of training. Else draw every time metrics are logged
        kwargs: Keyword arguments for livelossplot.PlotLosses

    State Requirements:
        - :attr:`torchbearer.state.METRICS`: Metrics should be a dict containing the metrics to be plotted
        - :attr:`torchbearer.state.BATCH`: Batch should be the current batch or iteration number in the epoch
    """

    def __init__(self, on_batch=False, batch_step_size=10, on_epoch=True,
                 draw_once=False, **kwargs):
        super(LiveLossPlot, self).__init__()
        self._kwargs = kwargs

        self.on_batch = on_batch
        self.on_epoch = on_epoch
        self.draw_once = draw_once
        self.batch_step_size = batch_step_size

        if on_batch:
            self.on_step_training = self._on_step_training
        if on_epoch:
            self.on_end_epoch = self._on_end_epoch

    def on_start(self, state):
        from livelossplot import PlotLosses
        self.plt = PlotLosses(**self._kwargs)
        self.batch_plt = PlotLosses(**self._kwargs)

    def _on_step_training(self, state):
        self.batch_plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if state[torchbearer.BATCH] % self.batch_step_size == 0 and not self.draw_once:
            with no_print():
                self.batch_plt.draw()

    def _on_end_epoch(self, state):
        self.plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if not self.draw_once:
            with no_print():
                self.plt.draw()

    def on_end(self, state):
        if self.draw_once:
            with no_print():
                self.batch_plt.draw()
                self.plt.draw()
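# A sketch of actually running the Trial built in the docstring above.
# Assumptions: torchbearer's Trial.with_generators / Trial.run API, and that
# trainloader / valloader are existing torch DataLoaders you supply yourself.
import torch.nn
import torch.optim
from torchbearer import Trial

model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trial = Trial(model, optimizer, torch.nn.MSELoss(), metrics=['loss'],
              callbacks=[LiveLossPlot()])
trial.with_generators(trainloader, val_generator=valloader)
trial.run(epochs=10)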
def train_model(model, optimizer, criterion, n_epochs, train_loader,
                validation_loader=None, device='cpu', random_seed=42,
                backup_folder=None):
    """
    Train a model for a number of epochs.
    Visualizes average loss, F1 and accuracy score over epochs.
    If a folder is given, saves state dict and scores to disk after each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        The neural network model to train
    optimizer : torch.optim.Optimizer
        Optimizer to use in training
    criterion : loss function in torch.nn
    n_epochs : int
        number of training iterations over the entire train dataset
    train_loader : torch.utils.data.DataLoader
        batch data loader of train data
    validation_loader : torch.utils.data.DataLoader, optional
        batch data loader of validation data, if available
    device : str, optional
        'cpu' or 'cuda'; hardware accelerator to use
    random_seed : int, optional
        seed number for RNGs
    backup_folder : path-like, optional
        folder where model parameters are saved after each epoch
        (will delete all contents of the folder first!)

    Returns
    -------
    model : torch.nn.Module
        model with optimized weights after training
    validation_loss : float
        average loss of final model on all samples from validation/test set
    validation_accuracy : float
        accuracy score of final model on validation/test set
    validation_f1 : float
        F1 score of final model on validation/test set
    """
    set_seed(random_seed)  # seed all RNGs before start to have reproducible results
    model = model.to(device)

    if backup_folder is not None:
        if os.path.isdir(backup_folder):
            shutil.rmtree(backup_folder)
        elif os.path.exists(backup_folder):
            os.remove(backup_folder)
        os.mkdir(backup_folder)
        with open(os.path.join(backup_folder, "training_report.csv"), 'w') as csv_report:
            csv_report.write(
                "epoch,train_loss,train_accuracy,train_f1,"
                "validation_loss,validation_accuracy,validation_f1\n")

    # use special (faster) data loaders if running on TPU
    if TPU_AVAILABLE and (device not in ['cpu', 'cuda']):
        train_loader = xla_loader(train_loader, [device]).per_device_loader(device)
        if validation_loader is not None:
            validation_loader = xla_loader(validation_loader,
                                           [device]).per_device_loader(device)

    live_plot = PlotLosses()

    # training loop
    for epoch in range(n_epochs):
        logs = dict()

        # do mini-batch SGD over all training samples
        train_loss, train_accuracy, train_f1 = train(model, optimizer, criterion,
                                                     train_loader, device)
        if backup_folder is not None:
            torch.save(model.state_dict(),
                       os.path.join(backup_folder,
                                    "model_epoch_{:d}.pth".format(epoch)))

        logs['log loss'] = train_loss
        logs['f1 score'] = train_f1
        logs['accuracy'] = train_accuracy

        # evaluate model on validation/test set
        if validation_loader is not None:
            validation_loss, validation_accuracy, validation_f1 = validate(
                model, criterion, validation_loader, device)
            logs['val_log loss'] = validation_loss
            logs['val_f1 score'] = validation_f1
            logs['val_accuracy'] = validation_accuracy
        else:
            validation_loss, validation_accuracy, validation_f1 = 0, 0, 0

        if backup_folder is not None:
            with open(os.path.join(backup_folder, "training_report.csv"),
                      'a') as csv_report:
                csv_report.write(
                    "{:d},{:.4f},{:.4f},{:.4f},{:.4f},{:.4f},{:.4f}\n".format(
                        epoch, train_loss, train_accuracy, train_f1,
                        validation_loss, validation_accuracy, validation_f1))

        # draw the visualization of average loss and accuracy
        live_plot.update(logs)
        live_plot.draw()

    return model, validation_loss, validation_accuracy, validation_f1
class train_wrapper():
    """
    Class that keeps a model, its optimiser and dataloaders together.
    Stores the train, validate and evaluate functions for training, as well as
    some other useful methods to carry out the training with a live plot and
    save the model.
    """

    def __init__(self, model, optimizer, train_loader, validate_loader,
                 criterion=nn.CrossEntropyLoss(), device="cpu", keep_best=0):
        "Stores the parameters on the class instance for later methods"
        for arg in ["model", "optimizer", "train_loader", "validate_loader",
                    "criterion", "device", "keep_best"]:
            exec("self." + arg + "=" + arg)
        try:
            self.transform = validate_loader.dataset.transform
        except:
            print("No transform found, test data must be normalised manually")
        # store the liveloss as it holds all our logs, useful for later
        self.liveloss = PlotLosses()
        # store the best model params
        self.best_params_dict = {}
        # store the current epoch between training batches
        self.epoch = 0
        # for keeping the best model params
        self.max_acc = 0.
        return

    def train(self):
        "Train a single epoch"
        # set the model to expect a backward pass
        self.model.train()
        train_loss, train_accuracy = 0, 0
        # for every training batch
        for X, y in self.train_loader:
            # put the samples on the device
            X, y = X.to(self.device), y.to(self.device)
            # zero the gradient
            self.optimizer.zero_grad()
            # find the model output with current parameters
            output = self.model(X)
            # calculate the loss against the expected output
            loss = self.criterion(output, y)
            # propagate the gradients through the network
            loss.backward()
            # store the loss (scaled by batch size for averaging; detached so
            # the graph is not kept alive across the epoch)
            train_loss += loss.detach() * X.size(0)
            # find the predictions from this output
            y_pred = F.log_softmax(output, dim=1).max(1)[1]
            # compare to expected output to find the accuracy
            train_accuracy += accuracy_score(
                y.cpu().numpy(), y_pred.detach().cpu().numpy()) * X.size(0)
            # improve the parameters
            self.optimizer.step()
        # return the mean loss and accuracy of this epoch
        N_samp = len(self.train_loader.dataset)
        return train_loss / N_samp, train_accuracy / N_samp

    def validate(self):
        """
        Find the loss and accuracy of the current model parameters on the
        validation data set
        """
        # if no validation set present return zeros
        if self.validate_loader is None:
            return torch.tensor(0.), torch.tensor(0.)
        # set the model to not expect a backward pass
        self.model.eval()
        validation_loss, validation_accuracy = 0., 0.
        # for every validate batch
        for X, y in self.validate_loader:
            # tell torch not to store gradients
            with torch.no_grad():
                # put the samples on the device
                X, y = X.to(self.device), y.to(self.device)
                # find the model output with current parameters
                output = self.model(X)
                # calculate the loss against the expected output
                loss = self.criterion(output, y)
                # store the loss (scaled by batch size for averaging)
                validation_loss += loss * X.size(0)
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1).max(1)[1]
                # compare to expected output to find the accuracy
                validation_accuracy += accuracy_score(
                    y.cpu().numpy(), y_pred.cpu().numpy()) * X.size(0)
        # return the mean loss and accuracy of this epoch
        N_samp = len(self.validate_loader.dataset)
        return validation_loss / N_samp, validation_accuracy / N_samp

    def evaluate(self, test_data, prob_output=True):
        """
        Find the prediction of the current model parameters with the test data
        set and return the predicted labels
        """
        # set the model to not expect a backward pass
        self.model.eval()
        y_preds = []
        # for every test batch
        for X in test_data:
            # normalise the test data with the validation transformation
            if self.transform:
                X = self.transform(X)
            # tell torch not to store gradients
            with torch.no_grad():
                # put the samples on the device
                X = X.to(self.device)
                # find the model output with current parameters
                output = self.model(X.view(-1, 1, 28, 28))
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1)
                if not prob_output:
                    y_pred = y_pred.max(1)[1]
                # store the predicted outcomes
                y_preds.append(y_pred.cpu().numpy())
        # return the list of predictions
        return np.concatenate(y_preds, 0)

    def train_model(self, epochs):
        """
        Do a live plot of the training accuracy and loss as the model is
        trained
        """
        for _ in range(epochs):
            logs = {}
            train_loss, train_accuracy = self.train()
            logs['log loss'] = train_loss.item()
            logs['accuracy'] = train_accuracy.item()

            validation_loss, validation_accuracy = self.validate()
            logs['val_log loss'] = validation_loss.item()
            logs['val_accuracy'] = validation_accuracy.item()

            # if we are past the keep_best epoch threshold, keep the best
            # parameters seen so far
            if self.keep_best:
                if train_accuracy.item() > self.max_acc and self.epoch > self.keep_best:
                    self.max_acc = train_accuracy.item()
                    self.best_params_dict = self.model.state_dict()

            self.liveloss.update(logs)
            self.liveloss.draw()
            self.epoch += 1
        print("Training Finished")
        return

    def save_model(self, name, path=F"/content/gdrive/My Drive/models/"):
        """
        Pickle either the whole model or its parameter dictionary via torch's
        save methods
        """
        dict = {"model": self.model,
                "transform": self.transform,
                "Liveloss": self.liveloss}
        torch.save(dict, path + name)
        print("saved to " + path + name)

    def num_model_params(self):
        n_params = sum([t.cpu().detach().numpy().size
                        for t in self.model.parameters()])
        print("Number of model Parameters: ", n_params)
        return n_params

    def max_acc_epoch(self):
        max_acc = self.liveloss.metrics_extrema['val_accuracy']['max']
        for log in self.liveloss.logs:
            if log["val_accuracy"] == max_acc:
                return log["_i"]

    def confusion_matrix(self):
        y_preds, ys = [], []
        # same code as validate
        self.model.eval()
        for X, y in self.validate_loader:
            with torch.no_grad():
                X, y = X.to(self.device), y.to(self.device)
                output = self.model(X)
                y_pred = F.log_softmax(output, dim=1)
                y_pred = y_pred.max(1)[1]
                y_preds.append(y_pred.cpu().numpy())
                ys.append(y.cpu().numpy())
        y_preds = np.array(y_preds).flatten()
        ys = np.array(ys).flatten()
        return ConfusionMatrix(actual_vector=ys, predict_vector=y_preds)
def train(self):
    """Run the full training loop.

    Returns:
        None.
    """
    max_noprogress = 5
    _loss_train_min = 1e-5
    n_noprogress = 0

    process_bar = tqdm(range(self.iteration))
    liveloss = PlotLosses(fig_path=self.output_file_name + ".iter.pdf")
    loss_list = []
    _best_ndcg = 0

    for i in process_bar:
        logs = {}
        all_loss = 0
        kl_loss = 0
        batch_num = 0

        for batch_ndx, sample in enumerate(self.data_loader):
            pos_u = torch.tensor(
                [triple[0] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            pos_i_1 = torch.tensor(
                [triple[1] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            pos_i_2 = torch.tensor(
                [triple[2] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            neg_u = torch.tensor(
                self.data.user_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )
            neg_i_1 = torch.tensor(
                self.data.item_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )
            neg_i_2 = torch.tensor(
                self.data.item_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_i_1, pos_i_2,
                                      neg_u, neg_i_1, neg_i_2)
            loss.backward()
            self.optimizer.step()

            all_loss = all_loss + loss
            kl_loss = kl_loss + self.model.kl_loss
            batch_num = batch_ndx

        if self.device.type == "cuda":
            all_loss = all_loss.cpu()
            if kl_loss != 0:
                kl_loss = kl_loss.cpu()
        logs["loss"] = all_loss.item() / batch_num

        if self.show_result:
            data_i = np.random.randint(10)
            result = self.data.evaluate_vali(self.data.test[data_i], self.model)
            logs["ndcg@10_test"], logs["recall@10_test"] = (
                result["ndcg@10"],
                result["recall@10"],
            )
            result = self.data.evaluate_vali(self.data.validate[data_i], self.model)
            logs["ndcg@10_val"], logs["recall@10_val"] = (
                result["ndcg@10"],
                result["recall@10"],
            )
            if _best_ndcg < result["ndcg@10"]:
                _best_ndcg = result["ndcg@10"]
                self.best_model = copy.deepcopy(self.model.state_dict())
                torch.save(self.best_model, self.output_file_name)

        if kl_loss != 0:
            logs["kl_loss"] = kl_loss.item() / batch_num
            logs["loss"] = logs["loss"] - logs["kl_loss"]

        loss_list.append(logs["loss"])
        if i > 1:
            if abs(loss_list[i] - loss_list[i - 1]) < _loss_train_min:
                n_noprogress += 1
            else:
                n_noprogress = 0

        liveloss.update(logs)
        liveloss.draw()
        process_bar.set_description(
            "Loss: %0.8f, lr: %0.6f"
            % (logs["loss"], self.optimizer.param_groups[0]["lr"]))
        print("=== #no progress: ", n_noprogress)

        if n_noprogress >= max_noprogress:
            liveloss.draw()
            break

        # Decay the learning rate: halve the initial LR every 10 iterations
        lr = self.initial_lr * (0.5 ** (i // 10))
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

        if i >= self.iteration - 1:
            liveloss.draw()
def main():
    global best_test_bpd

    last_checkpoints = []
    lipschitz_constants = []
    ords = []

    # if args.resume:
    #     validate(args.begin_epoch - 1, model, ema)

    liveloss = PlotLosses()
    for epoch in range(args.begin_epoch, args.nepochs):
        logs = {}
        logger.info('Current LR {}'.format(optimizer.param_groups[0]['lr']))

        running_loss = train(epoch, model)

        lipschitz_constants.append(get_lipschitz_constants(model))
        ords.append(get_ords(model))
        logger.info('Lipsh: {}'.format(pretty_repr(lipschitz_constants[-1])))
        logger.info('Order: {}'.format(pretty_repr(ords[-1])))

        epoch_loss = running_loss / len(
            datasets.CIFAR10(args.dataroot, train=True, transform=transform_train))
        logs['log loss'] = epoch_loss.item()
        liveloss.update(logs)
        liveloss.draw()

        if args.ema_val:
            test_bpd = validate(epoch, model, ema)
        else:
            test_bpd = validate(epoch, model)

        if args.scheduler and scheduler is not None:
            scheduler.step()

        if test_bpd < best_test_bpd:
            best_test_bpd = test_bpd
            utils.save_checkpoint(
                {
                    'state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'args': args,
                    'ema': ema,
                    'test_bpd': test_bpd,
                },
                os.path.join(args.save, 'moMoModels'),
                epoch,
                last_checkpoints,
                num_checkpoints=5)

        torch.save(
            {
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            },
            os.path.join(args.save, 'models', '010mmoosttMoosttRecentt.pth'))
def train_model(model, dataloaders, dataset_sizes, criterion, optimizer,
                scheduler, num_epochs=25):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history only if in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                print("\rIteration: {}/{}, Loss: {}.".format(
                    i + 1, len(dataloaders[phase]),
                    loss.item() * inputs.size(0)),
                    end="")
                sys.stdout.flush()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            if phase == 'train':
                avg_loss = epoch_loss
                t_acc = epoch_acc
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        liveloss.update({
            'log loss': avg_loss,
            'val_log loss': val_loss,
            'accuracy': t_acc,
            'val_accuracy': val_acc
        })
        liveloss.draw()

        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print('Best Val Accuracy: {}'.format(best_acc))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
best_acc = test_correct / test_total

# checkpoint = torch.load('./checkpoint/Sqnet_1x_v1.0/Sqnet_1x_v1.0_Cifar10.ckpt')
# net.load_state_dict(checkpoint['net_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

liveloss = PlotLosses()
best_cost = float('inf')  # track the fastest epoch so far

for _epoch in range(start_epoch, start_epoch + num_epochs):
    start_time = time.time()
    train(_epoch)
    print()
    test(_epoch)
    print()
    print()
    end_time = time.time()
    print('Epoch #%d Cost %ds' % (_epoch, end_time - start_time))
    if end_time - start_time < best_cost:
        best_cost = end_time - start_time

    # train_loss, test_loss, train_correct, test_correct are assumed to be
    # module-level stats updated by train() / test() above
    liveloss.update({
        'log loss': train_loss,
        'val_log loss': test_loss,
        'accuracy': train_correct,
        'val_accuracy': test_correct
    })
    liveloss.draw()

print('Best Cost: %ds' % (best_cost))
print('Best Acc: %.4f percent' % (best_acc * 100))
def fit_model(train_loader, val_loader, model, optimizer, scheduler, n_epochs,
              log_interval, plot=True, burnin=-1, patience=3,
              early_stop_score='MAP', eval_metric='cosine'):
    early_stop = {}
    early_stop['best'] = -float('inf')
    early_stop['best_params'] = to_cpu(model.state_dict())
    early_stop['fails'] = 0

    if plot:
        liveloss = PlotLosses()

    for epoch in range(n_epochs):
        logs = {}
        start_time = time.time()

        # Training
        train_loss = train_epoch(train_loader, model, optimizer)
        train_scores = {}
        # Turned off for optimize
        # if epoch > 0 and epoch % log_interval == 0:
        #     train_scores = evaluate_ranking(model, train_loader, metric=eval_metric)

        elapsed = time.time() - start_time
        message = '\n' + '=' * 80
        message += '\nTrain:     '
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {train_loss:5.3f}'
        if 'silhouette' in train_scores:
            message += f', silhouette: {train_scores["silhouette"]:.2f}'
        message += '\n'

        # Validation
        start_time = time.time()
        val_loss = test_epoch(val_loader, model)
        val_scores = {}
        if epoch > 0 and epoch % log_interval == 0:
            train_label_set = list(set(train_loader.dataset.labels))
            val_scores = evaluate_ranking(model, val_loader, train_label_set,
                                          metric=eval_metric)

            # early stopping
            if val_scores[early_stop_score] > early_stop['best']:
                early_stop['best'] = val_scores[early_stop_score]
                early_stop['best_params'] = to_cpu(model.state_dict())
                early_stop['fails'] = 0
                early_stop['val_scores'] = val_scores
            else:
                early_stop['fails'] += 1
            if early_stop['fails'] >= patience:
                raise EarlyStopException(early_stop['best'],
                                         early_stop['best_params'],
                                         early_stop['fails'],
                                         early_stop['val_scores'])

        elapsed = time.time() - start_time
        message += 'Validation:'
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {val_loss:5.3f}'
        if 'silhouette' in val_scores:
            message += f', silhouette: {val_scores["silhouette"]:.2f}'
            message += f'\n            MAP: {val_scores["MAP"]:.2f}'
            message += f', MAP (seen): {val_scores["MAP seen labels"]:.2f}'
            message += f', MAP (unseen): {val_scores["MAP unseen labels"]:.2f}'
        message += '\n'
        message += '=' * 80 + '\n'
        print(message)

        logs['loss'] = train_loss
        logs['val_loss'] = val_loss
        for score, value in train_scores.items():
            logs[score] = value
        for score, value in val_scores.items():
            logs[f'val_{score}'] = value

        if epoch > burnin:
            scheduler.step(val_loss)

        if plot:
            liveloss.update(logs)
            liveloss.draw()

    # return data in case it never early stopped
    return early_stop
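# Hypothetical sketch of the EarlyStopException raised above; the real class
# lives elsewhere in the repo, but the call site fixes its constructor shape.
class EarlyStopException(Exception):
    """Carries the best score and parameters out of the training loop."""

    def __init__(self, best, best_params, fails, val_scores):
        super().__init__(f"early stop after {fails} validations without improvement")
        self.best = best
        self.best_params = best_params
        self.fails = fails
        self.val_scores = val_scores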
def train(self, train_ds, valid_ds, plot_loss=True, verbose=True,
          save_path=None, need_y: str = 'no'):
    """Method for training: takes train and validation Datasets, as well as
    parameters specifying training monitoring, and trains a network for a
    given set of hyperparameters.

    :param train_ds: training Dataset
    :param valid_ds: validation Dataset
    :param plot_loss: whether to plot loss during training
    :param verbose: whether to print loss after each epoch
    :param save_path: if given, serialises the model and saves there
    :param need_y: command to extract y's in order to train Attention-based
        models with 'state' or 'switch cells' layer
    """
    # Create DataLoaders
    assert need_y in ['no', 'yes'], 'Should be no/yes'
    train_dl = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)
    test_dl = DataLoader(valid_ds, batch_size=self.batch_size)

    # Dictionary for losses
    losses = {'train_loss': [], 'valid_loss': []}

    # Plot losses if the user chooses so
    if plot_loss:
        liveloss = PlotLosses()

    # Iterate over epochs
    for epoch in range(self.max_epochs):
        # Switch to training mode
        self.model.train()
        if verbose:
            print('Starting epoch {}'.format(epoch + 1))

        # A list for batch-wise training losses in a given epoch
        epoch_loss = []

        # Iterate over batches
        for idx_batch, batch in enumerate(train_dl):
            self.optimizer.zero_grad()
            if need_y == 'yes':
                out = self.model(batch[0]['train_obs'].permute(1, 0, 2),
                                 y=batch[1].permute(1, 0))
                tr_loss = self.loss(out, batch[0]['train_y'].to(DEVICE))
            elif need_y == 'no':
                out = self.model(batch['train_obs'].permute(1, 0, 2))
                tr_loss = self.loss(out, batch['train_y'].to(DEVICE))
            epoch_loss.append(tr_loss.item())
            tr_loss.backward()
            self.optimizer.step()

        # Switch to evaluation mode
        self.model.eval()

        # Compute training loss for the epoch
        losses['train_loss'].append(sum(epoch_loss) / len(train_dl))

        # Compute validation loss by iterating through valid dl batches
        with torch.no_grad():
            # A list for batch-wise validation losses
            val_loss = []
            # Iterate over batches in the validation DataLoader
            for idx_v_batch, v_batch in enumerate(test_dl):
                if need_y == 'yes':
                    val_loss.append(self.loss(
                        self.model(v_batch[0]['test_obs'].permute(1, 0, 2),
                                   y=v_batch[1].permute(1, 0)),
                        v_batch[0]['test_y']).item())
                elif need_y == 'no':
                    val_loss.append(self.loss(
                        self.model(v_batch['test_obs'].permute(1, 0, 2)),
                        v_batch['test_y']).item())
            losses['valid_loss'].append(sum(val_loss) / len(test_dl))

        # Print loss for a given epoch
        if verbose:
            print('Loss: {}'.format(losses['valid_loss'][epoch]))

        # Plot loss after each epoch if the user chose to
        if plot_loss:
            logs = {
                'log_loss': losses['train_loss'][epoch],
                'val_log_loss': losses['valid_loss'][epoch]
            }
            liveloss.update(logs)
            liveloss.draw()

        # Early stopping: stop when the validation loss has not dropped in any
        # of the last `early_stopping_patience` epochs
        if self.early_stopping_patience:
            lag_1 = losses['valid_loss'][(epoch - self.early_stopping_patience):epoch]
            lag_2 = losses['valid_loss'][(epoch - self.early_stopping_patience - 1):(epoch - 1)]
            no_drops = sum(True if l1 < l2 else False
                           for l1, l2 in zip(lag_1, lag_2))
            if epoch > self.early_stopping_patience and no_drops == 0:
                break

    # Save last loss
    self.final_loss = np.mean(losses['valid_loss'][-1])
    self.last_epoch = epoch

    # Save model
    if save_path:
        torch.save(self.model.state_dict(), save_path)
def train_model(model, train_dataset, validate_dataset, test_dataset,
                batch_size, test_batch_size, lr, n_epochs, optimizer=None,
                epoch_trained=0, seed=42):
    """The train function"""
    set_seed(seed)
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                     weight_decay=1e-5, eps=1e-3, amsgrad=True)

    criterion_train = multiscaleUnsupervisorError
    criterion_validate = realEPE

    # Prepare data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, num_workers=4, pin_memory=True)
    validation_loader = DataLoader(validate_dataset, batch_size=test_batch_size,
                                   shuffle=False, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=test_batch_size,
                             shuffle=False, num_workers=4, pin_memory=True)

    liveloss = PlotLosses()
    para_dict = {}
    total_time = 0

    for epoch in range(epoch_trained, n_epochs):
        # time.clock() was removed in Python 3.8; perf_counter is the replacement
        start_time = time.perf_counter()
        print("Total epoch %d" % n_epochs)
        print("Epoch %d starts! " % epoch)
        print("Memory allocated: ",
              torch.cuda.memory_allocated() / 1024 / 1024 / 1024)
        GPUtil.showUtilization()

        logs = {}
        train_loss, train_loss_epe = train(model, optimizer, criterion_train,
                                           train_loader)
        validation_loss_epe = validate(model, criterion_validate,
                                       validation_loader)
        end_time = time.perf_counter()

        logs['multiscale loss'] = train_loss
        logs['EPE loss'] = train_loss_epe
        logs['val_' + 'EPE loss'] = validation_loss_epe
        liveloss.update(logs)
        liveloss.draw()

        total_time += end_time - start_time
        print("Epoch: ", epoch,
              ", Avg. Train EPE Loss: %1.3f" % train_loss_epe,
              " Avg. Validation EPE Loss: %1.3f" % validation_loss_epe,
              "Time used this epoch (seconds): %1.3f" % (end_time - start_time),
              "Time remain(hrs) %1.3f" % (total_time / (epoch + 1) * (n_epochs - epoch) / 3600))

        # Checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            test_loss_epe = validate(model, criterion_validate, test_loader)

            # Fill in the parameters into the dict
            para_dict['epoch'] = epoch
            para_dict['dataset size'] = len(train_loader.dataset)
            para_dict['train EPE'] = train_loss_epe
            para_dict['validation EPE'] = validation_loss_epe
            para_dict['learning rate'] = lr
            para_dict['time used(seconds)'] = total_time
            # There is no actual test loss, so use validation loss here
            para_dict['test EPE'] = test_loss_epe

            # Do the save
            save_model(model, optimizer, train_loss, para_dict,
                       "UnLiteFlowNet_checkpoint_%d_" % epoch)

    test_loss_epe = validate(model, criterion_validate, test_loader)
    print(" Avg. Test EPE Loss: %1.3f" % test_loss_epe,
          "Total time used(seconds): %1.3f" % total_time)
    print("")

    # Fill in the parameters into the dict
    para_dict = {}
    para_dict['epoch'] = n_epochs
    para_dict['dataset size'] = len(train_loader.dataset)
    para_dict['batch_size'] = batch_size
    para_dict['train EPE'] = train_loss_epe
    para_dict['validation EPE'] = validation_loss_epe
    para_dict['learning rate'] = lr
    para_dict['time used(seconds)'] = total_time
    para_dict['test EPE'] = test_loss_epe
    save_model(model, optimizer, train_loss, para_dict,
               "UnLiteFlowNet_%d_" % epoch)

    return model
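# Hypothetical sketch of the save_model helper assumed above; the real one
# presumably lives elsewhere in the repo. Only the call shape
# (model, optimizer, loss, para_dict, name_prefix) is taken from the code.
import torch


def save_model(model, optimizer, loss, para_dict, name_prefix):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'parameters': para_dict,
    }, name_prefix + 'model.pt')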