class Logger:
    """Thin convenience wrapper around ``torch.utils.tensorboard.SummaryWriter``.

    Collects the project's scalar/image/PR-curve/hparam logging calls behind
    one object and converts (epoch, batches-per-epoch, batch index) triples
    into a single global step via :meth:`_global_step`.
    """

    def __init__(self, log_path, comment=None):
        """Create the writer and make sure ``log_path`` exists.

        log_path: directory the event files are written to.
        comment:  forwarded to SummaryWriter (only used when log_dir is unset,
                  but kept for interface compatibility).
        """
        self.log_path = log_path
        self.writer = SummaryWriter(log_dir=self.log_path, comment=comment)
        try:
            # exist_ok avoids the check-then-create race of the original
            # `if not os.path.exists(...)` pattern; behavior is the same.
            os.makedirs(self.log_path, exist_ok=True)
        except Exception as e:
            print(e)
            print("Failed to Create Log Directory.")

    def save_params(
        self,
        param_list,
        param_name_list,
        epoch=None,
        batch_size=None,
        batch=None,
        combine=False,
        combine_name=None,
        global_step=None,
    ):
        """Log scalars, either individually or combined in one chart.

        Tensors wrapped in ``Variable`` are converted in place to numpy
        (note: this mutates the caller's ``param_list``, as before).
        ``global_step`` wins over the (epoch, batch_size, batch) triple.
        """
        if combine is False:
            for i in range(len(param_list)):
                if isinstance(param_list[i], Variable):
                    param_list[i] = param_list[i].data.cpu().numpy()
                step = (
                    global_step
                    if global_step is not None
                    else Logger._global_step(epoch, batch_size, batch)
                )
                self.writer.add_scalar(param_name_list[i], param_list[i], step)
        else:
            scalar_dict = dict(zip(param_name_list, param_list))
            step = (
                global_step
                if global_step is not None
                else Logger._global_step(epoch, batch_size, batch)
            )
            self.writer.add_scalars(combine_name, scalar_dict, step)

    def save_batch_images(self, image_name, image_batch, epoch, batch_size, batch=None, dataformats="CHW"):
        """Log a batch of images under ``image_name``."""
        self.writer.add_images(
            image_name,
            image_batch,
            Logger._global_step(epoch, batch_size, batch),
            dataformats=dataformats,
        )

    def save_prcurve(self, labels, preds, epoch, batch_size, batch=None):
        """Log a precision/recall curve for binary ``labels`` vs ``preds``."""
        self.writer.add_pr_curve("pr_curve", labels, preds, Logger._global_step(epoch, batch_size, batch))

    def save_hyperparams(self, hparam_list, hparam_name_list, metric_list, metric_name_list):
        """Log a hyperparameter/metric table.

        Lists are flattened to comma-joined strings, dicts to JSON and None to
        the string "None" because add_hparams only accepts scalar-ish values.
        (The original's leftover debug ``print`` of all arguments was removed.)
        """
        for i in range(len(hparam_list)):
            if isinstance(hparam_list[i], list):
                hparam_list[i] = ",".join(map(str, hparam_list[i]))
            elif isinstance(hparam_list[i], dict):
                hparam_list[i] = json.dumps(hparam_list[i])
            elif hparam_list[i] is None:
                hparam_list[i] = "None"
        self.writer.add_hparams(
            dict(zip(hparam_name_list, hparam_list)),
            dict(zip(metric_name_list, metric_list)),
        )

    def save_models(self, model_list, model_names_list, epoch):
        """Save each model's state_dict into the log directory.

        NOTE(review): ``epoch`` is currently unused (files are overwritten
        every call) — kept for interface compatibility.
        """
        for model_name, model in zip(model_names_list, model_list):
            torch.save(model.state_dict(), os.path.join(self.log_path, model_name))

    def save_fig(self, fig, fig_name, epoch, batch_size, batch=None):
        """Log a matplotlib figure."""
        self.writer.add_figure(fig_name, fig, Logger._global_step(epoch, batch_size, batch))

    def close(self):
        """Flush and close the underlying SummaryWriter."""
        self.writer.close()

    @staticmethod
    def _global_step(epoch, batch_size, batch):
        """Fold (epoch, batches-per-epoch, batch index) into one step counter.

        ``batch_size`` is really the number of batches per epoch here (the
        original naming is kept for compatibility).

        Fix: the original tested ``if batch:``, so batch index 0 fell through
        and returned just ``epoch``; ``is not None`` treats index 0 correctly.
        """
        if batch is not None:
            return epoch * batch_size + batch
        return epoch
# Evaluation tail of a classification script: accumulates per-batch softmax
# probabilities and predictions, logs per-class PR curves to TensorBoard, then
# writes a Kaggle-style submission CSV.
# NOTE(review): this chunk starts mid-loop — `predictions`, `labels`, `outputs`,
# `n_correct`, `n_total`, `class_preds`, `class_labels`, `writer`, `model` and
# `test_dataset` are defined by code not visible here. The first four
# statements presumably ran inside the per-batch evaluation loop — confirm
# against the original file.
n_correct += (predictions == labels).sum().item()
# Per-sample class probabilities for this batch (softmax over the class dim).
# NOTE(review): dim=0 softmaxes each single `output` vector; assumes `outputs`
# iterates per-sample logits — confirm.
class_prob_batch = [F.softmax(output, dim=0) for output in outputs]
class_preds.append(class_prob_batch)  # 10 different class probability
class_labels.append(predictions)  # single class prediction
# Flatten the per-batch lists into (num_samples, num_classes) / (num_samples,)
class_preds = torch.cat([torch.stack(batch) for batch in class_preds])
class_labels = torch.cat(class_labels)
acc = 100.0 * n_correct / n_total
print(f'Accuracy for {n_total} images is {acc:0.4f}')
# tensorboard: one PR curve per class (10 classes, presumably MNIST-like)
for i in range(10):
    label_i = class_labels == i          # binary "is class i" ground truth
    preds_i = class_preds[:, i]          # predicted probability of class i
    writer.add_pr_curve(str(i), label_i, preds_i, global_step=0)
writer.close()
# evaluation in test set: build the submission file (ImageId is 1-based)
image_id = []
label = []
y_pred = model(test_dataset.x).detach()
_, predictions = torch.max(y_pred, 1)
for i, pred in enumerate(predictions):
    image_id.append(i + 1)
    pred = pred.item()
    label.append(pred)
submission_dict = {'ImageId': image_id, 'Label': label}
df = pd.DataFrame(submission_dict)
df.to_csv('submission_v1.csv', index=False)
def main():
    """Train an attention/classification network (ATClsFPNResnet) with an
    auxiliary attention-mask loss, logging samples, losses and a PR curve to
    TensorBoard and checkpointing into the configured output directory.

    Relies on project helpers not visible here: config, FocalLoss, ATMaskLoss,
    class2setWithATMask, regNets, test.
    """
    # Network selection (alternatives kept for quick switching).
    #net = 'ATClsResnet'
    net = 'ATClsFPNResnet'
    #net = 'resnet'
    #net = 'inceptionv3'
    configRoot = Path('configs')
    dataConfigFileName = Path('dataconfig.yaml')
    netConfigFileName = Path(f'{net}.yaml')
    # Base data config merged with the per-network config.
    cfg = config(str(configRoot / dataConfigFileName))
    cfg.mergeWith(str(configRoot / netConfigFileName))
    # data
    batchSize = cfg['train']['batch_size']
    shuffle = cfg['train']['shuffle']
    numWorkers = cfg['train']['num_worker']
    balance = cfg['train']['balance']
    # train
    netname = cfg['train']['netname']
    num_epochs = cfg['train']['epoch']
    device = cfg['train']['device']
    device = torch.device(device)
    outputDir = Path(cfg['train']['output_dir'])
    netoutdir = Path(netname)
    thisRunName = Path(cfg['train']['session_dir'])
    saveRoot = outputDir / netoutdir / thisRunName
    # Refuse to silently clobber a previous run: ask before deleting.
    if saveRoot.exists():
        val = input(f'remove {str(saveRoot)} and continue ? y or n')
        if val == 'y':
            shutil.rmtree(str(saveRoot))
        else:
            raise Exception("Stop for protect the existing data")
    tfLog = Path('tensorboard')
    writer = SummaryWriter(str(saveRoot / tfLog))
    save_step = cfg['train']['save_step']
    # loss (alternatives kept for experimentation)
    #Clscriterion = nn.CrossEntropyLoss()
    #Clscriterion = ClsWeightLoss()
    Clscriterion = FocalLoss()
    ATcriterion = ATMaskLoss()
    TrainDataSet = class2setWithATMask(cfg, isTrain=True)
    TestDataSet = class2setWithATMask(cfg, isTrain=False)
    print(f'TrainDataSet positive rate {TrainDataSet.prate}')
    print(f'TestDataSet positive rate {TestDataSet.prate}')
    if balance:
        # Re-balance a skewed binary dataset: weight each sample by the
        # frequency of the *other* class so both classes are drawn equally.
        prate = TrainDataSet.prate
        weights = []
        for _, l, _ in TrainDataSet:
            weights.append(1 - prate if l == 1 else prate)
        trainSampler = WeightedRandomSampler(weights, len(TrainDataSet), replacement=True)
        TrainDataloader = DataLoader(TrainDataSet, batch_size=batchSize,
                                     num_workers=numWorkers, sampler=trainSampler)
        print("using balance data")
    else:
        TrainDataloader = DataLoader(TrainDataSet, batch_size=batchSize,
                                     shuffle=shuffle, num_workers=numWorkers)
    TestDataloader = DataLoader(TestDataSet, batch_size=batchSize,
                                shuffle=False, num_workers=numWorkers)
    model = regNets[net](config=cfg)
    optimizer = optim.SGD(model.parameters(), lr=1e-4, momentum=0.5, weight_decay=1e-3)
    # Log one sample batch (images + masks) and the model graph.
    dataiter = iter(TrainDataloader)
    # NOTE(review): `.next()` is the removed pre-1.x iterator API; modern
    # PyTorch requires `next(dataiter)`.
    images, labels, masks = dataiter.next()
    imgGrid = torchvision.utils.make_grid(images)
    # Masks are single-channel; repeat to 3 channels so make_grid renders them.
    MaskGrid = torchvision.utils.make_grid(masks.unsqueeze(1).repeat(1, 3, 1, 1))
    writer.add_image('sample images', imgGrid)
    writer.add_image('sample image mask', MaskGrid)
    writer.add_graph(model, images)
    model.to(device)
    step = 1
    pred_at = []   # NOTE(review): never used below
    label_at = []  # NOTE(review): never used below
    for e in range(num_epochs):
        # Running sums for the 5-batch console/TensorBoard averages.
        running_loss_cls = 0.0
        running_loss_at = 0.0
        running_loss = 0.0
        running_at_ploss = 0.0
        running_at_nloss = 0.0
        running_cls_label = []
        running_cls_prob = []
        pos = 0
        neg = 0
        model.train()
        for i, data in enumerate(TrainDataloader):
            step += 1
            inputs, labels, atmask = data[0].to(device), data[1].to(device), data[2].to(device)
            optimizer.zero_grad()
            # Model returns (classification logits, attention/RPN map).
            clsout, rpn = model(inputs)
            running_cls_label.append(labels.cpu())
            running_cls_prob.append(clsout.cpu())
            clsLoss = Clscriterion(clsout, labels)
            # Attention loss is split into positive-/negative-region terms.
            ploss, nloss = ATcriterion(rpn, atmask)
            loss = ploss + nloss + clsLoss
            running_loss_at += ploss + nloss
            running_at_ploss += ploss
            running_at_nloss += nloss
            running_loss_cls += clsLoss
            running_loss += loss
            # Count positive / negative instances seen (labels assumed 0/1).
            pos += torch.sum(labels)
            neg += torch.sum(torch.ones_like(labels) - labels)
            loss.backward()
            optimizer.step()
            if step % save_step == 0:
                ckp = Path(f'checkpoint_{e+1}_{i+1}_{step+1}.pth')
                torch.save(model.state_dict(), str(saveRoot / ckp))
            if step % 600 == 599:
                # Periodic evaluation on the test loader.
                model.eval()
                test(model, TestDataloader, device, writer, step, Clscriterion)
                model.train()
            if i % 5 == 4:
                # Average the accumulated losses over the last 5 batches.
                avgLoss = running_loss / 5
                avgClsLoss = running_loss_cls / 5
                avgAtLoss = running_loss_at / 5
                avgAtPLoss = running_at_ploss / 5
                avgAtNLoss = running_at_nloss / 5
                totalStep = e * len(TrainDataloader) + i  # NOTE(review): unused
                # Visualize predicted vs. target attention maps (first 2 samples).
                rpn = rpn.cpu()[:2]
                rpn = torch.argmax(rpn, dim=1)
                rpn = rpn.unsqueeze(1).repeat(1, 3, 1, 1)
                writer.add_images('train rpn pred', rpn)
                atmask = atmask.cpu()[:2]
                atmask = atmask.unsqueeze(1).repeat(1, 3, 1, 1)
                writer.add_images('train rpn target', atmask)
                print(
                    "epoch:{:2d}, step:{:4d} TotalStep:{:4d} loss:{:.3f} ClsLoss:{:.3f} AtLoss:{:.3f} posIns:{} negIns:{}"
                    .format(e + 1, i + 1, step, avgLoss, avgClsLoss, avgAtLoss, pos, neg))
                writer.add_scalar('training loss', avgLoss, step)
                writer.add_scalar('training cls loss', avgClsLoss, step)
                writer.add_scalar('training At loss', avgAtLoss, step)
                writer.add_scalar('training At p loss', avgAtPLoss, step)
                writer.add_scalar('training At n loss', avgAtNLoss, step)
                running_loss = 0
                running_loss_cls = 0
                running_loss_at = 0
                running_at_ploss = 0
                running_at_nloss = 0
                pos = 0
                neg = 0
                # PR curve over the predictions accumulated since the last flush.
                # NOTE(review): original indentation was lost in this file copy;
                # this suite placement (inside the 5-batch block) is reconstructed
                # — confirm against the original source.
                cls_prob = torch.cat(running_cls_prob, dim=0)
                cls_prob = F.softmax(cls_prob, dim=1)[:, 1]
                writer.add_pr_curve('Training Crack PR', torch.cat(running_cls_label),
                                    cls_prob, global_step=step)
                running_cls_label.clear()
                running_cls_prob.clear()
    torch.save(model.state_dict(), str(saveRoot / Path('model_final.pth')))
class util_classification_tensorboard():
    '''
    TensorBoard logging helper dedicated to classification runs.
    (Docstrings translated from Korean.)
    '''

    def __init__(self, kernel_type, classes):
        '''
        Constructor.
        classes = list [name1, name2, ... , name_n]
        e.g. ['normal', 'stone']
        '''
        self.writer = SummaryWriter(f'./runs/{kernel_type}')
        self.classes = classes

    def __del__(self):
        ''' Destructor: close the underlying writer. '''
        self.writer.close()

    def write_batchsamples(self, batchsample_name, images, labels):
        ''' Log a grid of sample images from one batch. '''
        # Build an image grid from the batch.
        img_grid = torchvision.utils.make_grid(images)
        self.__matplotlib_imshow(img_grid, one_channel=True)
        self.writer.add_image(batchsample_name, img_grid)

    def write_net_graph(self, net):
        ''' Log the neural-net model graph.

        NOTE(review): add_graph is called without an input_to_model sample;
        confirm the callers rely on this.
        '''
        self.writer.add_graph(net)

    def write_train_epoch(self, loss, inputs, labels, preds, probs, epoch):
        ''' Log per-epoch training images/loss.

        Intended to be called from inside train_epoch().
        '''
        # Record the running loss.
        self.writer.add_scalar('train/Loss', loss, epoch)
        # NOTE(review): the two scalars below log `loss` under Accuracy/AUC —
        # almost certainly a copy-paste bug, but real accuracy/AUC values are
        # not available in this signature, so original behavior is preserved.
        self.writer.add_scalar('train/Accuracy', loss, epoch)
        self.writer.add_scalar('train/AUC', loss, epoch)
        # Record a figure of predictions vs. ground truth for the mini-batch.
        self.writer.add_figure('train/predict vs. GT',
                               self.__plot_classes_preds(inputs, labels, preds, probs),
                               global_step=epoch)
        # Draw the PR curve for the positive class.
        target_index = 1
        # Fix: the original passed `self` explicitly on a bound call, which
        # supplied `global_step` twice and raised TypeError.
        self.write_pr_curve_tensorboard(target_index, preds, probs, global_step=epoch)

    def write_pr_curve_tensorboard(self, target_index, test_preds, test_probs, global_step=0):
        ''' Draw the precision/recall curve for the class at target_index. '''
        tensorboard_preds = test_preds == target_index
        tensorboard_probs = test_probs[:, target_index]
        self.writer.add_pr_curve(self.classes[target_index],
                                 tensorboard_preds,
                                 tensorboard_probs,
                                 global_step=global_step)

    def __plot_classes_preds(self, images, labels, preds, probs):
        ''' Build a matplotlib Figure from a batch of images and labels,
        showing the network's prediction and probability next to the ground
        truth (GT); titles are green when the prediction is correct, red
        otherwise. '''
        fig = plt.figure(figsize=(12, 48))
        for idx in np.arange(4):
            ax = fig.add_subplot(1, 4, idx + 1, xticks=[], yticks=[])
            self.__matplotlib_imshow(images[idx], one_channel=True)
            ax.set_title(
                "{0}, {1:.1f}%\n(label: {2})".format(
                    self.classes[preds[idx]],
                    probs[idx] * 100.0,
                    self.classes[labels[idx]]),
                color=("green" if preds[idx] == labels[idx].item() else "red"))
        return fig

    # Fix: the original definition lacked `self`, so every bound call
    # (self.__matplotlib_imshow(...)) raised TypeError.
    def __matplotlib_imshow(self, img, one_channel=False):
        ''' Helper to render an image tensor with matplotlib. '''
        if one_channel:
            img = img.mean(dim=0)
        img = img / 2 + 0.5  # unnormalize (assumes inputs normalized to [-1, 1])
        npimg = img.numpy()
        if one_channel:
            plt.imshow(npimg, cmap="Greys")
        else:
            plt.imshow(np.transpose(npimg, (1, 2, 0)))
writer.add_image('my_image', img, 0) # If you have non-default dimension setting, set the dataformats argument. writer.add_image('my_image_HWC', img_HWC, 0, dataformats='HWC') img_batch = np.zeros((16, 3, 100, 100)) for i in range(16): img_batch[i, 0] = np.arange(0, 10000).reshape(100, 100) / 10000 / 16 * i img_batch[i, 1] = (1 - np.arange(0, 10000).reshape(100, 100) / 10000) / 16 * i writer.add_images('my_image_batch', img_batch, 0) labels = np.random.randint(2, size=100) # binary label predictions = np.random.rand(100) writer.add_pr_curve('pr_curve', labels, predictions, 0) vertices_tensor = torch.as_tensor([ [1, 1, 1], [-1, -1, 1], [1, -1, -1], [-1, 1, -1], ], dtype=torch.float).unsqueeze(0) colors_tensor = torch.as_tensor([ [255, 0, 0], [0, 255, 0], [0, 0, 255], [255, 0, 255], ], dtype=torch.int).unsqueeze(0) faces_tensor = torch.as_tensor([ [0, 2, 3],
def main(args):
    """Train a floating-sea-object segmentation model.

    Loads train/val datasets, optionally resumes from a snapshot, trains for
    ``args.epochs`` epochs, logs losses/figures/PR-curves to TensorBoard and
    saves a snapshot whenever val loss or kappa improves.

    Relies on project helpers not visible here: FloatingSeaObjectDataset,
    get_transform, get_model, resume, snapshot, training_epoch,
    validating_epoch, predict_images, get_scores.
    """
    data_path = args.data_path
    snapshot_path = args.snapshot_path
    batch_size = args.batch_size
    workers = args.workers
    image_size = args.image_size
    device = args.device
    n_epochs = args.epochs
    learning_rate = args.learning_rate
    tensorboard_logdir = args.tensorboard_logdir
    dataset = FloatingSeaObjectDataset(
        data_path,
        fold="train",
        transform=get_transform("train",
                                intensity=args.augmentation_intensity,
                                add_fdi_ndvi=args.add_fdi_ndvi),
        output_size=image_size,
        seed=args.seed)
    valid_dataset = FloatingSeaObjectDataset(
        data_path,
        fold="val",
        transform=get_transform("test", add_fdi_ndvi=args.add_fdi_ndvi),
        output_size=image_size,
        seed=args.seed,
        hard_negative_mining=False)
    # store run arguments in the same folder
    # NOTE(review): vars(args) aliases args.__dict__, so the two assignments
    # below also mutate `args`.
    run_arguments = vars(args)
    run_arguments["train_regions"] = ", ".join(dataset.regions)
    run_arguments["valid_dataset"] = ", ".join(valid_dataset.regions)
    os.makedirs(os.path.dirname(args.snapshot_path), exist_ok=True)
    with open(
            os.path.join(os.path.dirname(args.snapshot_path),
                         f"run_arguments_{args.seed}.json"), 'w') as outfile:
        json.dump(run_arguments, outfile)
    print(run_arguments)
    # loading training datasets
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=workers)
    # NOTE(review): the val loader is shuffled too — presumably so
    # predict_images sees varied batches each epoch; confirm intent.
    val_loader = DataLoader(valid_dataset, batch_size=batch_size,
                            num_workers=workers, shuffle=True)
    # compute the number of labels in each class
    # weights = compute_class_occurences(train_loader) #function that computes the occurences of the classes
    pos_weight = torch.FloatTensor([float(args.pos_weight)]).to(device)
    # reduction="none" keeps per-pixel losses so the mask can zero them out.
    bcecriterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction="none")

    def criterion(y_pred, target, mask=None):
        """a wrapper around BCEWithLogitsLoss that ignores no-data
        mask provides a boolean mask on valid data"""
        loss = bcecriterion(y_pred, target)
        if mask is not None:
            return (loss * mask.double()).mean()
        else:
            return loss.mean()

    # 12 Sentinel-2-style channels, +2 when FDI/NDVI indices are appended
    # (presumably — confirm against the dataset definition).
    inchannels = 12 if not args.add_fdi_ndvi else 14
    model = get_model(args.model, inchannels=inchannels,
                      pretrained=not args.no_pretrained).to(device)
    # initialize optimizer
    optimizer = Adam(model.parameters(), lr=learning_rate)
    if snapshot_path is not None and os.path.exists(snapshot_path):
        # Resume: restore model/optimizer state and replay the logged history.
        start_epoch, logs = resume(snapshot_path, model, optimizer)
        start_epoch += 1
        print(
            f"resuming from snapshot {snapshot_path}. starting epoch {start_epoch}"
        )
        for log in logs:
            print(
                f"epoch {log['epoch']}: trainloss {log['trainloss']:.4f}, valloss {log['valloss']:.4f}. (from {snapshot_path})"
            )
    else:
        start_epoch = 1
        logs = []
    # create summary writer if tensorboard_logdir is not None
    writer = SummaryWriter(
        log_dir=tensorboard_logdir) if tensorboard_logdir is not None else None
    for epoch in range(start_epoch, n_epochs + 1):
        trainloss = training_epoch(model, train_loader, optimizer, criterion, device)
        valloss, metrics = validating_epoch(model, val_loader, criterion, device)
        log = dict(
            epoch=epoch,
            trainloss=trainloss,
            valloss=valloss,
        )
        log.update(metrics)
        logs.append(log)
        if writer is not None:
            writer.add_scalars("loss", {
                "train": trainloss,
                "val": valloss
            }, global_step=epoch)
            fig = predict_images(val_loader, model, device)
            writer.add_figure("predictions", fig, global_step=epoch)
            predictions, targets = get_scores(val_loader, model, device)
            targets = targets.reshape(-1)
            targets = targets > 0.5  # make to bool
            predictions = predictions.reshape(-1)
            writer.add_pr_curve("unbalanced", targets, predictions, global_step=epoch)
            # make predictions and targets balanced by removing not floating pixels until numbers of positive
            # and negative samples are equal
            floating_predictions = predictions[targets]
            not_floating_predictions = predictions[~targets]
            np.random.shuffle(not_floating_predictions)
            not_floating_predictions = not_floating_predictions[:len(
                floating_predictions)]
            predictions = np.hstack(
                [floating_predictions, not_floating_predictions])
            targets = np.hstack([
                np.ones_like(floating_predictions),
                np.zeros_like(not_floating_predictions)
            ])
            writer.add_pr_curve("balanced", targets, predictions, global_step=epoch)
        # retrieve best loss by iterating through previous logged losses
        # (the current epoch is already in `logs`, so <=/>= below still
        # triggers a save when this epoch IS the new best).
        best_loss = min([l["valloss"] for l in logs])
        best_kappa = max([l["kappa"] for l in logs])
        kappa = metrics["kappa"]
        save_msg = ""  # write save model message in the same line of the print
        if valloss <= best_loss or kappa >= best_kappa:
            save_msg = f"saving model to {snapshot_path}"  # add this message if model saved
            snapshot(snapshot_path, model, optimizer, epoch, logs)
        metrics_message = ", ".join(
            [f"{k} {v:.2f}" for k, v in metrics.items()])
        print(
            f"epoch {epoch}: trainloss {trainloss:.4f}, valloss {valloss:.4f}, {metrics_message} ,{save_msg}"
        )
writer.add_scalars('avg/total loss', { 'train': total_train_loss, 'val': total_val_loss }, epoch) writer.add_scalars('avg/auc', { 'train': train_avg_auc, 'val': val_avg_auc }, epoch) for class_name, auc1, auc2 in zip(class_names, train_auc, val_auc): writer.add_scalars('AUC/{}'.format(class_name), { 'train': auc1, 'val': auc2 }, epoch) for i in range(len(class_names)): writer.add_pr_curve('PR curve train/{}'.format(class_names[i]), train_data_pr[1][:, i], train_data_pr[0][:, i], global_step=epoch) writer.add_pr_curve('PR curve validation/{}'.format(class_names[i]), val_data_pr[1][:, i], val_data_pr[0][:, i], global_step=epoch) writer.flush() print( 'EPOCH %d:\tTRAIN [duration %.3f sec, loss: %.3f, avg auc: %.3f]\t\t' 'VAL [duration %.3f sec, loss: %.3f, avg auc: %.3f]\tCurrent time %s' % (epoch + 1, train_duration, total_train_loss, train_avg_auc, val_duration, total_val_loss, val_avg_auc, str(datetime.now(timezone('Europe/Moscow'))))) torch.save(
def train(self):
    """Full training loop of the Trainer.

    Optionally resumes from a checkpoint, then for each epoch iterates over
    the configured phases (train/validation/test loaders), aggregates per-batch
    results, logs metrics/figures/PR-curves to TensorBoard, and periodically
    saves the model + optimizer state.

    Relies on Trainer helpers not visible here: get_epoch_root,
    get_epoch_loaders, get_target_type_index, init_results_phase,
    get_results_batch, update_results_phase, add_target_names,
    plot_confusion_matrix, show_imgs, and the project `io` module.
    """
    # Load saved model if resume option selected
    if self.resume:
        print(Trainer.time_str() + ' Resuming training ... ')
        checkpoint = torch.load(os.path.join(self.log_root,
                                             self.get_epoch_root(self.resume_epoch),
                                             'torch_model_optim.pth'))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print(Trainer.time_str() + ' Starting training ... ')
    writer = SummaryWriter(self.log_root)
    self.model = self.model.to(self.device)
    # Epoch/iteration counters are persisted as model parameters so a resumed
    # run continues the numbering.
    epoch = int(self.model.epoch) + 1
    batch_counter = int(self.model.iteration)
    # Epoch loop
    for epoch in range(epoch, epoch + self.num_epoch):
        # Create logging directory
        epoch_root = self.get_epoch_root(epoch)
        if not os.path.exists(os.path.join(self.log_root, epoch_root)):
            os.makedirs(os.path.join(self.log_root, epoch_root))
        # Select data loaders for the current epoch
        cur_epoch_loaders = self.get_epoch_loaders(epoch)
        # Dictionary (of dictionaries) to collect four metrics from different phases for tensorboard
        epoch_metric_names = ['epoch_loss', 'epoch_accuracy', 'epoch_precision', 'epoch_recall']
        epoch_metric_dict = {metric_name: dict.fromkeys(cur_epoch_loaders.keys())
                             for metric_name in epoch_metric_names}
        # Loop over phases within one epoch [train, validation, test]
        for phase in cur_epoch_loaders.keys():
            # Select training state of the NN model
            if phase == 'train':
                self.model.train(True)
            else:
                self.model.train(False)
            # Select Loader
            cur_loader = cur_epoch_loaders[phase]
            # Number of samples per target class, counted as No/Yes rows.
            columns = self.target_names.columns
            sample_count_df = pd.DataFrame(np.zeros([2, len(columns)], dtype=np.int64),
                                           columns=columns, index=('No', 'Yes'))
            num_samples = len(cur_loader.batch_sampler.sampler)
            total_sample_counter = 0
            num_target_class = self.model.classifier.num_output
            # initializing variables for keeping track of results for tensorboard reporting
            results_phase = self.init_results_phase(num_samples=num_samples,
                                                    num_target_class=num_target_class)
            for i, data in enumerate(cur_loader):
                batch_counter += 1
                # Copy input and targets to the device object
                inputs = data['input'].to(self.device)
                type_indices = self.get_target_type_index()
                targets = data['target'][:, type_indices].float().squeeze().to(self.device)
                # Zero the parameter gradients
                self.optimizer.zero_grad()
                # Forward pass
                outputs = self.model(inputs).squeeze()
                loss = self.criterion(outputs, targets)
                # Backward + Optimize(in training)
                if phase == 'train':
                    # criterion is per-element (reduction='none', presumably),
                    # hence the .mean() before backward — confirm.
                    loss.mean().backward()
                    self.optimizer.step()
                # Record results of the operation for the reporting
                results_batch = self.get_results_batch(results_phase.keys(), data, loss, outputs)
                # Aggregate results into a phase array for complete epoch reporting
                cur_batch_size = inputs.shape[0]
                nominal_batch_size = cur_loader.batch_size
                results_phase, batch_idx_range = self.update_results_phase(
                    results_batch=results_batch,
                    results_phase=results_phase,
                    nominal_batch_size=nominal_batch_size,
                    cur_batch_size=cur_batch_size,
                    batch_idx=i)
                # Gather number of each class in mini batch
                total_sample_counter += cur_batch_size
                non_zero_count = np.count_nonzero(results_batch['target'], axis=0)
                cur_sample_count = np.vstack((cur_batch_size-non_zero_count, non_zero_count))
                assert (cur_sample_count.sum(axis=0) == cur_batch_size).all(), 'Sum to batch size check failed'
                sample_count_df = sample_count_df + cur_sample_count
                # logging for the running loss and accuracy for each target class
                if i % self.log_int == 0:
                    running_loss_log = results_phase['loss'][:batch_idx_range[1]].mean(axis=0)
                    running_accuracy = results_phase['correct'][:batch_idx_range[1]].mean(axis=0)
                    accuracy_dict = self.add_target_names(running_accuracy.round(3))
                    running_loss_dict = self.add_target_names(running_loss_log.round(3))
                    print(Trainer.time_str() + ' Phase: ' + phase +
                          f', epoch: {epoch}, batch: {i}, running loss: {running_loss_dict}, running accuracy: {accuracy_dict}')
                    writer.add_scalars(f'running_loss/{phase}', running_loss_dict, batch_counter)
                    writer.add_scalars(f'running_accuracy/{phase}', accuracy_dict, batch_counter)
            # Number of samples in epoch checked two ways
            assert total_sample_counter == num_samples
            # Make sure no -1s left in the phase results (excluding input which throws errors)
            # (-1 is presumably the init_results_phase fill value — confirm.)
            for key in ['loss', 'output_prob', 'prediction', 'target', 'correct']:
                assert not (results_phase[key] == -1).any()
            # Fraction for each class of target
            class_fraction_df = sample_count_df / num_samples
            assert np.isclose(class_fraction_df.sum(), 1.0).all(), 'All fraction sum to 1.0 failed'
            # the index for positive examples in each class
            with_index = 'Yes'
            fraction_positive_dict = class_fraction_df.loc[with_index].to_dict()
            writer.add_scalars(f'Fraction_with_target/{phase}', fraction_positive_dict, epoch)
            # calculate epoch loss and accuracy average over batch samples
            # Epoch error measures
            epoch_loss_log = results_phase['loss'].mean(axis=0)
            epoch_loss_dict = self.add_target_names(epoch_loss_log.round(3))
            epoch_accuracy_log = results_phase['correct'].mean(axis=0)
            epoch_acc_dict = self.add_target_names(epoch_accuracy_log.round(3))
            print(Trainer.time_str() + ' Phase: ' + phase +
                  f', epoch: {epoch}: epoch loss: {epoch_loss_dict}, epoch accuracy: {epoch_acc_dict}')
            # Pickle important results dict elements: loss, output_prob and dataset_indices
            dict_to_save = {key: results_phase[key] for key in ['loss', 'output_prob', 'dataset_indices']}
            io.save_dict(dict_to_save, os.path.join(self.log_root, epoch_root, 'results_saved.pkl'))
            # Precision, recall, accuracy and loss
            precision, recall, _, num_pos = sk_metrics.precision_recall_fscore_support(
                results_phase['target'].squeeze(),
                results_phase['prediction'].squeeze(),
                zero_division=0)
            # The metrics function returns the result for both positive and negative labels when operated with
            # a single target type. When the task is a multilabel decision it only returns the positive label results
            if num_target_class == 1:
                precision = [precision[1]]
                recall = [recall[1]]
                num_pos = num_pos[1]
            assert (np.asarray(sample_count_df.loc['Yes']) == num_pos).all(), 'Number of positive samples matching failed'
            cur_metrics = [epoch_loss_dict, epoch_acc_dict,
                           self.add_target_names(precision), self.add_target_names(recall)]
            for i, metric_name in enumerate(epoch_metric_names):
                epoch_metric_dict[metric_name][phase] = cur_metrics[i]
            # Confusion matrix Figure
            if num_target_class == 1:
                confusion_matrix = sk_metrics.confusion_matrix(results_phase['target'].squeeze(),
                                                               results_phase['prediction'].squeeze())
            elif num_target_class > 1:
                confusion_matrix = sk_metrics.multilabel_confusion_matrix(results_phase['target'],
                                                                          results_phase['prediction'])
            else:
                raise Exception('number of target classes is negative')
            fig_confusion_norm = self.plot_confusion_matrix(confusion_matrix)
            figname_confusion = 'Confusion_matrix'
            fig_confusion_norm.savefig(os.path.join(self.log_root, epoch_root,
                                                    figname_confusion + phase + '.png'), dpi=300)
            writer.add_figure(f'{figname_confusion}/{phase}', fig_confusion_norm, epoch)
            # Images with highest loss in each target type (Myelin and artefact currently)
            fig = self.show_imgs(results_phase=results_phase)
            figname_examples = 'Examples_with_highest_loss'
            fig.savefig(os.path.join(self.log_root, epoch_root,
                                     figname_examples + '_' + phase + '.png'), dpi=300)
            writer.add_figure(f'{figname_examples}/{phase}', fig, epoch)
            # Precision/Recall curves
            for i, t_type in enumerate(self.target_names):
                writer.add_pr_curve(f'{t_type}/{phase}',
                                    labels=results_phase.get('target')[:, i],
                                    predictions=results_phase.get('output_prob')[:, i],
                                    global_step=epoch,
                                    num_thresholds=100)
            # save model
            # NOTE(review): `&` is bitwise and relies on operand truthiness /
            # precedence quirks; presumably `and` was intended — confirm.
            if self.save & (phase == 'train') & (epoch % self.save_int == 0):
                print(Trainer.time_str() + ' Writing model graph ... ')
                # writer.add_graph(self.model, inputs)
                print(Trainer.time_str() + ' Saving model state... ')
                # Persist the counters inside the model so resume picks them up.
                self.model.epoch = torch.nn.Parameter(torch.tensor(epoch), requires_grad=False)
                self.model.iteration = torch.nn.Parameter(torch.tensor(batch_counter), requires_grad=False)
                torch.save({
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict()
                }, os.path.join(self.log_root, epoch_root, 'torch_model_optim.pth'))
        # write the epoch related metrics to the tensorboard
        for metric_name in epoch_metric_names:
            cur_metric = epoch_metric_dict[metric_name]
            for ph in cur_metric:
                cur_metric_phase = {f'{ph}_{t_type}': val for t_type, val in cur_metric[ph].items()}
                writer.add_scalars(metric_name, cur_metric_phase, epoch)
    print(Trainer.time_str() + ' Finished training ... ')
    writer.close()
    print(Trainer.time_str() + ' Closed writer ... ')
def train_net(model, device, epochs=10, batch_size=8, lr=0.1, save_cp=True, optim='adam'):
    """Train a COVID chest-X-ray classifier with train/val phases.

    Logs losses/metrics/PR-curves to TensorBoard, tracks the best validation
    weights (by loss AND accuracy AND F1 simultaneously) and saves periodic
    checkpoints plus the best weights at the end.

    Returns six lists: train loss/acc/metrics and val loss/acc/metrics per
    epoch (note: train_epoch_metrics and val_epoch_metrics are never appended
    to in this implementation).

    Relies on module-level names not visible here: correct_labels, seed_val,
    dir_data_root, dir_checkpoint, utils, CovidDataset.
    """
    # First we read 'labels.csv' and construct input dataframe
    inp_df = pd.read_csv(correct_labels)
    # Now we get the stratified DataFrames for input to our Dataset Objects
    train_df, val_df, _ = utils.get_stratified_train_val_test_sets(
        inp_df=inp_df, seed=seed_val)
    resize = (128, 128)
    # Now, let's build dataset objects
    # Building up the Dataset objects
    train_set = CovidDataset(
        root=dir_data_root,
        inp_df=train_df,
        transformations=utils.get_transformations(for_train=True))
    val_set = CovidDataset(
        root=dir_data_root,
        inp_df=val_df,
        transformations=utils.get_transformations(for_train=False))
    # Creating DataLoaders for each set
    train_set_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_set_dl = DataLoader(val_set, batch_size=4, shuffle=True)
    # dictionary of dataloaders
    dataloaders = {
        'train': train_set_dl,
        'val': val_set_dl,
    }
    # Deciding the optimizer
    if optim == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=1e-8)
    elif optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.95,
                                    weight_decay=0.001, nesterov=True)
    # Setting up loss based
    criterion = nn.CrossEntropyLoss()
    params = {'optimizer': optimizer, 'criterion': criterion}
    # Let's create a tensorboard object
    writer = SummaryWriter(
        log_dir='runs/train_proposed_model_run1',
        comment=
        f'OPTIM_{optim}_LOSS_{criterion}_LR_{lr}_BATCH_SIZE_{batch_size}_IMG_SIZE_{resize}'
    )
    # Let's create a random input which may be passed to network and its depiction be displayed in tensorboard graph viz
    ran_inp = torch.randn((2, 3, 128, 128), device=device)
    writer.add_graph(model=model, input_to_model=ran_inp)
    optim_string = optimizer.__str__().replace("\n", ' ')
    text = f'''
    Input Type: Chest XRays - RGB Images
    Output Type; 0/1 - Classification
    Batch Norm: True
    Activation: ReLU
    Epochs: {epochs}
    Optimizer: {optim_string}
    Learning Rate: {lr}
    Train Batch Size: {batch_size}
    Val Batch Size: 4
    Loss Criterion: {criterion.__repr__()}
    Weight Init: Default
    '''
    writer.add_text('Configurations', text, 1)
    writer.flush()
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Optimizer:       {optim}
        Learning rate:   {lr}
        Training size:   {train_set.__len__()}
        Validation size: {val_set.__len__()}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Image Size:      {resize}
    ''')
    # store train/val loss history
    train_epoch_loss = []
    train_epoch_acc = []
    train_epoch_metrics = []
    val_epoch_loss = []
    val_epoch_acc = []
    val_epoch_metrics = []
    # To create reference for making decision for best results
    prev_val_loss = np.Infinity  # Any arbitrary Number would do fine
    prev_val_acc = 0.0
    prev_val_f1 = 0.0
    # Initialization to save best weights and model
    best_model_wts = copy.deepcopy(model.state_dict())
    for epoch in range(epochs):
        start_time = time()
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                set_len = len(train_set)
                desc = f'Epoch {epoch + 1}/{epochs}'
                leave = True
            else:
                model.eval()  # Set model to evaluate mode i.e. freeze weight updates
                set_len = len(val_set)
                desc = 'Validation Phase'
                leave = True
            total = 0
            running_loss = 0
            correct_preds = 0
            original_labels = []
            predicted_labels = []
            with tqdm(total=set_len, desc=desc, unit='img', leave=leave) as pbar:
                for batch in dataloaders[phase]:  # Get Each Batch According to Phase
                    _, images, labels = batch  # Let's get images and labels for each batch
                    images = images.to(device)
                    labels = labels.to(device)
                    # We want to zero out the gradients every time as by default pytorch accumulates gradients
                    optimizer.zero_grad()
                    # We want to calculate and update gradients only in 'train phase'
                    if phase == 'val':
                        with torch.no_grad():
                            outputs = model(images)
                            # NOTE(review): .cuda() on the loss forces a GPU
                            # and would fail on CPU-only runs — confirm intent.
                            loss = params['criterion'](outputs, labels).cuda()
                            pbar.set_postfix(
                                **{'Val CE loss (running)': loss.item()})
                            total += images.size(0)
                            _, predicted = torch.max(outputs.data, 1)
                            correct_preds += (predicted == labels).sum().item()
                            original_labels += labels.data.cpu().numpy().tolist()
                            predicted_labels += predicted.data.cpu().numpy().tolist()
                            pbar.update(images.shape[0])
                    else:
                        # forward pass
                        outputs = model(images)
                        loss = params['criterion'](outputs, labels).cuda()
                        pbar.set_postfix(
                            **{'train CE Loss (running)': loss.item()})
                        loss.backward()  # Calculate Gradients
                        params['optimizer'].step()  # Update Weights
                        total += images.size(0)
                        _, predicted = torch.max(outputs.data, 1)
                        correct_preds += (predicted == labels).sum().item()
                        original_labels += labels.data.cpu().numpy().tolist()
                        predicted_labels += predicted.data.cpu().numpy().tolist()
                        pbar.update(images.shape[0])
                    running_loss += loss.item() * images.size(0)
            # Per-epoch aggregates for this phase.
            epoch_loss = running_loss / total
            epoch_acc = (correct_preds / total) * 100
            epoch_metrics = utils.evaluate_metrics(original_labels, predicted_labels)
            if phase == 'train':
                train_epoch_loss.append(epoch_loss)
                train_epoch_acc.append(epoch_acc)
                # logging.info(f'''Train:
                #     CE: {epoch_loss}
                # ''')
                writer.add_scalar('Loss/Train/Cross Entropy Loss', epoch_loss, (epoch + 1))
                writer.add_scalar('Metrics/Train/Accuracy', epoch_acc, (epoch + 1))
                # NOTE(review): PR curve is fed hard 0/1 predictions rather
                # than probabilities, so it degenerates to a single point.
                writer.add_pr_curve('Metrics/Train/PR-Curve',
                                    np.asarray(original_labels),
                                    np.asarray(predicted_labels), (epoch + 1))
                writer.add_scalar('Metrics/Train/F1-Score', epoch_metrics['f1_score'], (epoch + 1))
                writer.add_scalar('Metrics/Train/Precision', epoch_metrics['precision'], (epoch + 1))
                writer.add_scalar('Metrics/Train/Recall', epoch_metrics['recall'], (epoch + 1))
                writer.add_scalar('Metrics/Train/Specificity', epoch_metrics['specificity'], (epoch + 1))
                writer.add_scalar('Metrics/Train/Sensitivity', epoch_metrics['sensitivity'], (epoch + 1))
                writer.flush()
            elif phase == 'val':
                writer.add_scalar('Loss/Validation/Cross Entropy Loss', epoch_loss, (epoch + 1))
                writer.add_scalar('Metrics/Validation/Accuracy', epoch_acc, (epoch + 1))
                writer.add_pr_curve('Metrics/Validation/PR-Curve',
                                    np.asarray(original_labels),
                                    np.asarray(predicted_labels), (epoch + 1))
                writer.add_scalar('Metrics/Validation/F1-Score', epoch_metrics['f1_score'], (epoch + 1))
                writer.add_scalar('Metrics/Validation/Precision', epoch_metrics['precision'], (epoch + 1))
                writer.add_scalar('Metrics/Validation/Recall', epoch_metrics['recall'], (epoch + 1))
                writer.add_scalar('Metrics/Validation/Specificity', epoch_metrics['specificity'], (epoch + 1))
                writer.add_scalar('Metrics/Validation/Sensitivity', epoch_metrics['sensitivity'], (epoch + 1))
                # logging.info(f'''Validation:
                #     CE: {epoch_loss}
                # ''')
                writer.flush()
                val_epoch_loss.append(epoch_loss)
                val_epoch_acc.append(epoch_acc)
                # Best-model selection requires loss, accuracy AND F1 to all
                # improve at once (strict by design, presumably — confirm).
                if round(epoch_loss, 5) < prev_val_loss and round(
                        epoch_acc, 5) > prev_val_acc and round(
                            epoch_metrics['f1_score'], 5) > prev_val_f1:
                    prev_val_loss = epoch_loss
                    prev_val_acc = epoch_acc
                    prev_val_f1 = epoch_metrics['f1_score']
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_res = f'''
                    Val Loss: {epoch_loss}
                    Accuracy: {epoch_acc}
                    Metrics: {epoch_metrics}
                    '''
                    writer.add_text('Best Results', best_res, (epoch + 1))
                    writer.flush()
        # Periodic checkpointing (every 5th epoch).
        # NOTE(review): original indentation was lost in this file copy; this
        # suite is reconstructed at epoch level — confirm against the original.
        if save_cp:
            try:
                if not os.path.exists(dir_checkpoint):
                    os.mkdir(dir_checkpoint)
                    logging.info('Created checkpoint directory')
            except OSError:
                pass
            if (epoch + 1) % 5 == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(dir_checkpoint, f'modelp6{epoch + 1}.pth'))
                logging.info(f'Checkpoint {epoch + 1} saved !')
        end_time = time()
        logging.info('Epoch took time: {}'.format(str(end_time - start_time)))
    writer.close()
    # Load n Save Best Model Weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(),
               os.path.join(dir_checkpoint, 'modelp6_best_weights' + '.pth'))
    return train_epoch_loss, train_epoch_acc, train_epoch_metrics, val_epoch_loss, val_epoch_acc, val_epoch_metrics
class MyTensorBoard():
    """Convenience wrapper around a TensorBoard ``SummaryWriter`` for a fixed
    network.

    Holds the network, the human-readable class labels and the writer, and
    exposes helpers that log images, graphs, scalars, prediction figures,
    embeddings and per-class PR curves.
    """

    def __init__(self, net, LabelStr, EventDir):
        # BUG FIX: the original assigned the undefined lowercase name
        # ``labelStr``, raising NameError on construction; use the parameter.
        self.labelStr = LabelStr
        self.writer = SummaryWriter(EventDir + '/')
        self.net = net

    def matplotlib_imshow(self, img, one_channel = True):
        """Render an image tensor with matplotlib (grayscale if one_channel)."""
        if one_channel:
            img = img.mean(dim = 0)
        # un-normalize; assumes pixel values were scaled to [-1, 1] -- TODO confirm
        img = img / 2 + 0.5
        npimg = img.numpy()
        if one_channel:
            plt.imshow(npimg, cmap = 'Greys')
        else:
            plt.imshow(np.transpose(npimg, (1, 2, 0)))

    def ImageVisualize(self, images, labels):
        """Log a grid of input images to TensorBoard."""
        img_grid = torchvision.utils.make_grid(images)
        self.matplotlib_imshow(img_grid, one_channel = True)
        self.writer.add_image('Images', img_grid)
        self.writer.close()

    # Add Net structure to Tensorboard
    def NetVisualize(self, sampleInput):
        self.writer.add_graph(self.net, sampleInput)
        self.writer.close()

    def images_to_probs(self, images):
        '''
        Generates predictions and corresponding probabilities from a trained
        network and a list of images
        '''
        # BUG FIX: the original called the bare module-level name ``net``;
        # use the network stored on the instance.
        output = self.net(images)
        _, preds_tensor = torch.max(output, 1)
        preds = np.squeeze(preds_tensor.numpy())
        # probability of the predicted class for each sample
        return preds, [F.softmax(el, dim = 0)[i].item() for i, el in zip(preds, output)]

    def plot_classes_preds(self, images, labels):
        '''
        Generates matplotlib Figure using a trained network, along with images
        and labels from a batch, that shows the network's top predictions along
        with its probability, alongside the actual label, coloring this
        information based on whether the predictions was correct or not.
        Uses the "images_to_probs" method.
        '''
        # BUG FIX: the original called images_to_probs / matplotlib_imshow
        # without ``self.``, which are instance methods -> NameError.
        preds, probs = self.images_to_probs(images)
        fig = plt.figure(figsize = (12, 48))
        # only the first four images of the batch are shown
        for idx in np.arange(4):
            ax = fig.add_subplot(1, 4, idx + 1, xticks = [], yticks = [])
            self.matplotlib_imshow(images[idx], one_channel = True)
            ax.set_title("{0}, {1:.1f}%\n(label: {2})".format(
                self.labelStr[preds[idx]],
                probs[idx] * 100.0,
                self.labelStr[labels[idx]]),
                color = ("green" if preds[idx] == labels[idx].item() else "red"))
        return fig

    def ScalarVisualize(self, graphTitle, loss, currentStep):
        '''
        Log scalar values to plots, e.g. loss, acc, during training
        '''
        self.writer.add_scalar(graphTitle, loss, currentStep)
        self.writer.close()

    def PredVisualize(self, images, labels, currentStep):
        '''
        Log matplotlib figures of model's predictions to specified mini-batch
        '''
        self.writer.add_figure('predictions vs. actuals',
                               self.plot_classes_preds(images, labels),
                               global_step = currentStep)
        self.writer.close()

    def ProjVisualize(self, data, labels, use_rand_instance = True, num_rand = 100):
        '''
        Add projection visualization to Tensorboard
        '''
        assert len(data) == len(labels)
        if use_rand_instance:
            # sample num_rand random instances for the embedding projector
            perm = torch.randperm(len(data))
            images, labels = data[perm][:num_rand], labels[perm][:num_rand]
        else:
            images, labels = data, labels
        class_labels = [self.labelStr[lab] for lab in labels]
        # flattening assumes 28x28 inputs (e.g. MNIST-like) -- TODO confirm
        features = images.view(-1, 28 * 28)
        self.writer.add_embedding(features,
                                  metadata = class_labels,
                                  label_img = images.unsqueeze(1))
        self.writer.close()

    def PRcurveVisualize(self, test_probs, test_preds):
        '''
        Plot the Precision - Recall curve in Tensorboard, per - class wise
        '''
        for class_index in range(len(self.labelStr)):
            tensorboard_preds = test_preds == class_index
            tensorboard_probs = test_probs[:, class_index]
            self.writer.add_pr_curve(self.labelStr[class_index],
                                     tensorboard_preds,
                                     tensorboard_probs,
                                     global_step = 0)
        self.writer.close()
def train(self):
    """Run the full training/validation loop for ``self.num_epoch`` epochs.

    Optionally resumes model/optimizer state from ``self.log_root``, iterates
    over the phases of ``self.data_loaders`` (a dict phase -> DataLoader, or a
    list of such dicts — one per epoch segment), logs running and per-epoch
    metrics to TensorBoard, writes confusion/prediction tables and diagnostic
    figures per epoch, and periodically checkpoints the model.

    NOTE(review): nesting of the checkpoint block and the epoch-metric
    write-out was reconstructed from semantics — confirm against the
    original layout.
    """
    if self.resume:
        print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Resuming training ... ')
        checkpoint = torch.load(os.path.join(self.log_root, 'torch_model'))
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Starting training ... ')
    writer = SummaryWriter(self.log_root)
    self.model = self.model.to(self.device)
    # epoch/iteration counters are persisted on the model so resuming continues them
    epoch = int(self.model.epoch) + 1
    it = int(self.model.iteration)
    sample_inds = dict()  # NOTE(review): appears unused in this method
    for epoch in range(epoch, epoch + self.num_epoch):
        if isinstance(self.data_loaders, list):
            # each element of the list is a data loader for an epoch
            loader_change_interval = self.num_epoch / len(self.data_loaders)
            division_index, _ = divmod(epoch, loader_change_interval)
            # Make sure that the index does not exceed the length of the data_loader list
            index = round(min(division_index, len(self.data_loaders)-1))
            cur_data_loaders = self.data_loaders[index]
        else:
            # same dataloaders for all epochs
            cur_data_loaders = self.data_loaders
        # Dictionary (of dictionaries) to collect four metrics from different phases for tensorboard
        epoch_metric_names = ['epoch_loss', 'epoch_accuracy', 'precision/PPV', 'recall/TPR']
        epoch_metric_dict = {metric_name: dict.fromkeys(cur_data_loaders.keys())
                             for metric_name in epoch_metric_names}
        epoch_root = 'epoch_{:02d}'.format(epoch)
        if not os.path.exists(os.path.join(self.log_root, epoch_root)):
            os.makedirs(os.path.join(self.log_root, epoch_root))
        for phase in cur_data_loaders.keys():
            # toggle train/eval behavior (dropout, batchnorm) by phase
            if phase == 'train':
                self.model.train(True)
            else:
                self.model.train(False)
            epoch_loss = 0
            running_loss = 0.0
            target_sum = 0
            predicted_sum = 0
            correct_sum = 0
            batch_idx_start = 0
            num_items = len(cur_data_loaders[phase].batch_sampler.sampler)
            # preallocate per-phase buffers, filled batch by batch below;
            # assumes 1x140x140 inputs -- TODO confirm
            inputs_phase = -np.ones((num_items, 1, 140, 140)).astype(float)
            outputs_phase = -np.ones((num_items, self.model.classifier.num_output)).astype(float)
            predictions_phase = -np.ones(num_items).astype(int)
            targets_phase = -np.ones(num_items).astype(int)
            correct_phase = -np.ones(num_items).astype(int)
            sample_ind_phase = []
            for i, data in enumerate(cur_data_loaders[phase]):
                it += 1
                # copy input and targets to the device object
                inputs = data['input'].to(self.device)
                targets = data['target'].to(self.device)
                sample_ind_batch = data['sample_idx']
                sample_ind_phase.extend(sample_ind_batch)
                # zero the parameter gradients
                self.optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self.model(inputs).squeeze()
                loss = self.criterion(outputs, targets)
                if phase == 'train':
                    loss.backward()
                    self.optimizer.step()
                inputs, outputs, targets = Trainer.copy2cpu(inputs, outputs, targets)
                # outputs are log-probabilities (exp before argmax) -- TODO confirm
                predicted_classes = np.argmax(np.exp(outputs.detach().numpy()), axis=1)
                predicted_sum += np.sum(predicted_classes)
                target_classes = targets.detach().numpy()
                target_sum += np.sum(target_classes)
                correct_classes = predicted_classes == target_classes
                correct_sum += np.sum(correct_classes)
                # advance the slice [batch_idx_start:batch_idx_end) into the
                # per-phase buffers; handles a smaller final batch correctly
                if i > 0:
                    batch_idx_start = batch_idx_end
                batch_idx_end = batch_idx_start + len(targets)
                inputs_phase[batch_idx_start:batch_idx_end, :, :, :] = inputs.detach().numpy()
                outputs_phase[batch_idx_start:batch_idx_end, :] = outputs.detach().numpy()
                predictions_phase[batch_idx_start:batch_idx_end] = predicted_classes
                targets_phase[batch_idx_start:batch_idx_end] = target_classes
                correct_phase[batch_idx_start:batch_idx_end] = correct_classes
                running_loss += loss.item()
                epoch_loss += loss.item()
                # Report fraction of clean data in mini batch
                clean_num = float((targets == 0).sum())
                debris_num = float((targets == 1).sum())
                fraction_clean = clean_num / (debris_num + clean_num)
                writer.add_scalars('Fraction_clean_samples', {phase: fraction_clean}, it)
                if i % self.log_int == 0:
                    # running averages are normalized by items seen so far
                    running_loss_log = float(running_loss) / batch_idx_end
                    running_accuracy_log = float(correct_sum) / batch_idx_end
                    print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ')' +
                          ' Phase: ' + phase +
                          ', epoch: {}, batch: {}, running loss: {:0.4f}, running accuracy: {:0.3f} '.
                          format(epoch, i, running_loss_log, running_accuracy_log))
                    writer.add_scalars('running_loss', {phase: running_loss_log}, it)
                    writer.add_scalars('running_accuracy', {phase: running_accuracy_log}, it)
            epoch_loss_log = float(epoch_loss) / num_items
            epoch_accuracy_log = float(correct_sum) / num_items
            print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ')' +
                  ' Phase: ' + phase +
                  ', epoch: {}: epoch loss: {:0.4f}, epoch accuracy: {:0.3f} '.
                  format(epoch, epoch_loss_log, epoch_accuracy_log))
            # project-local Metrics object: confusion/prediction tables + PPV/TPR
            metrics = Metrics(
                targets=targets_phase, outputs=outputs_phase,
                output_prob_fn=lambda x: np.exp(x[:, 1]),
                sample_ind=sample_ind_phase)
            metrics.confusion_table(
                path_out=os.path.join(self.log_root, epoch_root,
                                      'confusion_table_' + phase + '.csv'))
            metrics.prediction_table(
                path_out=os.path.join(self.log_root, epoch_root,
                                      'prediction_table_' + phase + '.csv'))
            # Set the current values of the epoch error metrics
            cur_metrics = [epoch_loss_log, epoch_accuracy_log,
                           metrics.metrics['PPV'], metrics.metrics['TPR']]
            for i, metric_name in enumerate(epoch_metric_names):
                epoch_metric_dict[metric_name][phase] = cur_metrics[i]
            # diagnostic figures: example images and classification matrix
            fig = Trainer.show_imgs(inputs=inputs_phase, outputs=outputs_phase,
                                    predictions=predictions_phase,
                                    targets=targets_phase,
                                    sample_ind=sample_ind_phase)
            figname = 'image_examples_'
            fig.savefig(os.path.join(self.log_root, epoch_root,
                                     figname + '_' + phase + '.png'))
            writer.add_figure(figname + phase, fig, epoch)
            fig = Trainer.show_classification_matrix(targets=targets_phase,
                                                     predictions=predictions_phase,
                                                     metrics=metrics.metrics)
            figname = 'targets_outputs_correct_'
            fig.savefig(os.path.join(self.log_root, epoch_root,
                                     figname + '_' + phase + '.png'))
            fig.savefig(os.path.join(self.log_root, epoch_root,
                                     figname + '_' + phase + '.eps'))
            writer.add_figure(figname + phase, fig, epoch)
            writer.add_pr_curve(
                'pr_curve_'+phase,
                labels=targets_phase,
                predictions=np.exp(outputs_phase[:, 1]),
                global_step=epoch, num_thresholds=50)
            # checkpoint only during the train phase, every save_int epochs
            if self.save & (phase == 'train') & (epoch % self.save_int == 0):
                print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Writing model graph ... ')
                # writer.add_graph(self.model, inputs)
                print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Saving model state... ')
                # store counters on the model so a resumed run continues from here
                self.model.epoch = torch.nn.Parameter(torch.tensor(epoch), requires_grad=False)
                self.model.iteration = torch.nn.Parameter(torch.tensor(it), requires_grad=False)
                torch.save({
                    'model_state_dict': self.model.state_dict(),
                }, os.path.join(self.log_root, epoch_root, 'model_state_dict'))
                torch.save({
                    'optimizer_state_dict': self.optimizer.state_dict()
                }, os.path.join(self.log_root, 'optimizer_state_dict'))
        # write the epoch related metrics to the tensorboard
        for metric_name in epoch_metric_names:
            writer.add_scalars(metric_name, epoch_metric_dict[metric_name], epoch)
    print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Finished training ... ')
    writer.close()
    print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Closed writer ... ')
def train(
    net,
    optimizer,
    lossfunc,
    train_dataloader,
    val_dataloader,
    batchsize=32,
    numepochs=1,
    device='cuda',
    log_basedir='trainlogs',
    log_subdir='run0',
    log_frequency=100,
    val_frequency=100,
):
    """
    train

    Train a sign language classifier network on the asl-alphabet.

    inputs:
        net - (SLClassifier) Sign language classifier network
        optimizer - (torch.optim optimizer) Optimizer object created using net's parameters
        lossfunc - loss function mapping (scores, labels) to a scalar loss
        train_dataloader - (ASLAlphabet) ASLAlphabet training dataloader
        val_dataloader - (ASLAlphabet) ASLAlphabet validation dataloader (note that this gets
            refreshed with shuffle=True when val_dataloader iterator hits stopping point)
        batchsize - (int) number of samples per batches
        numepochs - (int) number of epochs to train on
        device - (str) device to perform computations on
        log_basedir - (str) project logging folder that holds all logs
        log_subdir - (str) subdirectory of log_basedir specifying the storage folder for
            _this_ experiment (e.g. "run1")
        log_frequency - (int) logging frequency (in number of batches)
        val_frequency - (int or None) process validation batch every val_frequency batches;
            None or 0 disables validation entirely

    returns:
        net - the trained network, for subsequent usage
    """
    # Validation is active only for a positive integer frequency.
    # BUG FIX: the original compared/modded val_frequency directly, which raised
    # TypeError ("int > NoneType", "int % NoneType") when val_frequency=None
    # even though the setup code explicitly allowed None.
    do_validation = val_frequency is not None and val_frequency != 0
    if do_validation and log_frequency > val_frequency:
        raise Exception(
            "log_frequency must be less than or equal to val_frequency!")

    net.to(device)
    net.train()
    print('[ network pushed to device ]')

    logpath = os.path.join(log_basedir, log_subdir)
    trainwriter = SummaryWriter(log_dir=os.path.join(
        logpath, 'training'))  # start tensorboard writer

    # create state_dict log folder
    state_dict_path = os.path.join(logpath, 'state_dicts')
    os.mkdir(state_dict_path)

    # log the model graph once, using a dummy 200x200 RGB input
    trainwriter.add_graph(net, torch.rand(1, 3, 200, 200).to(device))

    print('[ starting training ]')
    print('----------------------------------------------------------------')
    t_start = time.time()  # record

    if do_validation:
        val_dataloader_it = iter(
            val_dataloader)  # use this to load validation batches when we want
        valwriter = SummaryWriter(log_dir=os.path.join(logpath, 'validation'))

    batches_processed = 0
    val_batches_processed = 0
    logstep = 0  # the "global_step" variable for tensorboard logging
    for epoch in range(numepochs):
        print('epoch =', epoch)
        for i, batch in enumerate(train_dataloader):
            # start device transfer timing
            transfer_start = time.time()
            # sample and move to device
            labels, samples = batch
            samples = samples.to(device)
            labels = labels.to(device)
            transfer_time = time.time() - transfer_start  # record cpu dataload time

            # gpu computations
            compute_start = time.time()
            scores = net(samples)
            probs = scores.softmax(dim=1)
            loss = lossfunc(scores, labels)  # reduced to scalar
            # TODO: add regularization

            # backprop + paramater update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            compute_time = time.time() - compute_start
            batches_processed += 1

            #
            # Tensorboard logging
            #
            if log_frequency != 0 and batches_processed % log_frequency == 0:
                # log time for log_frequency batches
                t_end = time.time()
                mtpb = (t_end - t_start) / log_frequency  # mean time per batch
                trainwriter.add_scalars(
                    'times', {
                        'mean_time_per_batch': mtpb,
                        'transfer_time': transfer_time,
                        'compute_time': compute_time
                    }, logstep)
                t_start = t_end

                # compute accuracy
                _, class_pred = probs.max(dim=1)
                acc = (class_pred == labels).sum() / float(
                    len(labels))  # accuracy
                trainwriter.add_scalars('accuracies', {'train': acc}, logstep)

                # record batch loss
                trainwriter.add_scalars('losses', {'loss': loss}, logstep)

                # TODO: get PR curves working
                one_hot = torch.nn.functional.one_hot(labels)
                # sometimes 29th class isn't represented, so one_hot results in <29 columns
                if one_hot.size(1) < 29:
                    one_hot = torch.cat([
                        one_hot.cpu(),
                        torch.zeros(one_hot.size(0), 29 - one_hot.size(1)).long()
                    ], dim=1)
                trainwriter.add_pr_curve('pr',
                                         labels=one_hot,
                                         predictions=probs,
                                         global_step=logstep)

                # gpu usage
                # TODO: optimize gpu usage
                # NOTE(review): torch.cuda.memory_cached is deprecated in newer
                # torch (renamed memory_reserved); kept for compatibility with
                # the torch version this file targets -- confirm before upgrading.
                trainwriter.add_scalars(
                    'gpu_usage', {
                        'mem_allocated': torch.cuda.memory_allocated('cuda'),
                        'mem_cached': torch.cuda.memory_cached('cuda')
                    }, logstep)

                print('logstep =', logstep)
                print('batches_processed =', batches_processed)
                print(
                    'epoch_progress =',
                    batchsize * batches_processed / len(train_dataloader.dataset))
                print('train_samples_processed =', batchsize * batches_processed)
                print('mean_time_per_batch =', mtpb)
                print(
                    '----------------------------------------------------------------'
                )

            #
            # Validation
            #
            if do_validation and batches_processed % val_frequency == 0:
                net.eval()  # set evaluation mode
                with torch.no_grad():
                    labels, samples = next(val_dataloader_it)
                    labels = labels.to(device)
                    samples = samples.to(device)
                    scores = net(samples)
                    probs = scores.softmax(dim=1)

                    # val losses
                    loss_val = lossfunc(scores, labels)
                    valwriter.add_scalars('losses', {'val': loss_val}, logstep)

                    # val accuracy
                    _, class_pred = probs.max(dim=1)
                    val_acc = (class_pred == labels).sum() / float(
                        len(labels))  # accuracy
                    valwriter.add_scalars('accuracies',
                                          {'validation': val_acc}, logstep)

                    val_batches_processed += 1
                    # reset validation dataloader if we just completed the last batch
                    if torch.Tensor([val_batches_processed]) % torch.ceil(
                            torch.Tensor([
                                len(val_dataloader.dataset) /
                                val_dataloader.batch_size
                            ])) == 0:
                        val_dataloader_it = iter(val_dataloader)
                net.train()

            logstep += 1

        # checkpoint model every epoch
        pth_path = os.path.join(state_dict_path,
                                'net_state_dict_epoch{}.pth'.format(epoch))
        torch.save(net.state_dict(), pth_path)
        print('[ model saved, path = {} ]'.format(pth_path))

    return net  # return the network for subsequent usage
import os
import shutil
import time

import numpy as np
from torch.utils.tensorboard import SummaryWriter

# Start from an empty log directory so stale event files from previous runs do
# not pollute the dashboard; SummaryWriter recreates the directory itself.
# (Replaces the original makedirs-then-rmtree dance, which existed only so
# rmtree would not fail on a missing directory.)
shutil.rmtree('tensorboard_runs', ignore_errors=True)

writer = SummaryWriter(log_dir='tensorboard_runs',
                       filename_suffix=str(time.time()))
for k in range(11):
    for i in range(10):
        data = np.random.random(10)
        # for j in range(data.shape[0]):
        #     writer.add_scalars('ROC curve/{}_data'.format(k), {str(i): data[j]}, j / 10)
        # NOTE(review): add_pr_curve expects binary ground-truth labels; feeding
        # the same random floats as both labels and predictions yields a dummy
        # curve -- presumably intentional for smoke-testing, confirm.
        # BUG FIX: pass an explicit global_step -- without it every inner
        # iteration logged at step 0 under the same tag, clobbering the curve.
        writer.add_pr_curve('ROC curve/{}_data'.format(k), data, data,
                            global_step=i)
    writer.flush()
# BUG FIX: close the writer so the event file is flushed and released
# (the original leaked the writer at interpreter exit).
writer.close()
def train(logger, config, model, processor):
    """Train a relation-extraction model and log metrics to TensorBoard.

    Builds the criterion (RankingLoss for 'sent_crcnn', otherwise cross
    entropy) and an Adam optimizer, iterates ``config.epoch`` epochs over
    ``processor.train_loader``, and — when ``config.do_eval_while_train`` —
    evaluates on train and dev every ``config.tb_logging_step`` steps, saving
    the best dev checkpoint if ``config.save_best_model``.

    Returns (global_step, mean training loss per step).

    NOTE(review): batch layout (indices 0-8) and config fields are inferred
    from usage here; confirm against the processor/config definitions.
    """
    comment = f"_TASK-{config.task_name}_MODEL-{config.model_name}" + \
        f"_EPOCH-{config.epoch}_BATCH-{config.batch_size}_LR-{config.lr}"
    suffix = get_logdir_suffix(comment)
    logger.info(suffix)
    tb_log_dir = os.path.join(config.output_dir, 'tb_log_dir', suffix)
    tb_writer = SummaryWriter(log_dir=tb_log_dir)
    # ranking loss only applies to the CR-CNN model; all others use CE
    if config.model_name == 'sent_crcnn':
        criterion = RankingLoss(processor.class_num, config)
    else:
        criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)

    # Train!
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {processor.num_train_examples}")
    logger.info(f" Num Epochs = {config.epoch}")
    logger.info(f" Train batch size = {config.batch_size}")
    global_step = 0
    train_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(config.epoch, desc="Epoch")
    set_seed(config.seed)
    best_nonna_macro_f1 = 0.0
    epoch_num = 1
    for _ in train_iterator:
        train_loader = processor.train_loader
        epoch_iterator = tqdm(train_loader, desc="Iteration", ncols=60)
        for step, raw_batch in enumerate(epoch_iterator):
            model.train()
            # unpack the raw batch; the last 2 (sent) / 3 (bag) entries are
            # bookkeeping fields that stay on CPU
            if config.task_name == 'sent':
                batch = tuple(t.to(config.device) for t in raw_batch[:-2])
                rel_labels = batch[4]
                bag_labels = batch[5]
                instance_id = raw_batch[6]
                bag_id = raw_batch[7]
                inputs = {
                    "token2ids": batch[0],
                    "pos1s": batch[1],
                    "pos2s": batch[2],
                    "mask": batch[3],
                }
            elif config.task_name == 'bag':
                batch = tuple(t.to(config.device) for t in raw_batch[:-3])
                rel_labels = batch[4]
                bag_labels = batch[5]
                instance_id = raw_batch[6]
                bag_id = raw_batch[7]
                inputs = {
                    "token2ids": batch[0],
                    "pos1s": batch[1],
                    "pos2s": batch[2],
                    "mask": batch[3],
                    "scopes": raw_batch[8],
                    "is_training": True,
                    "rel_labels": rel_labels,
                }
            else:
                raise NotImplementedError
            optimizer.zero_grad()
            out = model(**inputs)
            loss = criterion(out, rel_labels.to(config.device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            global_step += 1
            if config.do_eval_while_train:
                if global_step % config.tb_logging_step == 0:
                    # periodic evaluation on the TRAIN split
                    if config.model_name == 'sent_crcnn':
                        results, eval_loss, preds, labels, outs = evaluate_crcnn(
                            model, criterion, logger, processor, config,
                            "train", f"E-{epoch_num}_S-{step+1}")
                    else:
                        results, eval_loss, preds, labels, outs = evaluate_nyth(
                            model, criterion, logger, processor, config,
                            "train", f"E-{epoch_num}_S-{step+1}")
                    for key, val in results.items():
                        # classification reports are text, not scalars
                        if 'report' in key:
                            continue
                        tb_writer.add_scalar(f"{key}/train", val, global_step)
                    # mean loss over the steps since the previous logging point
                    tb_writer.add_scalar(
                        "loss/train",
                        (train_loss - logging_loss) / config.tb_logging_step,
                        global_step)
                    # PR curve over prediction correctness vs. max softmax confidence
                    probs = torch.nn.functional.softmax(torch.tensor(outs), dim=1)
                    thresholds, indices = probs.max(dim=1)
                    tb_writer.add_pr_curve('pr_curve/train',
                                           labels == preds,
                                           thresholds,
                                           global_step=global_step,
                                           num_thresholds=len(preds))
                    logging_loss = train_loss
                    # evaluation on the DEV split (same model selection metric)
                    if config.model_name == 'sent_crcnn':
                        results, eval_loss, preds, labels, outs = evaluate_crcnn(
                            model, criterion, logger, processor, config,
                            "dev", f"E-{epoch_num}_S-{step+1}")
                    else:
                        results, eval_loss, preds, labels, outs = evaluate_nyth(
                            model, criterion, logger, processor, config,
                            "dev", f"E-{epoch_num}_S-{step+1}")
                    for key, val in results.items():
                        if 'report' in key:
                            continue
                        tb_writer.add_scalar(f"{key}/dev", val, global_step)
                    probs = torch.nn.functional.softmax(torch.tensor(outs), dim=1)
                    thresholds, indices = probs.max(dim=1)
                    tb_writer.add_pr_curve('pr_curve/dev',
                                           labels == preds,
                                           thresholds,
                                           global_step=global_step,
                                           num_thresholds=len(preds))
                    # track the best dev score and optionally checkpoint it
                    nonna_macro_f1 = results[config.select_score]
                    if nonna_macro_f1 > best_nonna_macro_f1:
                        best_nonna_macro_f1 = nonna_macro_f1
                        logger.info(
                            f"Epoch: {epoch_num}, *Best DEV {config.select_score}: {best_nonna_macro_f1}"
                        )
                        if config.save_best_model:
                            output_dir = os.path.join(config.output_dir,
                                                      'checkpoints', 'best')
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # unwrap DataParallel before saving
                            model_to_save = model.module if hasattr(model, 'module') else model
                            torch.save(model_to_save.state_dict(),
                                       os.path.join(output_dir, 'best_model.pth'))
                            logger.info(
                                f"Epoch: {epoch_num}, Saving model to {output_dir}")
                    else:
                        logger.info(
                            f"Epoch: {epoch_num}, DEV {config.select_score}: {nonna_macro_f1}"
                        )
        epoch_num += 1
    tb_writer.close()
    return global_step, train_loss / global_step