Example #1
class Logger:
    def __init__(self, log_path, comment=None):
        self.log_path = log_path
        # Create the log directory before the writer starts writing into it.
        try:
            os.makedirs(self.log_path, exist_ok=True)
        except OSError as e:
            print(e)
            print("Failed to create log directory.")
        self.writer = SummaryWriter(log_dir=self.log_path, comment=comment or "")

    def save_params(
        self,
        param_list,
        param_name_list,
        epoch=None,
        batch_size=None,
        batch=None,
        combine=False,
        combine_name=None,
        global_step=None,
    ):
        if combine is False:
            for i in range(len(param_list)):
                if isinstance(param_list[i], Variable):
                    param_list[i] = param_list[i].data.cpu().numpy()

                if global_step is None:
                    self.writer.add_scalar(
                        param_name_list[i],
                        param_list[i],
                        Logger._global_step(epoch, batch_size, batch),
                    )
                else:
                    self.writer.add_scalar(param_name_list[i], param_list[i],
                                           global_step)

        else:
            scalar_dict = dict(zip(param_name_list, param_list))
            if global_step is None:
                self.writer.add_scalars(
                    combine_name,
                    scalar_dict,
                    Logger._global_step(epoch, batch_size, batch),
                )
            else:
                self.writer.add_scalars(combine_name, scalar_dict, global_step)

    def save_batch_images(self,
                          image_name,
                          image_batch,
                          epoch,
                          batch_size,
                          batch=None,
                          dataformats="CHW"):
        self.writer.add_images(
            image_name,
            image_batch,
            Logger._global_step(epoch, batch_size, batch),
            dataformats=dataformats,
        )

    def save_prcurve(self, labels, preds, epoch, batch_size, batch=None):
        self.writer.add_pr_curve("pr_curve", labels, preds,
                                 Logger._global_step(epoch, batch_size, batch))

    def save_hyperparams(self, hparam_list, hparam_name_list, metric_list,
                         metric_name_list):

        for i in range(len(hparam_list)):
            if isinstance(hparam_list[i], list):
                hparam_list[i] = ",".join(list(map(str, hparam_list[i])))
            if isinstance(hparam_list[i], dict):
                hparam_list[i] = json.dumps(hparam_list[i])
            #             if type(hparam_list[i]) == DictConfig:
            #                 hparam_list[i] = str(hparam_list[i])
            if hparam_list[i] is None:
                hparam_list[i] = "None"
        print(hparam_list, hparam_name_list, metric_list, metric_name_list)
        self.writer.add_hparams(
            dict(zip(hparam_name_list, hparam_list)),
            dict(zip(metric_name_list, metric_list)),
        )

    def save_models(self, model_list, model_names_list,
                    epoch):  # TODO: check whether epoch is needed
        for model_name, model in zip(model_names_list, model_list):
            torch.save(model.state_dict(),
                       os.path.join(self.log_path, model_name))

    def save_fig(self, fig, fig_name, epoch, batch_size, batch=None):
        self.writer.add_figure(fig_name, fig,
                               Logger._global_step(epoch, batch_size, batch))

    # def display_params(
    #     self, params_list, params_name_list, epoch, num_epochs, batch_size, batch
    # ):
    #     for i in range(len(params_list)):
    #         if isinstance(params_list[i], Variable):
    #             params_list[i] = params_list[i].data.cpu().numpy()
    #     print("Epoch: {}/{}, Batch: {}/{}".format(epoch,\
    # num_epochs, batch, batch_size))
    #     for i in range(len(params_list)):
    #         print("{}:{}".format(params_name_list[i], params_list[i]))

    # def draw_model_architecture(self, model, output, input, \
    # input_name, save_name):
    #     make_dot(
    #         output, params=dict(list(model.named_parameters())) + \
    # [(input_name, input)]
    #     )

    def close(self):
        self.writer.close()

    @staticmethod
    def _global_step(epoch, batch_size, batch):
        # Check against None explicitly so batch index 0 still advances the step.
        if batch is not None:
            return epoch * batch_size + batch
        else:
            return epoch
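
A minimal usage sketch for this Logger (assuming `os`, `torch`, and `SummaryWriter` from `torch.utils.tensorboard` are imported at module level; the paths and tag names below are illustrative):

# Sketch only: log two scalars per epoch, then close the writer.
logger = Logger("runs/experiment_1")
for epoch in range(3):
    train_loss, val_loss = 0.5 / (epoch + 1), 0.6 / (epoch + 1)  # dummy values
    logger.save_params([train_loss, val_loss],
                       ["loss/train", "loss/val"],
                       epoch=epoch, batch_size=100, batch=0)
logger.close()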
Example #2
        n_correct += (predictions == labels).sum().item()

        class_prob_batch = [F.softmax(output, dim=0) for output in outputs]

        class_preds.append(class_prob_batch)  # per-class probabilities for each sample in the batch
        class_labels.append(predictions)  # predicted class per sample (PR curves are usually drawn against ground-truth labels)

class_preds = torch.cat([torch.stack(batch) for batch in class_preds])
class_labels = torch.cat(class_labels)
acc = 100.0 * n_correct / n_total
print(f'Accuracy for {n_total} images is {acc:0.4f}')
# tensorboard: one PR curve per class
for i in range(10):
    label_i = class_labels == i
    preds_i = class_preds[:, i]
    writer.add_pr_curve(str(i), label_i, preds_i, global_step=0)
writer.close()  # close the writer once, after all curves are written

# evaluation on the test set:

image_id = []
label = []

y_pred = model(test_dataset.x).detach()
_, predictions = torch.max(y_pred, 1)
for i, pred in enumerate(predictions):
    image_id.append(i + 1)
    pred = pred.item()
    label.append(pred)
submission_dict = {'ImageId': image_id, 'Label': label}
df = pd.DataFrame(submission_dict)
df.to_csv('submission_v1.csv', index=False)
Example #3
def main():
    #net = 'ATClsResnet'
    net = 'ATClsFPNResnet'
    #net = 'resnet'
    #net = 'inceptionv3'

    configRoot = Path('configs')
    dataConfigFileName = Path('dataconfig.yaml')
    netConfigFileName = Path(f'{net}.yaml')
    cfg = config(str(configRoot / dataConfigFileName))
    cfg.mergeWith(str(configRoot / netConfigFileName))

    # data
    batchSize = cfg['train']['batch_size']
    shuffle = cfg['train']['shuffle']
    numWorkers = cfg['train']['num_worker']
    balance = cfg['train']['balance']

    # train
    netname = cfg['train']['netname']
    num_epochs = cfg['train']['epoch']
    device = cfg['train']['device']
    device = torch.device(device)
    outputDir = Path(cfg['train']['output_dir'])
    netoutdir = Path(netname)
    thisRunName = Path(cfg['train']['session_dir'])

    saveRoot = outputDir / netoutdir / thisRunName
    if saveRoot.exists():
        val = input(f'Remove {saveRoot} and continue? [y/n] ')
        if val == 'y':
            shutil.rmtree(str(saveRoot))
        else:
            raise Exception("Stopping to protect the existing data")
    tfLog = Path('tensorboard')

    writer = SummaryWriter(str(saveRoot / tfLog))
    save_step = cfg['train']['save_step']

    # loss
    #Clscriterion = nn.CrossEntropyLoss()
    #Clscriterion = ClsWeightLoss()
    Clscriterion = FocalLoss()
    ATcriterion = ATMaskLoss()

    TrainDataSet = class2setWithATMask(cfg, isTrain=True)
    TestDataSet = class2setWithATMask(cfg, isTrain=False)
    print(f'TrainDataSet positive rate {TrainDataSet.prate}')
    print(f'TestDataSet positive rate {TestDataSet.prate}')

    if balance:
        prate = TrainDataSet.prate
        weights = []
        for _, l, _ in TrainDataSet:
            weights.append(1 - prate if l == 1 else prate)
        trainSampler = WeightedRandomSampler(weights,
                                             len(TrainDataSet),
                                             replacement=True)

        TrainDataloader = DataLoader(TrainDataSet,
                                     batch_size=batchSize,
                                     num_workers=numWorkers,
                                     sampler=trainSampler)
        print("using balance data")
    else:
        TrainDataloader = DataLoader(TrainDataSet,
                                     batch_size=batchSize,
                                     shuffle=shuffle,
                                     num_workers=numWorkers)

    TestDataloader = DataLoader(TestDataSet,
                                batch_size=batchSize,
                                shuffle=False,
                                num_workers=numWorkers)

    model = regNets[net](config=cfg)
    optimizer = optim.SGD(model.parameters(),
                          lr=1e-4,
                          momentum=0.5,
                          weight_decay=1e-3)
    dataiter = iter(TrainDataloader)
    images, labels, masks = next(dataiter)  # DataLoader iterators no longer have `.next()`; use built-in next()
    imgGrid = torchvision.utils.make_grid(images)
    MaskGrid = torchvision.utils.make_grid(
        masks.unsqueeze(1).repeat(1, 3, 1, 1))

    writer.add_image('sample images', imgGrid)
    writer.add_image('sample image mask', MaskGrid)
    writer.add_graph(model, images)

    model.to(device)
    step = 1
    pred_at = []
    label_at = []
    for e in range(num_epochs):
        running_loss_cls = 0.0
        running_loss_at = 0.0
        running_loss = 0.0
        running_at_ploss = 0.0
        running_at_nloss = 0.0
        running_cls_label = []
        running_cls_prob = []
        pos = 0
        neg = 0
        model.train()

        for i, data in enumerate(TrainDataloader):

            step += 1
            inputs, labels, atmask = data[0].to(device), data[1].to(
                device), data[2].to(device)
            optimizer.zero_grad()
            clsout, rpn = model(inputs)

            running_cls_label.append(labels.cpu())
            running_cls_prob.append(clsout.detach().cpu())  # detach so the later numpy conversion for the PR curve works

            clsLoss = Clscriterion(clsout, labels)
            ploss, nloss = ATcriterion(rpn, atmask)
            loss = ploss + nloss + clsLoss

            # Accumulate plain floats so the autograd graph is freed each step.
            running_loss_at += (ploss + nloss).item()
            running_at_ploss += ploss.item()
            running_at_nloss += nloss.item()
            running_loss_cls += clsLoss.item()
            running_loss += loss.item()

            pos += torch.sum(labels).item()
            neg += torch.sum(torch.ones_like(labels) - labels).item()
            loss.backward()
            optimizer.step()
            if step % save_step == 0:
                ckp = Path(f'checkpoint_{e+1}_{i+1}_{step+1}.pth')
                torch.save(model.state_dict(), str(saveRoot / ckp))

            if step % 600 == 599:
                model.eval()
                test(model, TestDataloader, device, writer, step, Clscriterion)
                model.train()
            if i % 5 == 4:
                avgLoss = running_loss / 5
                avgClsLoss = running_loss_cls / 5
                avgAtLoss = running_loss_at / 5
                avgAtPLoss = running_at_ploss / 5
                avgAtNLoss = running_at_nloss / 5

                totalStep = e * len(TrainDataloader) + i
                rpn = rpn.detach().cpu()[:2]  # detach before logging as images
                rpn = torch.argmax(rpn, dim=1)
                rpn = rpn.unsqueeze(1).repeat(1, 3, 1, 1)
                writer.add_images('train rpn pred', rpn, global_step=step)
                atmask = atmask.cpu()[:2]
                atmask = atmask.unsqueeze(1).repeat(1, 3, 1, 1)
                writer.add_images('train rpn target', atmask, global_step=step)

                print(
                    "epoch:{:2d}, step:{:4d} TotalStep:{:4d} loss:{:.3f} ClsLoss:{:.3f} AtLoss:{:.3f} posIns:{} negIns:{}"
                    .format(e + 1, i + 1, step, avgLoss, avgClsLoss, avgAtLoss,
                            pos, neg))
                writer.add_scalar('training loss', avgLoss, step)
                writer.add_scalar('training cls loss', avgClsLoss, step)
                writer.add_scalar('training At loss', avgAtLoss, step)
                writer.add_scalar('training At p loss', avgAtPLoss, step)
                writer.add_scalar('training At n loss', avgAtNLoss, step)

                running_loss = 0
                running_loss_cls = 0
                running_loss_at = 0
                running_at_ploss = 0
                running_at_nloss = 0
                pos = 0
                neg = 0

                cls_prob = torch.cat(running_cls_prob, dim=0)
                cls_prob = F.softmax(cls_prob, dim=1)[:, 1]
                writer.add_pr_curve('Training Crack PR',
                                    torch.cat(running_cls_label),
                                    cls_prob,
                                    global_step=step)
                running_cls_label.clear()
                running_cls_prob.clear()

    torch.save(model.state_dict(), str(saveRoot / Path('model_final.pth')))
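
The balancing block in main() can be read in isolation; a minimal sketch of the same WeightedRandomSampler idea (the dataset, labels, and sizes below are made up for illustration):

# Sketch only: oversample the rare positive class, mirroring the `balance` branch above.
import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

x = torch.randn(100, 3)
y = torch.cat([torch.ones(10), torch.zeros(90)]).long()  # 10% positive
prate = y.float().mean().item()                          # positive rate, like TrainDataSet.prate
weights = [1 - prate if label == 1 else prate for label in y]
sampler = WeightedRandomSampler(weights, num_samples=len(y), replacement=True)
loader = DataLoader(TensorDataset(x, y), batch_size=10, sampler=sampler)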
Example #4
class util_classification_tensorboard():
    ''' Helper class dedicated to classification TensorBoard logging '''
    def __init__(self, kernel_type, classes):
        ''' Constructor
        classes = list [name1, name2, ... , name_n]
            e.g. ['normal', 'stone']
        '''
        self.writer = SummaryWriter(f'./runs/{kernel_type}')
        self.classes = classes

    def __del__(self):
        ''' Destructor '''
        self.writer.close()

    def write_batchsamples(self, batchsample_name, images, labels):
        ''' Log a batch of sample images '''
        # Build an image grid.
        img_grid = torchvision.utils.make_grid(images)
        self.__matplotlib_imshow(img_grid, one_channel=True)
        self.writer.add_image(batchsample_name, img_grid)

    def write_net_graph(self, net, sample_input):
        ''' Log the model graph (add_graph needs an example input to trace the net) '''
        self.writer.add_graph(net, sample_input)

    def write_train_epoch(self, loss, inputs, labels, preds, probs, epoch):
        ''' Log images/loss for each training epoch;
            meant to be called from inside train_epoch()
        '''

        # Log the running training loss (accuracy and AUC would need their
        # own computed values, not the loss).
        self.writer.add_scalar('train/Loss', loss, epoch)

        # Log a Figure with predictions vs. ground truth (GT) for the mini-batch
        self.writer.add_figure('train/predict vs. GT',
                               self.__plot_classes_preds(
                                   inputs, labels, preds, probs),
                               global_step=epoch)

        # Draw the PR curve
        target_index = 1
        self.write_pr_curve_tensorboard(target_index,
                                        preds,
                                        probs,
                                        global_step=epoch)

    def write_pr_curve_tensorboard(self,
                                   target_index,
                                   test_preds,
                                   test_probs,
                                   global_step=0):
        ''' Draw the PR curve for the class at target_index '''
        tensorboard_preds = test_preds == target_index
        tensorboard_probs = test_probs[:, target_index]

        self.writer.add_pr_curve(self.classes[target_index],
                                 tensorboard_preds,
                                 tensorboard_probs,
                                 global_step=global_step)

    def __plot_classes_preds(self, images, labels, preds, probs):
        '''
        Generate a matplotlib Figure from a batch of images and labels,
        showing the network's predictions and probabilities alongside the
        ground truth (GT), colored by whether each prediction was correct.
        '''
        # Take images from the batch and plot them with prediction / ground truth
        fig = plt.figure(figsize=(12, 48))
        for idx in np.arange(4):
            ax = fig.add_subplot(1, 4, idx + 1, xticks=[], yticks=[])
            self.__matplotlib_imshow(images[idx], one_channel=True)
            ax.set_title(
                "{0}, {1:.1f}%\n(label: {2})".format(
                    self.classes[preds[idx]], probs[idx] * 100.0,
                    self.classes[labels[idx]]),
                color=("green" if preds[idx] == labels[idx].item() else "red"))
        return fig

    def __matplotlib_imshow(self, img, one_channel=False):
        # Helper for displaying an image
        # (used by `write_batchsamples` and `__plot_classes_preds` above)
        if one_channel:
            img = img.mean(dim=0)
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        if one_channel:
            plt.imshow(npimg, cmap="Greys")
        else:
            plt.imshow(np.transpose(npimg, (1, 2, 0)))
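
A minimal usage sketch for this helper (assuming `torch`, `torchvision`, `numpy`, `matplotlib`, and `SummaryWriter` are imported at module level; the shapes and names below are dummies):

# Sketch only: log a sample grid and a PR curve for class index 1 ('stone').
tb = util_classification_tensorboard('resnet18_run1', ['normal', 'stone'])
images = torch.randn(4, 1, 64, 64)               # dummy image batch
labels = torch.tensor([0, 1, 0, 1])
tb.write_batchsamples('train samples', images, labels)
test_preds = torch.randint(0, 2, (100,))         # would come from argmax(outputs)
test_probs = torch.rand(100, 2).softmax(dim=1)   # would come from softmax(outputs)
tb.write_pr_curve_tensorboard(1, test_preds, test_probs, global_step=0)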
Example #5
writer.add_image('my_image', img, 0)

# If you have non-default dimension setting, set the dataformats argument.
writer.add_image('my_image_HWC', img_HWC, 0, dataformats='HWC')

img_batch = np.zeros((16, 3, 100, 100))
for i in range(16):
    img_batch[i, 0] = np.arange(0, 10000).reshape(100, 100) / 10000 / 16 * i
    img_batch[i, 1] = (1 - np.arange(0, 10000).reshape(100, 100) / 10000) / 16 * i

writer.add_images('my_image_batch', img_batch, 0)

labels = np.random.randint(2, size=100)  # binary label
predictions = np.random.rand(100)
writer.add_pr_curve('pr_curve', labels, predictions, 0)

vertices_tensor = torch.as_tensor([
    [1, 1, 1],
    [-1, -1, 1],
    [1, -1, -1],
    [-1, 1, -1],
], dtype=torch.float).unsqueeze(0)
colors_tensor = torch.as_tensor([
    [255, 0, 0],
    [0, 255, 0],
    [0, 0, 255],
    [255, 0, 255],
], dtype=torch.int).unsqueeze(0)
faces_tensor = torch.as_tensor([
    [0, 2, 3],
    [0, 3, 1],
    [0, 1, 2],
    [1, 3, 2],
], dtype=torch.int).unsqueeze(0)

writer.add_mesh('my_mesh', vertices=vertices_tensor,
                colors=colors_tensor, faces=faces_tensor)
Example #6
def main(args):
    data_path = args.data_path
    snapshot_path = args.snapshot_path

    batch_size = args.batch_size
    workers = args.workers
    image_size = args.image_size
    device = args.device
    n_epochs = args.epochs
    learning_rate = args.learning_rate

    tensorboard_logdir = args.tensorboard_logdir

    dataset = FloatingSeaObjectDataset(
        data_path,
        fold="train",
        transform=get_transform("train",
                                intensity=args.augmentation_intensity,
                                add_fdi_ndvi=args.add_fdi_ndvi),
        output_size=image_size,
        seed=args.seed)
    valid_dataset = FloatingSeaObjectDataset(
        data_path,
        fold="val",
        transform=get_transform("test", add_fdi_ndvi=args.add_fdi_ndvi),
        output_size=image_size,
        seed=args.seed,
        hard_negative_mining=False)

    # store run arguments in the same folder
    run_arguments = vars(args)
    run_arguments["train_regions"] = ", ".join(dataset.regions)
    run_arguments["valid_dataset"] = ", ".join(valid_dataset.regions)
    os.makedirs(os.path.dirname(args.snapshot_path), exist_ok=True)
    with open(
            os.path.join(os.path.dirname(args.snapshot_path),
                         f"run_arguments_{args.seed}.json"), 'w') as outfile:
        json.dump(run_arguments, outfile)

    print(run_arguments)

    # loading training datasets
    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=workers)
    val_loader = DataLoader(valid_dataset,
                            batch_size=batch_size,
                            num_workers=workers,
                            shuffle=True)

    # compute the number of labels in each class
    # weights = compute_class_occurences(train_loader) #function that computes the occurences of the classes
    pos_weight = torch.FloatTensor([float(args.pos_weight)]).to(device)

    bcecriterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight,
                                        reduction="none")

    def criterion(y_pred, target, mask=None):
        """a wrapper around BCEWithLogitsLoss that ignores no-data
        mask provides a boolean mask on valid data"""
        loss = bcecriterion(y_pred, target)
        if mask is not None:
            return (loss * mask.double()).mean()
        else:
            return loss.mean()

    inchannels = 12 if not args.add_fdi_ndvi else 14
    model = get_model(args.model,
                      inchannels=inchannels,
                      pretrained=not args.no_pretrained).to(device)

    # initialize optimizer
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if snapshot_path is not None and os.path.exists(snapshot_path):
        start_epoch, logs = resume(snapshot_path, model, optimizer)
        start_epoch += 1
        print(
            f"resuming from snapshot {snapshot_path}. starting epoch {start_epoch}"
        )
        for log in logs:
            print(
                f"epoch {log['epoch']}: trainloss {log['trainloss']:.4f}, valloss {log['valloss']:.4f}. (from {snapshot_path})"
            )
    else:
        start_epoch = 1
        logs = []

    # create summary writer if tensorboard_logdir is not None
    writer = SummaryWriter(
        log_dir=tensorboard_logdir) if tensorboard_logdir is not None else None

    for epoch in range(start_epoch, n_epochs + 1):
        trainloss = training_epoch(model, train_loader, optimizer, criterion,
                                   device)
        valloss, metrics = validating_epoch(model, val_loader, criterion,
                                            device)

        log = dict(
            epoch=epoch,
            trainloss=trainloss,
            valloss=valloss,
        )
        log.update(metrics)

        logs.append(log)

        if writer is not None:
            writer.add_scalars("loss", {
                "train": trainloss,
                "val": valloss
            },
                               global_step=epoch)
            fig = predict_images(val_loader, model, device)
            writer.add_figure("predictions", fig, global_step=epoch)

            predictions, targets = get_scores(val_loader, model, device)
            targets = targets.reshape(-1)
            targets = targets > 0.5  # make to bool
            predictions = predictions.reshape(-1)
            writer.add_pr_curve("unbalanced",
                                targets,
                                predictions,
                                global_step=epoch)

            # make predictions and targets balanced by removing not floating pixels until numbers of positive
            # and negative samples are equal
            floating_predictions = predictions[targets]
            not_floating_predictions = predictions[~targets]
            np.random.shuffle(not_floating_predictions)
            not_floating_predictions = not_floating_predictions[:len(
                floating_predictions)]
            predictions = np.hstack(
                [floating_predictions, not_floating_predictions])
            targets = np.hstack([
                np.ones_like(floating_predictions),
                np.zeros_like(not_floating_predictions)
            ])
            writer.add_pr_curve("balanced",
                                targets,
                                predictions,
                                global_step=epoch)

        # retrieve best loss by iterating through previous logged losses
        best_loss = min([l["valloss"] for l in logs])
        best_kappa = max([l["kappa"] for l in logs])
        kappa = metrics["kappa"]

        save_msg = ""  # write save model message in the same line of the pring
        if valloss <= best_loss or kappa >= best_kappa:
            save_msg = f"saving model to {snapshot_path}"  # add this message if model saved
            snapshot(snapshot_path, model, optimizer, epoch, logs)

        metrics_message = ", ".join(
            [f"{k} {v:.2f}" for k, v in metrics.items()])

        print(
            f"epoch {epoch}: trainloss {trainloss:.4f}, valloss {valloss:.4f}, {metrics_message}, {save_msg}"
        )
Example #7
    writer.add_scalars('avg/total loss', {
        'train': total_train_loss,
        'val': total_val_loss
    }, epoch)
    writer.add_scalars('avg/auc', {
        'train': train_avg_auc,
        'val': val_avg_auc
    }, epoch)
    for class_name, auc1, auc2 in zip(class_names, train_auc, val_auc):
        writer.add_scalars('AUC/{}'.format(class_name), {
            'train': auc1,
            'val': auc2
        }, epoch)
    for i in range(len(class_names)):
        writer.add_pr_curve('PR curve train/{}'.format(class_names[i]),
                            train_data_pr[1][:, i],
                            train_data_pr[0][:, i],
                            global_step=epoch)
        writer.add_pr_curve('PR curve validation/{}'.format(class_names[i]),
                            val_data_pr[1][:, i],
                            val_data_pr[0][:, i],
                            global_step=epoch)
    writer.flush()

    print(
        'EPOCH %d:\tTRAIN [duration %.3f sec, loss: %.3f, avg auc: %.3f]\t\t'
        'VAL [duration %.3f sec, loss: %.3f, avg auc: %.3f]\tCurrent time %s' %
        (epoch + 1, train_duration, total_train_loss, train_avg_auc,
         val_duration, total_val_loss, val_avg_auc,
         str(datetime.now(timezone('Europe/Moscow')))))

    torch.save(
Example #8
    def train(self):
        # Load saved model if resume option selected
        if self.resume:
            print(Trainer.time_str() + ' Resuming training ... ')
            checkpoint = torch.load(os.path.join(self.log_root, self.get_epoch_root(self.resume_epoch), 'torch_model_optim.pth'))
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print(Trainer.time_str() + ' Starting training ... ')

        writer = SummaryWriter(self.log_root)
        self.model = self.model.to(self.device)
        epoch = int(self.model.epoch) + 1
        batch_counter = int(self.model.iteration)
        # Epoch loop
        for epoch in range(epoch, epoch + self.num_epoch):
            # Create logging directory
            epoch_root = self.get_epoch_root(epoch) 
            if not os.path.exists(os.path.join(self.log_root, epoch_root)):
                os.makedirs(os.path.join(self.log_root, epoch_root))
            # Select data loaders for the current epoch
            cur_epoch_loaders = self.get_epoch_loaders(epoch)
            # Dictionary (of dictionaries) to collect four metrics from different phases for tensorboard
            epoch_metric_names = ['epoch_loss', 'epoch_accuracy', 'epoch_precision', 'epoch_recall']
            epoch_metric_dict = {metric_name: dict.fromkeys(cur_epoch_loaders.keys()) for metric_name in epoch_metric_names}
            # Loop over phases within one epoch [train, validation, test]
            for phase in cur_epoch_loaders.keys():
                # Select training state of the NN model
                if phase == 'train':
                    self.model.train(True)
                else:
                    self.model.train(False)

                # Select Loader
                cur_loader = cur_epoch_loaders[phase]
                # Number of samples
                columns = self.target_names.columns
                sample_count_df = pd.DataFrame(np.zeros([2, len(columns)],
                                               dtype=np.int64),
                                               columns=columns,
                                               index=('No', 'Yes'))
                num_samples = len(cur_loader.batch_sampler.sampler)
                total_sample_counter = 0
                num_target_class = self.model.classifier.num_output
                # initializing variables for keeping track of results for tensorboard reporting
                results_phase = self.init_results_phase(num_samples=num_samples, num_target_class=num_target_class)
                for i, data in enumerate(cur_loader):
                    batch_counter += 1
                    # Copy input and targets to the device object
                    inputs = data['input'].to(self.device)
                    type_indices = self.get_target_type_index()
                    targets = data['target'][:, type_indices].float().squeeze().to(self.device)
                    # Zero the parameter gradients
                    self.optimizer.zero_grad()
                    # Forward pass
                    outputs = self.model(inputs).squeeze()
                    loss = self.criterion(outputs, targets)
                    # Backward + Optimize(in training)
                    if phase == 'train':
                        loss.mean().backward()
                        self.optimizer.step()
                    
                    # Record results of the operation for the reporting
                    results_batch = self.get_results_batch(results_phase.keys(), data, loss, outputs)
                    # Aggregate results into a phase array for complete epoch reporting
                    cur_batch_size = inputs.shape[0]
                    nominal_batch_size = cur_loader.batch_size
                    results_phase, batch_idx_range = self.update_results_phase(results_batch=results_batch, 
                                                                               results_phase=results_phase,
                                                                               nominal_batch_size=nominal_batch_size,
                                                                               cur_batch_size=cur_batch_size,
                                                                               batch_idx=i)
                    # Gather number of each class in mini batch
                    total_sample_counter += cur_batch_size
                    non_zero_count = np.count_nonzero(results_batch['target'], axis=0)
                    cur_sample_count = np.vstack((cur_batch_size-non_zero_count, non_zero_count))
                    assert (cur_sample_count.sum(axis=0) == cur_batch_size).all(), 'Sum to batch size check failed'
                    sample_count_df = sample_count_df + cur_sample_count
                    # logging for the running loss and accuracy for each target class
                    if i % self.log_int == 0:
                        running_loss_log = results_phase['loss'][:batch_idx_range[1]].mean(axis=0)
                        running_accuracy = results_phase['correct'][:batch_idx_range[1]].mean(axis=0)
                        accuracy_dict = self.add_target_names(running_accuracy.round(3))
                        running_loss_dict = self.add_target_names(running_loss_log.round(3))
                        print(Trainer.time_str() + ' Phase: ' + phase +
                              f', epoch: {epoch}, batch: {i}, running loss: {running_loss_dict}, running accuracy: {accuracy_dict}')
                        writer.add_scalars(f'running_loss/{phase}', running_loss_dict, batch_counter)
                        writer.add_scalars(f'running_accuracy/{phase}', accuracy_dict, batch_counter)

                # Number of samples in epoch checked two ways
                assert total_sample_counter == num_samples
                # Make sure no -1s left in the phase results (excluding input which throws errors)
                for key in ['loss', 'output_prob', 'prediction', 'target', 'correct']:
                    assert not (results_phase[key] == -1).any()
                # Fraction for each class of target
                class_fraction_df = sample_count_df / num_samples
                assert np.isclose(class_fraction_df.sum(), 1.0).all(), 'All fraction sum to 1.0 failed'
                # the index for positive examples in each class
                with_index = 'Yes'
                fraction_positive_dict = class_fraction_df.loc[with_index].to_dict()
                writer.add_scalars(f'Fraction_with_target/{phase}', fraction_positive_dict, epoch)
                # calculate epoch loss and accuracy average over batch samples
                # Epoch error measures
                epoch_loss_log = results_phase['loss'].mean(axis=0)
                epoch_loss_dict = self.add_target_names(epoch_loss_log.round(3))
                epoch_accuracy_log = results_phase['correct'].mean(axis=0)
                epoch_acc_dict = self.add_target_names(epoch_accuracy_log.round(3))
                print(Trainer.time_str() + ' Phase: ' + phase +
                      f', epoch: {epoch}: epoch loss: {epoch_loss_dict}, epoch accuracy: {epoch_acc_dict}')
                
                # Pickle important results dict elements: loss, output_prob and dataset_indices
                dict_to_save = {key: results_phase[key] for key in ['loss', 'output_prob', 'dataset_indices']}
                io.save_dict(dict_to_save, os.path.join(self.log_root, epoch_root, 'results_saved.pkl'))

                # Precision, recall, accuracy and loss 
                precision, recall, _, num_pos = sk_metrics.precision_recall_fscore_support(results_phase['target'].squeeze(),
                                                                                           results_phase['prediction'].squeeze(),
                                                                                           zero_division=0)
               
                # The metrics function returns the result for both positive and negative labels when operated with
                # a single target type. When the task is a multilabel decision it only returns the positive label results
                if num_target_class == 1:
                    precision = [precision[1]]
                    recall = [recall[1]]
                    num_pos = num_pos[1]
                assert (np.asarray(sample_count_df.loc['Yes']) == num_pos).all(), 'Number of positive samples matching failed'
                cur_metrics = [epoch_loss_dict, epoch_acc_dict,
                               self.add_target_names(precision), self.add_target_names(recall)]
                for i, metric_name in enumerate(epoch_metric_names):
                    epoch_metric_dict[metric_name][phase] = cur_metrics[i]
                
                # Confusion matrix Figure
                if num_target_class == 1:
                    confusion_matrix = sk_metrics.confusion_matrix(results_phase['target'].squeeze(),
                                                                   results_phase['prediction'].squeeze())
                elif num_target_class > 1:
                    confusion_matrix = sk_metrics.multilabel_confusion_matrix(results_phase['target'],
                                                                              results_phase['prediction'])
                else:
                    raise Exception('number of target classes is negative')
                fig_confusion_norm = self.plot_confusion_matrix(confusion_matrix)
                figname_confusion = 'Confusion_matrix'
                fig_confusion_norm.savefig(os.path.join(self.log_root,
                                                        epoch_root,
                                                        figname_confusion + phase + '.png'),
                                           dpi=300)
                writer.add_figure(f'{figname_confusion}/{phase}', fig_confusion_norm, epoch)

                # Images with highest loss in each target type (Myelin and artefact currently)
                fig = self.show_imgs(results_phase=results_phase)
                figname_examples = 'Examples_with_highest_loss'
                fig.savefig(os.path.join(self.log_root, epoch_root, figname_examples + '_' + phase + '.png'), dpi=300)
                writer.add_figure(f'{figname_examples}/{phase}', fig, epoch)

                # Precision/Recall curves
                for i, t_type in enumerate(self.target_names):
                    writer.add_pr_curve(f'{t_type}/{phase}',
                                        labels=results_phase.get('target')[:, i],
                                        predictions=results_phase.get('output_prob')[:, i],
                                        global_step=epoch,
                                        num_thresholds=100)

                # save model
                if self.save & (phase == 'train') & (epoch % self.save_int == 0):
                    print(Trainer.time_str() + ' Writing model graph ... ')
                    # writer.add_graph(self.model, inputs)
                    print(Trainer.time_str() + ' Saving model state... ')
                    self.model.epoch = torch.nn.Parameter(torch.tensor(epoch), requires_grad=False)
                    self.model.iteration = torch.nn.Parameter(torch.tensor(batch_counter), requires_grad=False)
                    torch.save({
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict()
                    }, os.path.join(self.log_root, epoch_root, 'torch_model_optim.pth'))

            # write the epoch related metrics to the tensorboard
            for metric_name in epoch_metric_names:
                cur_metric = epoch_metric_dict[metric_name]
                for ph in cur_metric:
                    cur_metric_phase = {f'{ph}_{t_type}': val for t_type, val in cur_metric[ph].items()}
                    writer.add_scalars(metric_name, cur_metric_phase, epoch)
        print(Trainer.time_str() + ' Finished training ... ')
        writer.close()
        print(Trainer.time_str() + ' Closed writer ... ')
Example #9
def train_net(model,
              device,
              epochs=10,
              batch_size=8,
              lr=0.1,
              save_cp=True,
              optim='adam'):
    # First we read 'labels.csv' and construct input dataframe
    inp_df = pd.read_csv(correct_labels)

    # Now we get the stratified DataFrames for input to our Dataset Objects
    train_df, val_df, _ = utils.get_stratified_train_val_test_sets(
        inp_df=inp_df, seed=seed_val)

    resize = (128, 128)

    # Now, let's build dataset objects
    # Building up the Dataset objects
    train_set = CovidDataset(
        root=dir_data_root,
        inp_df=train_df,
        transformations=utils.get_transformations(for_train=True))

    val_set = CovidDataset(
        root=dir_data_root,
        inp_df=val_df,
        transformations=utils.get_transformations(for_train=False))

    # Creating DataLoaders for each set
    train_set_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True)

    val_set_dl = DataLoader(val_set, batch_size=4, shuffle=True)

    # dictionary of dataloaders
    dataloaders = {
        'train': train_set_dl,
        'val': val_set_dl,
    }

    # Deciding the optimizer
    if optim == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=lr,
                                        weight_decay=1e-8)
    elif optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=0.95,
                                    weight_decay=0.001,
                                    nesterov=True)
    else:
        raise ValueError(f"Unknown optimizer: {optim}")

    # Setting up loss based
    criterion = nn.CrossEntropyLoss()

    params = {'optimizer': optimizer, 'criterion': criterion}

    # Let's create a tensorboard object
    writer = SummaryWriter(
        log_dir='runs/train_proposed_model_run1',
        comment=
        f'OPTIM_{optim}_LOSS_{criterion}_LR_{lr}_BATCH_SIZE_{batch_size}_IMG_SIZE_{resize}'
    )
    # Let's create a random input which may be passed to network and its depiction be displayed in tensorboard graph viz
    ran_inp = torch.randn((2, 3, 128, 128), device=device)
    writer.add_graph(model=model, input_to_model=ran_inp)
    optim_string = optimizer.__str__().replace("\n", ' ')
    text = f'''
        Input Type:         Chest XRays - RGB Images
        Output Type;        0/1 - Classification
        Batch Norm:         True
        Activation:         ReLU
        Epochs:             {epochs}
        Optimizer:          {optim_string}  
        Learning Rate:      {lr} 
        Train Batch Size:   {batch_size}
        Val Batch Size:     4 
        Loss Criterion:     {criterion.__repr__()}  
        Weight Init:        Default 
        '''
    writer.add_text('Configurations', text, 1)
    writer.flush()

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Optimizer:       {optim}
        Learning rate:   {lr}
        Training size:   {train_set.__len__()}
        Validation size: {val_set.__len__()}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Image Size:      {resize}
    ''')

    # store train/val loss history
    train_epoch_loss = []
    train_epoch_acc = []
    train_epoch_metrics = []
    val_epoch_loss = []
    val_epoch_acc = []
    val_epoch_metrics = []

    # To create reference for making decision for best results
    prev_val_loss = np.inf  # any sufficiently large number would do
    prev_val_acc = 0.0
    prev_val_f1 = 0.0

    # Initialization to save best weights and model
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):

        start_time = time()

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:

            if phase == 'train':
                model.train()  # Set model to training mode
                set_len = len(train_set)
                desc = f'Epoch {epoch + 1}/{epochs}'
                leave = True
            else:
                # Evaluate mode: dropout/batch-norm switch to inference behavior
                # (gradients are disabled separately via torch.no_grad below)
                model.eval()
                set_len = len(val_set)
                desc = 'Validation Phase'
                leave = True

            total = 0
            running_loss = 0
            correct_preds = 0
            original_labels = []
            predicted_labels = []

            with tqdm(total=set_len, desc=desc, unit='img',
                      leave=leave) as pbar:
                for batch in dataloaders[
                        phase]:  # Get Each Batch According to Phase
                    _, images, labels = batch  # Let's get images and labels for each batch

                    images = images.to(device)
                    labels = labels.to(device)

                    # We want to zero out the gradients every time as by default pytorch accumulates gradients
                    optimizer.zero_grad()

                    # We want to calculate and update gradients only in 'train phase'
                    if phase == 'val':
                        with torch.no_grad():
                            outputs = model(images)
                            loss = params['criterion'](outputs, labels)  # already on the model's device

                            pbar.set_postfix(
                                **{'Val CE loss (running)': loss.item()})

                            total += images.size(0)
                            _, predicted = torch.max(outputs.data, 1)
                            correct_preds += (predicted == labels).sum().item()

                            original_labels += labels.data.cpu().numpy(
                            ).tolist()
                            predicted_labels += predicted.data.cpu().numpy(
                            ).tolist()
                            pbar.update(images.shape[0])

                    else:
                        # forward pass
                        outputs = model(images)
                        loss = params['criterion'](outputs, labels)  # already on the model's device

                        pbar.set_postfix(
                            **{'train CE Loss (running)': loss.item()})

                        loss.backward()  # Calculate Gradients
                        params['optimizer'].step()  # Update Weights

                        total += images.size(0)
                        _, predicted = torch.max(outputs.data, 1)
                        correct_preds += (predicted == labels).sum().item()

                        original_labels += labels.data.cpu().numpy().tolist()
                        predicted_labels += predicted.data.cpu().numpy(
                        ).tolist()

                        pbar.update(images.shape[0])

                    running_loss += loss.item() * images.size(0)

                epoch_loss = running_loss / total
                epoch_acc = (correct_preds / total) * 100
                epoch_metrics = utils.evaluate_metrics(original_labels,
                                                       predicted_labels)

                if phase == 'train':
                    train_epoch_loss.append(epoch_loss)
                    train_epoch_acc.append(epoch_acc)
                    train_epoch_metrics.append(epoch_metrics)
                    # logging.info(f'''Train:
                    #     CE:    {epoch_loss}
                    # ''')
                    writer.add_scalar('Loss/Train/Cross Entropy Loss',
                                      epoch_loss, (epoch + 1))
                    writer.add_scalar('Metrics/Train/Accuracy', epoch_acc,
                                      (epoch + 1))
                    # add_pr_curve expects probabilities; hard 0/1 predictions
                    # give a degenerate curve, so probabilities would be better here.
                    writer.add_pr_curve('Metrics/Train/PR-Curve',
                                        np.asarray(original_labels),
                                        np.asarray(predicted_labels),
                                        (epoch + 1))
                    writer.add_scalar('Metrics/Train/F1-Score',
                                      epoch_metrics['f1_score'], (epoch + 1))
                    writer.add_scalar('Metrics/Train/Precision',
                                      epoch_metrics['precision'], (epoch + 1))
                    writer.add_scalar('Metrics/Train/Recall',
                                      epoch_metrics['recall'], (epoch + 1))
                    writer.add_scalar('Metrics/Train/Specificity',
                                      epoch_metrics['specificity'],
                                      (epoch + 1))
                    writer.add_scalar('Metrics/Train/Sensitivity',
                                      epoch_metrics['sensitivity'],
                                      (epoch + 1))
                    writer.flush()
                elif phase == 'val':
                    writer.add_scalar('Loss/Validation/Cross Entropy Loss',
                                      epoch_loss, (epoch + 1))
                    writer.add_scalar('Metrics/Validation/Accuracy', epoch_acc,
                                      (epoch + 1))
                    # (same probability-vs-prediction caveat as the training PR curve)
                    writer.add_pr_curve('Metrics/Validation/PR-Curve',
                                        np.asarray(original_labels),
                                        np.asarray(predicted_labels),
                                        (epoch + 1))
                    writer.add_scalar('Metrics/Validation/F1-Score',
                                      epoch_metrics['f1_score'], (epoch + 1))
                    writer.add_scalar('Metrics/Validation/Precision',
                                      epoch_metrics['precision'], (epoch + 1))
                    writer.add_scalar('Metrics/Validation/Recall',
                                      epoch_metrics['recall'], (epoch + 1))
                    writer.add_scalar('Metrics/Validation/Specificity',
                                      epoch_metrics['specificity'],
                                      (epoch + 1))
                    writer.add_scalar('Metrics/Validation/Sensitivity',
                                      epoch_metrics['sensitivity'],
                                      (epoch + 1))
                    # logging.info(f'''Validation:
                    #     CE:    {epoch_loss}
                    # ''')
                    writer.flush()
                    val_epoch_loss.append(epoch_loss)
                    val_epoch_acc.append(epoch_acc)
                    val_epoch_metrics.append(epoch_metrics)
                    if round(epoch_loss, 5) < prev_val_loss and round(
                            epoch_acc, 5) > prev_val_acc and round(
                                epoch_metrics['f1_score'], 5) > prev_val_f1:
                        prev_val_loss = epoch_loss
                        prev_val_acc = epoch_acc
                        prev_val_f1 = epoch_metrics['f1_score']
                        best_model_wts = copy.deepcopy(model.state_dict())
                        best_res = f'''
                                                    Val Loss:   {epoch_loss}
                                                    Accuracy:   {epoch_acc}
                                                    Metrics:    {epoch_metrics}
                                                '''
                        writer.add_text('Best Results', best_res, (epoch + 1))
                        writer.flush()

        if save_cp:
            try:
                if not os.path.exists(dir_checkpoint):
                    os.mkdir(dir_checkpoint)
                    logging.info('Created checkpoint directory')
            except OSError:
                pass
            if (epoch + 1) % 5 == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(dir_checkpoint, f'modelp6{epoch + 1}.pth'))
                logging.info(f'Checkpoint {epoch + 1} saved !')

        end_time = time()
        logging.info('Epoch took time: {}'.format(str(end_time - start_time)))

    writer.close()

    # Load n Save Best Model Weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(),
               os.path.join(dir_checkpoint, 'modelp6_best_weights' + '.pth'))

    return train_epoch_loss, train_epoch_acc, train_epoch_metrics, val_epoch_loss, val_epoch_acc, val_epoch_metrics
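
A hypothetical call into train_net (the globals it uses, such as correct_labels, dir_data_root, seed_val, and dir_checkpoint, plus the model class, are assumed to be defined elsewhere in the project):

# Sketch only: train on GPU when available and unpack the returned histories.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ProposedModel().to(device)  # hypothetical model class
histories = train_net(model, device, epochs=10, batch_size=8, lr=1e-4, optim='adam')
train_loss, train_acc, _, val_loss, val_acc, _ = histories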
Example #10
class MyTensorBoard():
    def __init__(self, net, labelStr, eventDir):
        self.labelStr = labelStr
        self.writer = SummaryWriter(eventDir + '/')
        self.net = net

    def matplotlib_imshow(self, img, one_channel = True):
        if one_channel:
            img = img.mean(dim = 0)
        img = img / 2 + 0.5
        npimg = img.numpy()
        if one_channel:
            plt.imshow(npimg, cmap = 'Greys')
        else:
            plt.imshow(np.transpose(npimg, (1, 2, 0)))

    def ImageVisualize(self, images, labels):
        img_grid = torchvision.utils.make_grid(images)
        self.matplotlib_imshow(img_grid, one_channel = True)
        self.writer.add_image('Images', img_grid)
        self.writer.close()

    # Add Net structure to Tensorboard
    def NetVisualize(self, sampleInput):
        self.writer.add_graph(self.net, sampleInput)
        self.writer.close()

    def images_to_probs(self, images):
        '''
        Generates predictions and corresponding probabilities from a trained network
        and a list of images
        '''
        output = self.net(images)
        _, preds_tensor = torch.max(output, 1)
        preds = np.squeeze(preds_tensor.numpy())
        return preds, [F.softmax(el, dim = 0)[i].item() for i, el in zip(preds, output)]

    def plot_classes_preds(self, images, labels):
        '''
            Generates matplotlib Figure using a trained network, along with images and labels
            from a batch, that shows the network's top predictions along with its probability,
            alongside the actual label, coloring this information based on whether the
            prediction was correct. Uses the `images_to_probs` method.
        '''
        preds, probs = self.images_to_probs(images)
        fig = plt.figure(figsize = (12, 48))
        for idx in np.arange(4):
            ax = fig.add_subplot(1, 4, idx + 1, xticks = [], yticks = [])
            self.matplotlib_imshow(images[idx], one_channel = True)
            ax.set_title("{0}, {1:.1f}%\n(label: {2})".format(self.labelStr[preds[idx]],
                                                              probs[idx] * 100.0,
                                                              self.labelStr[labels[idx]]),
                                                              color = ("green" if preds[idx] == labels[idx].item() else "red"))
        return fig

    def ScalarVisualize(self, graphTitle, loss, currentStep):
        '''
            Log scalar values to plots, e.g. loss, acc, during training
        '''
        self.writer.add_scalar(graphTitle, loss, currentStep)
        self.writer.close()
    
    def PredVisualize(self, images, labels, currentStep):
        '''
            Log matplotlib figures of model's predictions to specified mini-batch
        '''
        self.writer.add_figure('predictions vs. actuals',
                               self.plot_classes_preds(images, labels),
                               global_step = currentStep)
        self.writer.close()

    def ProjVisualize(self, data, labels, use_rand_instance = True, num_rand = 100):
        '''
            Add projection visualization to Tensorboard
        '''
        assert len(data) == len(labels)
        if use_rand_instance:
            perm = torch.randperm(len(data))
            images, labels = data[perm][:num_rand], labels[perm][:num_rand]
        else:
            images, labels = data, labels
        class_labels = [self.labelStr[lab] for lab in labels]
        features = images.view(-1, 28 * 28)
        self.writer.add_embedding(features,
                                  metadata = class_labels,
                                  label_img = images.unsqueeze(1))
        self.writer.close()

    def PRcurveVisualize(self, test_probs, test_preds):
        '''
            Plot the Precision - Recall curve in Tensorboard, per - class wise
        '''
        for class_index in range(len(self.labelStr)):
            tensorboard_preds = test_preds == class_index
            tensorboard_probs = test_probs[:, class_index]
            self.writer.add_pr_curve(self.labelStr[class_index],
                                     tensorboard_preds,
                                     tensorboard_probs,
                                     global_step = 0)
        self.writer.close()
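
A minimal usage sketch for MyTensorBoard (assumes the usual imports — torch, torchvision, numpy, matplotlib, F as torch.nn.functional, SummaryWriter — plus a trained `net`; the data below is dummy):

# Sketch only: visualize a batch, the net graph, and a scalar.
tb = MyTensorBoard(net, [str(i) for i in range(10)], 'runs/mnist')
images = torch.randn(4, 1, 28, 28)      # dummy MNIST-like batch
labels = torch.randint(0, 10, (4,))
tb.ImageVisualize(images, labels)
tb.NetVisualize(images)
tb.ScalarVisualize('training loss', 0.37, 100)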
Example #11
File: classifier.py  Project: genEM3/genEM3
    def train(self):

        if self.resume:
            print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Resuming training ... ')
            checkpoint = torch.load(os.path.join(self.log_root, 'torch_model'))
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Starting training ... ')

        writer = SummaryWriter(self.log_root)
        self.model = self.model.to(self.device)

        epoch = int(self.model.epoch) + 1
        it = int(self.model.iteration)
        sample_inds = dict()
        for epoch in range(epoch, epoch + self.num_epoch):
            if isinstance(self.data_loaders, list):
                # each element of the list is a data loader for an epoch
                loader_change_interval = self.num_epoch / len(self.data_loaders)
                division_index, _ = divmod(epoch, loader_change_interval)
                # Make sure that the index does not exceed the length of the data_loader list
                index = round(min(division_index, len(self.data_loaders)-1))
                cur_data_loaders = self.data_loaders[index]
            else:
                # same dataloaders for all epochs
                cur_data_loaders = self.data_loaders
            # Dictionary (of dictionaries) to collect four metrics from different phases for tensorboard
            epoch_metric_names = ['epoch_loss', 'epoch_accuracy', 'precision/PPV', 'recall/TPR']
            epoch_metric_dict = {metric_name: dict.fromkeys(cur_data_loaders.keys()) for metric_name in epoch_metric_names}
            epoch_root = 'epoch_{:02d}'.format(epoch)
            if not os.path.exists(os.path.join(self.log_root, epoch_root)):
                os.makedirs(os.path.join(self.log_root, epoch_root))

            for phase in cur_data_loaders.keys():

                if phase == 'train':
                    self.model.train(True)
                else:
                    self.model.train(False)

                epoch_loss = 0
                running_loss = 0.0
                target_sum = 0
                predicted_sum = 0
                correct_sum = 0
                batch_idx_start = 0

                num_items = len(cur_data_loaders[phase].batch_sampler.sampler)

                inputs_phase = -np.ones((num_items, 1, 140, 140)).astype(float)
                outputs_phase = -np.ones((num_items, self.model.classifier.num_output)).astype(float)
                predictions_phase = -np.ones(num_items).astype(int)
                targets_phase = -np.ones(num_items).astype(int)
                correct_phase = -np.ones(num_items).astype(int)

                sample_ind_phase = []
                for i, data in enumerate(cur_data_loaders[phase]):

                    it += 1

                    # copy input and targets to the device object
                    inputs = data['input'].to(self.device)
                    targets = data['target'].to(self.device)
                    sample_ind_batch = data['sample_idx']
                    sample_ind_phase.extend(sample_ind_batch)

                    # zero the parameter gradients
                    self.optimizer.zero_grad()

                    # forward + backward + optimize
                    outputs = self.model(inputs).squeeze()
                    loss = self.criterion(outputs, targets)

                    if phase == 'train':
                        loss.backward()
                        self.optimizer.step()

                    inputs, outputs, targets = Trainer.copy2cpu(inputs, outputs, targets)

                    predicted_classes = np.argmax(np.exp(outputs.detach().numpy()), axis=1)
                    predicted_sum += np.sum(predicted_classes)
                    target_classes = targets.detach().numpy()
                    target_sum += np.sum(target_classes)
                    correct_classes = predicted_classes == target_classes
                    correct_sum += np.sum(correct_classes)

                    if i > 0:
                        batch_idx_start = batch_idx_end
                    batch_idx_end = batch_idx_start + len(targets)
                    inputs_phase[batch_idx_start:batch_idx_end, :, :, :] = inputs.detach().numpy()
                    outputs_phase[batch_idx_start:batch_idx_end, :] = outputs.detach().numpy()
                    predictions_phase[batch_idx_start:batch_idx_end] = predicted_classes
                    targets_phase[batch_idx_start:batch_idx_end] = target_classes
                    correct_phase[batch_idx_start:batch_idx_end] = correct_classes

                    running_loss += loss.item()
                    epoch_loss += loss.item()
                    # Report fraction of clean data in mini batch
                    clean_num = float((targets == 0).sum())
                    debris_num = float((targets == 1).sum())
                    fraction_clean = clean_num / (debris_num + clean_num)
                    writer.add_scalars('Fraction_clean_samples', {phase: fraction_clean}, it)

                    if i % self.log_int == 0:
                        running_loss_log = float(running_loss) / batch_idx_end
                        running_accuracy_log = float(correct_sum) / batch_idx_end
                        print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ')' + ' Phase: ' + phase +
                              ', epoch: {}, batch: {}, running loss: {:0.4f}, running accuracy: {:0.3f} '.
                              format(epoch, i, running_loss_log, running_accuracy_log))
                        writer.add_scalars('running_loss', {phase: running_loss_log}, it)
                        writer.add_scalars('running_accuracy', {phase: running_accuracy_log}, it)

                epoch_loss_log = float(epoch_loss) / num_items
                epoch_accuracy_log = float(correct_sum) / num_items
                print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ')' + ' Phase: ' + phase +
                      ', epoch: {}: epoch loss: {:0.4f}, epoch accuracy: {:0.3f} '.
                      format(epoch, epoch_loss_log, epoch_accuracy_log))

                metrics = Metrics(
                    targets=targets_phase, outputs=outputs_phase, output_prob_fn=lambda x: np.exp(x[:, 1]),
                    sample_ind=sample_ind_phase)
                metrics.confusion_table(
                    path_out=os.path.join(self.log_root, epoch_root, 'confusion_table_' + phase + '.csv'))
                metrics.prediction_table(
                    path_out=os.path.join(self.log_root, epoch_root, 'prediction_table_' + phase + '.csv'))
                # Set the current values of the epoch error metrics
                cur_metrics = [epoch_loss_log, epoch_accuracy_log, metrics.metrics['PPV'], metrics.metrics['TPR']]
                for i, metric_name in enumerate(epoch_metric_names):
                    epoch_metric_dict[metric_name][phase] = cur_metrics[i]

                fig = Trainer.show_imgs(inputs=inputs_phase, outputs=outputs_phase, predictions=predictions_phase,
                                        targets=targets_phase,
                                        sample_ind=sample_ind_phase)
                figname = 'image_examples_'
                fig.savefig(os.path.join(self.log_root, epoch_root, figname + '_' + phase + '.png'))
                writer.add_figure(figname + phase, fig, epoch)

                fig = Trainer.show_classification_matrix(targets=targets_phase, predictions=predictions_phase,
                                                         metrics=metrics.metrics)
                figname = 'targets_outputs_correct_'
                fig.savefig(os.path.join(self.log_root, epoch_root, figname + '_' + phase + '.png'))
                fig.savefig(os.path.join(self.log_root, epoch_root, figname + '_' + phase + '.eps'))
                writer.add_figure(figname + phase, fig, epoch)

                writer.add_pr_curve(
                    'pr_curve_'+phase, labels=targets_phase, predictions=np.exp(outputs_phase[:, 1]), global_step=epoch,
                    num_thresholds=50)

                if self.save and (phase == 'train') and (epoch % self.save_int == 0):
                    print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Writing model graph ... ')
                    # writer.add_graph(self.model, inputs)

                    print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Saving model state... ')
                    self.model.epoch = torch.nn.Parameter(torch.tensor(epoch), requires_grad=False)
                    self.model.iteration = torch.nn.Parameter(torch.tensor(it), requires_grad=False)
                    torch.save({
                        'model_state_dict': self.model.state_dict(),
                    }, os.path.join(self.log_root, epoch_root, 'model_state_dict'))
                    # also keep a combined checkpoint at log_root so the resume
                    # branch above can reload both state dicts from 'torch_model'
                    torch.save({
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                    }, os.path.join(self.log_root, 'torch_model'))
            # write the epoch related metrics to the tensorboard
            for metric_name in epoch_metric_names:
                writer.add_scalars(metric_name, epoch_metric_dict[metric_name], epoch)
        print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Finished training ... ')

        writer.close()
        print('(' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ') Closed writer ... ')
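
Example #11 puts the train/val/test curves for each epoch metric on a shared chart by passing `add_scalars` a dict keyed by phase. A minimal standalone sketch of that pattern (the tag and values here are illustrative):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs/demo')
for epoch in range(3):
    # each dict key becomes its own curve under the 'epoch_loss' tag
    writer.add_scalars('epoch_loss', {'train': 1.0 / (epoch + 1),
                                      'val': 1.2 / (epoch + 1)}, epoch)
writer.close()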
Example #12
def train(
    net,
    optimizer,
    lossfunc,
    train_dataloader,
    val_dataloader,
    batchsize=32,
    numepochs=1,
    device='cuda',
    log_basedir='trainlogs',
    log_subdir='run0',
    log_frequency=100,
    val_frequency=100,
):
    """
    train

    Train a sign language classifier network on the asl-alphabet.

    inputs:
        net - (SLClassifier) Sign language classifier network
        optimizer - (torch.optim optimizer) optimizer object created from net's parameters
        lossfunc - (callable) loss function applied to the network scores and the labels
        train_dataloader - (ASLAlphabet) ASLAlphabet training dataloader
        val_dataloader - (ASLAlphabet) ASLAlphabet validation dataloader (refreshed with a new shuffled iterator once it is exhausted)
        batchsize - (int) number of samples per batch
        numepochs - (int) number of epochs to train on
        device - (str) device to perform computations on
        log_basedir - (str) project logging folder that holds all logs (e.g. "C:\\Users\\...\\project_name\\logs")
        log_subdir - (str) subdirectory of log_basedir specifying the storage folder for _this_ experiment (e.g. "run1")
        log_frequency - (int) logging frequency, in number of training batches
        val_frequency - (int) process one validation batch every val_frequency training batches

    """
    if log_frequency > val_frequency:
        raise ValueError(
            "log_frequency must be less than or equal to val_frequency!")

    net.to(device)
    net.train()
    print('[ network pushed to device ]')

    logpath = os.path.join(log_basedir, log_subdir)
    trainwriter = SummaryWriter(log_dir=os.path.join(
        logpath, 'training'))  # start tensorboard writer

    # create state_dict log folder
    state_dict_path = os.path.join(logpath, 'state_dicts')
    os.makedirs(state_dict_path, exist_ok=True)

    trainwriter.add_graph(net, torch.rand(1, 3, 200, 200).to(device))

    print('[ starting training ]')
    print('----------------------------------------------------------------')
    t_start = time.time()  # record

    if val_frequency is not None and val_frequency != 0:
        val_dataloader_it = iter(
            val_dataloader)  # use this to load validation batches when we want
        valwriter = SummaryWriter(log_dir=os.path.join(logpath, 'validation'))

    batches_processed = 0
    val_batches_processed = 0
    logstep = 0  # the "global_step" variable for tensorboard logging
    for epoch in range(numepochs):
        print('epoch =', epoch)
        for i, batch in enumerate(train_dataloader):
            # start device transfer timing
            transfer_start = time.time()
            # sample and move to device
            labels, samples = batch
            samples = samples.to(device)
            labels = labels.to(device)
            transfer_time = time.time() - transfer_start  # record host-to-device transfer time

            # gpu computations
            compute_start = time.time()
            scores = net(samples)
            probs = scores.softmax(dim=1)

            loss = lossfunc(scores, labels)  # reduced to scalar

            # TODO: add regularization

            # backprop + paramater update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            compute_time = time.time() - compute_start

            batches_processed += 1

            #
            # Tensorboard logging
            #
            if log_frequency != 0 and batches_processed % log_frequency == 0:

                # log time for log_frequency batches
                t_end = time.time()
                mtpb = (t_end - t_start) / log_frequency  # mean time per batch
                trainwriter.add_scalars(
                    'times', {
                        'mean_time_per_batch': mtpb,
                        'transfer_time': transfer_time,
                        'compute_time': compute_time
                    }, logstep)
                t_start = t_end

                # compute accuracy
                _, class_pred = probs.max(dim=1)
                acc = (class_pred == labels).sum() / float(
                    len(labels))  # accuracy
                trainwriter.add_scalars('accuracies', {'train': acc}, logstep)

                # record batch loss
                trainwriter.add_scalars('losses', {'loss': loss}, logstep)

                # TODO: get PR curves working
                # pass num_classes so the one-hot matrix always has 29 columns,
                # even when some classes are absent from the batch
                one_hot = torch.nn.functional.one_hot(labels, num_classes=29)

                trainwriter.add_pr_curve('pr',
                                         labels=one_hot,
                                         predictions=probs,
                                         global_step=logstep)

                # gpu usage
                # TODO: optimize gpu usage
                trainwriter.add_scalars(
                    'gpu_usage', {
                        'mem_allocated': torch.cuda.memory_allocated('cuda'),
                        # memory_cached() is deprecated in favor of memory_reserved()
                        'mem_reserved': torch.cuda.memory_reserved('cuda')
                    }, logstep)

                print('logstep =', logstep)
                print('batches_processed =', batches_processed)
                print(
                    'epoch_progress =', batchsize * batches_processed /
                    len(train_dataloader.dataset))
                print('train_samples_processed =',
                      batchsize * batches_processed)
                print('mean_time_per_batch =', mtpb)
                print(
                    '----------------------------------------------------------------'
                )

            #
            # Validation
            #
            if val_frequency != 0:
                if batches_processed % val_frequency == 0:
                    net.eval()  # set evaluation mode
                    with torch.no_grad():
                        labels, samples = next(val_dataloader_it)
                        labels = labels.to(device)
                        samples = samples.to(device)

                        scores = net(samples)
                        probs = scores.softmax(dim=1)

                        # val losses
                        loss_val = lossfunc(scores, labels)
                        valwriter.add_scalars('losses', {'val': loss_val},
                                              logstep)

                        # val accuracy
                        _, class_pred = probs.max(dim=1)
                        val_acc = (class_pred == labels).sum() / float(
                            len(labels))  # accuracy
                        valwriter.add_scalars('accuracies',
                                              {'validation': val_acc}, logstep)

                    val_batches_processed += 1

                    # reset validation dataloader if we just completed its last
                    # batch; len(val_dataloader) is the number of batches per pass
                    if val_batches_processed % len(val_dataloader) == 0:
                        val_dataloader_it = iter(val_dataloader)

                    net.train()

            logstep += 1

        # checkpoint model every epoch
        pth_path = os.path.join(state_dict_path,
                                'net_state_dict_epoch{}.pth'.format(epoch))
        torch.save(net.state_dict(), pth_path)
        print('[ model saved, path = {} ]'.format(pth_path))

    return net  # return the network for subsequent usage
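
A possible invocation of the function above; `SLClassifier` and `ASLAlphabet` come from the docstring, while the dataset paths and hyperparameters here are assumptions about the surrounding project:

net = SLClassifier()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
lossfunc = torch.nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(ASLAlphabet('data/train'),  # hypothetical path
                                           batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(ASLAlphabet('data/val'),  # hypothetical path
                                         batch_size=32, shuffle=True)
net = train(net, optimizer, lossfunc, train_loader, val_loader,
            batchsize=32, numepochs=1, log_basedir='trainlogs', log_subdir='run0')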
Example #13
import shutil
import time

import numpy as np

from torch.utils.tensorboard import SummaryWriter

# clear any previous runs; ignore_errors avoids failing when the directory does not exist
shutil.rmtree('tensorboard_runs', ignore_errors=True)
writer = SummaryWriter(log_dir='tensorboard_runs',
                       filename_suffix=str(time.time()))

for k in range(11):
    for i in range(10):
        data = np.random.random(10)
        labels = (data > 0.5).astype(int)  # add_pr_curve expects binary labels
        # for j in range(data.shape[0]):
        #     writer.add_scalars('ROC curve/{}_data'.format(k), {str(i): data[j]}, j / 10)
        # without global_step, every call would overwrite step 0
        writer.add_pr_curve('ROC curve/{}_data'.format(k), labels, data, global_step=i)

writer.flush()
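
To inspect the curves this script writes, point TensorBoard at the log directory:

tensorboard --logdir tensorboard_runs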
Example #14
def train(logger, config, model, processor):
    comment = (f"_TASK-{config.task_name}_MODEL-{config.model_name}"
               f"_EPOCH-{config.epoch}_BATCH-{config.batch_size}_LR-{config.lr}")
    suffix = get_logdir_suffix(comment)
    logger.info(suffix)
    tb_log_dir = os.path.join(config.output_dir, 'tb_log_dir', suffix)
    tb_writer = SummaryWriter(log_dir=tb_log_dir)

    if config.model_name == 'sent_crcnn':
        criterion = RankingLoss(processor.class_num, config)
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # Train!
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {processor.num_train_examples}")
    logger.info(f"  Num Epochs = {config.epoch}")
    logger.info(f"  Train batch size = {config.batch_size}")

    global_step = 0
    train_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(config.epoch, desc="Epoch")
    set_seed(config.seed)
    best_nonna_macro_f1 = 0.0
    epoch_num = 1
    for _ in train_iterator:
        train_loader = processor.train_loader
        epoch_iterator = tqdm(train_loader, desc="Iteration", ncols=60)
        for step, raw_batch in enumerate(epoch_iterator):
            model.train()
            if config.task_name == 'sent':
                batch = tuple(t.to(config.device) for t in raw_batch[:-2])
                rel_labels = batch[4]
                bag_labels = batch[5]
                instance_id = raw_batch[6]
                bag_id = raw_batch[7]
                inputs = {
                    "token2ids": batch[0],
                    "pos1s": batch[1],
                    "pos2s": batch[2],
                    "mask": batch[3],
                }
            elif config.task_name == 'bag':
                batch = tuple(t.to(config.device) for t in raw_batch[:-3])
                rel_labels = batch[4]
                bag_labels = batch[5]
                instance_id = raw_batch[6]
                bag_id = raw_batch[7]
                inputs = {
                    "token2ids": batch[0],
                    "pos1s": batch[1],
                    "pos2s": batch[2],
                    "mask": batch[3],
                    "scopes": raw_batch[8],
                    "is_training": True,
                    "rel_labels": rel_labels,
                }
            else:
                raise NotImplementedError

            optimizer.zero_grad()
            out = model(**inputs)
            loss = criterion(out, rel_labels.to(config.device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            global_step += 1
            if config.do_eval_while_train:
                if global_step % config.tb_logging_step == 0:
                    if config.model_name == 'sent_crcnn':
                        results, eval_loss, preds, labels, outs = evaluate_crcnn(
                            model, criterion, logger, processor, config,
                            "train", f"E-{epoch_num}_S-{step+1}")
                    else:
                        results, eval_loss, preds, labels, outs = evaluate_nyth(
                            model, criterion, logger, processor, config,
                            "train", f"E-{epoch_num}_S-{step+1}")

                    for key, val in results.items():
                        if 'report' in key:
                            continue
                        tb_writer.add_scalar(f"{key}/train", val, global_step)
                    tb_writer.add_scalar("loss/train",
                                         (train_loss - logging_loss) /
                                         config.tb_logging_step, global_step)
                    probs = torch.nn.functional.softmax(torch.tensor(outs),
                                                        dim=1)
                    thresholds, indices = probs.max(dim=1)
                    tb_writer.add_pr_curve('pr_curve/train',
                                           labels == preds,
                                           thresholds,
                                           global_step=global_step,
                                           num_thresholds=len(preds))
                    logging_loss = train_loss

        if config.model_name == 'sent_crcnn':
            results, eval_loss, preds, labels, outs = evaluate_crcnn(
                model, criterion, logger, processor, config, "dev",
                f"E-{epoch_num}_S-{step+1}")
        else:
            results, eval_loss, preds, labels, outs = evaluate_nyth(
                model, criterion, logger, processor, config, "dev",
                f"E-{epoch_num}_S-{step+1}")
        for key, val in results.items():
            if 'report' in key:
                continue
            tb_writer.add_scalar(f"{key}/dev", val, global_step)

        probs = torch.nn.functional.softmax(torch.tensor(outs), dim=1)
        thresholds, indices = probs.max(dim=1)
        tb_writer.add_pr_curve('pr_curve/dev',
                               labels == preds,
                               thresholds,
                               global_step=global_step,
                               num_thresholds=len(preds))
        nonna_macro_f1 = results[config.select_score]
        if nonna_macro_f1 > best_nonna_macro_f1:
            best_nonna_macro_f1 = nonna_macro_f1
            logger.info(
                f"Epoch: {epoch_num}, *Best DEV {config.select_score}: {best_nonna_macro_f1}"
            )
            if config.save_best_model:
                output_dir = os.path.join(config.output_dir, 'checkpoints',
                                          'best')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                torch.save(model_to_save.state_dict(),
                           os.path.join(output_dir, 'best_model.pth'))
                logger.info(
                    f"Epoch: {epoch_num}, Saving model to {output_dir}")
        else:
            logger.info(
                f"Epoch: {epoch_num}, DEV {config.select_score}: {nonna_macro_f1}"
            )
        epoch_num += 1
    tb_writer.close()
    return global_step, train_loss / global_step
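
Example #14 reduces a multiclass problem to a single binary PR curve by treating "prediction was correct" as the positive label and the winning softmax probability as the score. A standalone sketch of that reduction on toy tensors (assumes a `writer = SummaryWriter(...)` as in the examples above):

probs = torch.softmax(torch.randn(8, 5), dim=1)  # toy batch: 8 samples, 5 classes
confidence, preds = probs.max(dim=1)             # winning probability and class per sample
labels = torch.randint(0, 5, (8,))
writer.add_pr_curve('pr_curve/demo', labels == preds, confidence, global_step=0)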