Example No. 1
    def learn(self, epoch, dataloader, init=True):
        self.train()
        loss_curve = {loss: [] for loss in self.loss_names}
        self.acc_reset_mnist()
        bar = ProgressBar()
        for data in bar(dataloader):
            data_var = [to_tensor(_, self.opt.device) for _ in data]
            self.set_input(data_var, init)
            self.optimize_parameters(init)
            for loss in self.loss_names:
                loss_curve[loss].append(
                    getattr(self, 'loss_' + loss).detach().item())
            self.acc_update_mnist()

        self.loss_msg = '[Train][{}] Loss:'.format(epoch)
        for loss in self.loss_names:
            self.loss_msg += ' {} {:.3f}'.format(loss,
                                                 np.mean(loss_curve[loss]))
        self.acc_msg = '[Train][{}] Acc: source {:.3f} ({}/{}) target {:.3f} ({}/{})'.format(
            epoch, self.acc_source, self.hit_source, self.cnt_source,
            self.acc_target, self.hit_target, self.cnt_target)
        self.print_log()

        for lr_scheduler in self.lr_schedulers:
            lr_scheduler.step()
Example No. 2
    def learn(self, epoch, dataloader, init=True):
        self.epoch = epoch
        self.train()
        loss_curve = {
            loss: []
            for loss in self.loss_names
        }
        acc_curve = []

        if init:
            for data in dataloader:
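                # each element of `data` is an (x, y) pair; add a leading axis and stack them into one batch tensor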
                x_seq, y_seq = [d[0][None, :, :] for d in data], [d[1][None, :] for d in data]
                x_seq = torch.cat(x_seq, 0).cuda()
                y_seq = torch.cat(y_seq, 0).cuda()

                self.set_input(input=(x_seq, y_seq))
                self.optimize_parameters(init)

                for loss in self.loss_names:
                    loss_curve[loss].append(getattr(self, 'loss_' + loss).item())

                acc_curve.append(self.g_seq.eq(self.y_seq).to(torch.float).mean(-1, keepdim=True))

            loss_msg = '[Train][{}] Loss:'.format(epoch)
            for loss in self.loss_names:
                loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss]))

            acc = to_np(torch.cat(acc_curve, 1).mean(-1))
            acc_msg = '[Train][{}] Acc: {:.2f} {}'.format(epoch, acc.mean(), np.around(acc, decimals=2))
            print(loss_msg)
            print(acc_msg)
        else:
            dataloader, continual_dataloader = dataloader
            for data_1, data_2 in zip(dataloader, continual_dataloader):
                x_seq, y_seq = [d[0][None, :, :] for d in data_1], [d[1][None, :] for d in data_1]
                x_seq = torch.cat(x_seq, 0).cuda()
                y_seq = torch.cat(y_seq, 0).cuda()

                x_rpy, y_rpy = [d[0][None, :, :] for d in data_2], [d[1][None, :] for d in data_2]
                x_rpy = torch.cat(x_rpy, 0).cuda()
                y_rpy = torch.cat(y_rpy, 0).cuda()

                self.set_input([x_seq, y_seq, x_rpy, y_rpy], init)
                self.optimize_parameters(init)
                for loss in self.loss_names:
                    loss_curve[loss].append(getattr(self, 'loss_' + loss).detach().item())

                acc_curve.append(self.g_tgt.eq(self.y_tgt).to(torch.float).mean(-1, keepdim=True))

            loss_msg = '[Train][{}] Loss:'.format(epoch)
            for loss in self.loss_names:
                loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss]))

            acc = to_np(torch.cat(acc_curve, 1).mean(-1))
            acc_msg = '[Train][{}] Acc: {:.2f} {}'.format(epoch, acc.mean(), np.around(acc, decimals=2))
            print(loss_msg)
            print(acc_msg)

        for lr_scheduler in self.lr_schedulers:
            lr_scheduler.step()
Example No. 3
def train_model(model, criterion, optimizer, dataload, lr_scheduler):
    num_epochs = args.num_epochs
    loss_record = []
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        dataset_size = len(dataload.dataset)
        epoch_loss = 0
        step = 0  # minibatch counter
        for x, y in dataload:  # iterate over the dataset in minibatches (100 steps of batch_size=4)
            optimizer.zero_grad()  # zero the gradients (dw, db, ...) before each minibatch
            inputs = x.to(device)
            labels = y.to(device)
            outputs = model(inputs)  # forward pass
            outputs = outputs.squeeze()
            loss = criterion(outputs, labels)  # compute the loss
            loss.backward()  # backward pass: compute the gradients
            # print(lr_scheduler.get_lr()[0])
            optimizer.step()  # every Optimizer implements step() to update its parameters
            lr_scheduler.step()  # advance the learning-rate schedule
            epoch_loss += loss.item()
            loss_record.append(loss.item())
            step += 1
            print("%d/%d,train_loss:%0.3f" %
                  (step, dataset_size // dataload.batch_size, loss.item()))
        print("epoch %d loss:%0.3f" % (epoch, epoch_loss))
    loss_data = pd.DataFrame(data=loss_record)
    loss_data.to_csv(args.loss_record)
    plt.plot(loss_data)
    torch.save(model.state_dict(), args.weight)  # save the model's full state_dict
    plt.show()
    return model
Example No. 4
def test_cosine_decay_function() -> None:
    """
    Tests Cosine lr decay function at (pi/2) and verifies if the value is correct.
    """
    config = DummyModel(l_rate_scheduler=LRSchedulerType.Cosine,
                        num_epochs=10,
                        min_l_rate=0.0)

    # create lr scheduler
    test_epoch = 5
    lr_scheduler, _ = _create_lr_scheduler_and_optimizer(config)
    for _ in range(test_epoch):
        lr_scheduler.step()
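    # after 5 of 10 epochs the cosine factor is (1 + cos(pi/2)) / 2 == 0.5, so the LR is half the base rate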
    assert lr_scheduler.get_last_lr()[0] == 0.5 * config.l_rate
Example No. 5
def train(train_loader, model, optimizer, lr_scheduler, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (data, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        target = target.cuda(non_blocking=True)
        data = data.cuda()

        output = model(data)
        loss = criterion(output, target)

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(acc1.item(), data.size(0))
        top5.update(acc5.item(), data.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        # adjust lr
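        # here the (custom) scheduler's step() returns the new learning rate, which is copied into each param group below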
        lr = lr_scheduler.step()
        for pg in optimizer.param_groups:
            pg["lr"] = lr

        # impose L1 penalty to BN factors
        if args.sparsity != 0:
            for m in model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.weight.grad.data.add_(args.sparsity*torch.sign(m.weight.data))  # L1

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        lr = optimizer.param_groups[0]["lr"]

        if i % args.print_freq == 0:
            logger.info('Epoch[{0}/{1}] Iter[{2}/{3}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Train Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                  'LR: {lr:.4f}'.format(
                   epoch, args.epochs, i, len(train_loader),
                   batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5,
                   lr=lr))

    return losses.avg
Example No. 6
def train(training_model, n_epochs, optim, lr_scheduler, model_dir, target_device, train_loader, test_loader):
    tlog('Training the model...')
    tlog('working on {}'.format(target_device))
    
    best_accuracy = 0. # determines whether we save a copy of the model
    saved_model_filename = None
    
    for epoch in range(n_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(training_model, optim, train_loader, target_device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        x = evaluate(training_model, test_loader, device=target_device)
        print(x)
    
    saved_model_filename = save_model(training_model, args.model_dir)
    return (saved_model_filename, 1)
Example No. 7
    def learn(self, epoch, dataloader):
        self.epoch = epoch
        self.stage = epoch // self.epoch_in_a_stage

        self.train()

        loss_curve = {loss: [] for loss in self.loss_names}
        acc_curve = []

        for data in dataloader:
            x_seq, y_seq, idx_seq = ([d[0][None, :, :] for d in data],
                                     [d[1][None, :] for d in data],
                                     [d[2][None, :] for d in data])
            x_seq = torch.cat(x_seq, 0).to(self.device)
            y_seq = torch.cat(y_seq, 0).to(self.device)
            idx_seq = torch.cat(idx_seq, 0).to(self.device)

            self.set_input(input=(x_seq, y_seq, idx_seq))
            self.optimize_parameters()

            for loss in self.loss_names:
                loss_curve[loss].append(getattr(self, 'loss_' + loss).item())

            acc_curve.append(
                self.g_seq.eq(self.y_seq).to(torch.float).mean(-1,
                                                               keepdim=True))

        loss_msg = '[Train][{}] Loss:'.format(epoch)
        for loss in self.loss_names:
            loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss]))

        acc = to_np(torch.cat(acc_curve, 1).mean(-1))
        acc_msg = '[Train][{}] Accuracy: total average {:.1f}, in each domain {}'.format(
            epoch,
            acc.mean() * 100, np.around(acc * 100, decimals=1))

        if (epoch + 1) % 10 == 0:
            print(loss_msg)
            print(acc_msg)
        with open(self.train_log, 'a') as f:
            f.write(loss_msg + "\n" + acc_msg + "\n")
        for lr_scheduler in self.lr_schedulers:
            lr_scheduler.step()
Example No. 8
def train_one_epoch(model, optimizer, lr_scheduler, data_loader, epoch):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Start Train ...")
    model.train()

    losses = []
    accur = []

    for data, target in data_loader:

        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        data = data.permute(0, 3, 1, 2).to(device)
        targets = target.permute(0, 3, 1, 2).to(device)

        outputs = model(data)

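        # threshold the raw outputs at 0.5 to get a binary mask for the Dice metric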
        out_cut = np.copy(outputs.data.cpu().numpy())
        out_cut[np.nonzero(out_cut < 0.5)] = 0.0
        out_cut[np.nonzero(out_cut >= 0.5)] = 1.0

        train_dice = dice_metric(out_cut, targets.data.cpu().numpy())

        loss = bce_dice_loss(outputs, targets)

        losses.append(loss.item())
        accur.append(train_dice)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if lr_scheduler is not None:
        lr_scheduler.step()

    print("Epoch [%d]" % (epoch))
    print("Mean loss on train:",
          np.array(losses).mean(), "Mean DICE on train:",
          np.array(accur).mean())

    return np.array(losses).mean(), np.array(accur).mean()
Example No. 9
def train_step(train_loader, model, criterion, optimizer, epoch, lr_scheduler):
    print(f'epoch {epoch}')
    batch_time = AverageMeter()
    losses = AverageMeter()
    avg_score = AverageMeter()

    model.train()
    num_steps = min(len(train_loader), MAX_STEPS_PER_EPOCH)

    print(f'total batches: {num_steps}')

    end = time.time()
    lr = None

    for i, data in enumerate(train_loader):
        input_ = data['image']
        target = data['target']
        batch_size, _, _, _ = input_.shape

        output = model(input_.cuda())
        loss = criterion(output, target.cuda())
        confs, predicts = torch.max(output.detach(), dim=1)
        avg_score.update(GAP(predicts, confs, target))
        losses.update(loss.data.item(), input_.size(0))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
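        # per-iteration schedule: the LR is advanced after every optimizer update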
        lr_scheduler.step()
        lr = optimizer.param_groups[0]['lr']

        batch_time.update(time.time() - end)
        end = time.time()

        if i % LOG_FREQ == 0:
            print(f'{epoch} [{i}/{num_steps}]\t'
                  f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  f'loss {losses.val:.4f} ({losses.avg:.4f})\t'
                  f'GAP {avg_score.val:.4f} ({avg_score.avg:.4f})' + str(lr))

    print(f' * average GAP on train {avg_score.avg:.4f}')
Example No. 10
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
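    # the schedule is advanced once at the start of each epoch, before any optimizer.step() (newer PyTorch versions warn about this ordering)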
    lr_scheduler.step()  ############add
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar(
            batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
            (train_loss /
             (batch_idx + 1), 100. * correct / total, correct, total))
Example No. 11
            loss_hist.append(float(loss))
            epoch_loss.append(float(loss))
            
            print(json.dumps({
                "epoch_num"  : epoch_num,
                "iter_num"   : iter_num,
                "img_num"    : iter_num * args.batch_size,
                "cls_loss"   : float(cls_loss),
                "reg_loss"   : float(reg_loss), 
                "loss_hist"  : float(np.mean(loss_hist)),
                "elapsed"    : time() - start_time,
            }))
            sys.stdout.flush()
            
            del cls_loss
            del reg_loss
        
        print('Evaluating dataset')
        if args.dataset == 'coco':
            coco_eval.evaluate_coco(dataset_val, retinanet)
        elif args.dataset == 'csv' and args.csv_val is not None:
            _ = csv_eval.evaluate(dataset_val, retinanet)
        
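        # metric-driven scheduler (e.g. ReduceLROnPlateau): stepped on the mean training loss of the epoch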
        lr_scheduler.step(np.mean(epoch_loss))
        
        torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(args.dataset, epoch_num))
    
    retinanet.eval()
    torch.save(retinanet, 'model_final.pt')

Example No. 12
    def train_epoch(self,
                    epoch,
                    model,
                    dataloader,
                    optimizer,
                    lr_scheduler,
                    grad_normalizer=None,
                    prefix="train"):
        model.train()

        _timer = Timer()
        lossMeter = LossMeter()
        perfMeter = PerfMeter()

        for i, (imgs, labels) in enumerate(dataloader):
            _timer.tic()
            # zero the parameter gradients
            optimizer.zero_grad()

            if self.cfg.HALF:
                imgs = imgs.half()

            if len(self.device) > 1:
                out = data_parallel(model, (imgs, labels, prefix),
                                    device_ids=self.device,
                                    output_device=self.device[0])
            else:
                imgs = imgs.cuda()
                labels = [label.cuda() for label in labels] if isinstance(
                    labels, list) else labels.cuda()
                out = model(imgs, labels, prefix)

            if not isinstance(out, tuple):
                losses, performances = out, None
            else:
                losses, performances = out

            if losses["all_loss"].sum().requires_grad:
                if self.cfg.GRADNORM is not None:
                    grad_normalizer.adjust_losses(losses)
                    grad_normalizer.adjust_grad(model, losses)
                else:
                    losses["all_loss"].sum().backward()

            optimizer.step()

            self.n_iters_elapsed += 1

            _timer.toc()

            lossMeter.__add__(losses)

            if performances is not None and all(performances):
                perfMeter.put(performances)

            if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
                avg_losses = lossMeter.average()
                template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} " "(ips = {:.2f} )\n" "{}"
                self.logger.info(
                    template.format(
                        epoch,
                        self.cfg.N_MAX_EPOCHS,
                        i,
                        round(get_current_lr(optimizer), 6),
                        avg_losses["all_loss"],
                        self.batch_size * self.cfg.N_ITERS_TO_DISPLAY_STATUS /
                        _timer.total_time,
                        "\n".join([
                            "{}: {:.4f}".format(n, l)
                            for n, l in avg_losses.items() if n != "all_loss"
                        ]),
                    ))

                if self.cfg.TENSORBOARD:
                    tb_step = int((epoch * self.n_steps_per_epoch + i) /
                                  self.cfg.N_ITERS_TO_DISPLAY_STATUS)
                    # Logging train losses
                    [
                        self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l,
                                                  tb_step)
                        for n, l in avg_losses.items()
                    ]

                lossMeter.clear()

            del imgs, labels, losses, performances

        lr_scheduler.step()

        if self.cfg.TENSORBOARD and len(perfMeter):
            avg_perf = perfMeter.average()
            [
                self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v,
                                          epoch) for k, v in avg_perf.items()
            ]

        if self.cfg.TENSORBOARD_WEIGHT and False:
            for name, param in model.named_parameters():
                layer, attr = os.path.splitext(name)
                attr = attr[1:]
                self.tb_writer.add_histogram("{}/{}".format(layer, attr),
                                             param, epoch)
Example No. 13
def train(train_loader, model, optimizer, lr_scheduler, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    if args.use_dali:
        train_loader_len = int(np.ceil(train_loader._size/args.batch_size))
    else:
        train_loader_len = len(train_loader)

    # switch to train mode
    model.train()

    end = time.time()
    for i, data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.use_dali:
            target = torch.cat([i["label"].to(torch.device('cuda:0')) for i in data], dim=0)
            data = torch.cat([i["data"].to(torch.device('cuda:0')) for i in data], dim=0)
            target = target.cuda().squeeze().long()
        else:
            data, target = data
            data = data.cuda()
            target = target.cuda()

        output = model(data)
        loss = criterion(output, target)

        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(acc1.item(), data.size(0))
        top5.update(acc5.item(), data.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()

        lr = lr_scheduler.step()
        for pg in optimizer.param_groups:
            pg["lr"] = lr

        # impose L1 penalty to BN factors
        if args.sparsity != 0:
            for m in model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.weight.grad.data.add_(args.sparsity*torch.sign(m.weight.data))  # L1

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        lr = optimizer.param_groups[0]["lr"]

        if i % args.print_freq == 0:
            logger.info('Epoch[{0}/{1}] Iter[{2}/{3}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Train Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                  'LR: {lr:.4f}'.format(
                   epoch, args.epochs, i, train_loader_len,
                   batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5,
                   lr=lr))

    if args.use_dali:
        train_loader.reset()

    return losses.avg
Example No. 14
def train(opt):
    """ dataset preparation """
    logging.info("dataset preparation ...")
    dataset = Dateloader(opt.data_path, mode="train", dataset=opt.Datasets)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=opt.batch_size,
                                              shuffle=True,
                                              num_workers=opt.num_workers,
                                              drop_last=True,
                                              pin_memory=True)

    dataset_val = Dateloader(opt.data_path, mode="test", dataset=opt.Datasets)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.num_workers)

    logging.info('| Building net...')
    model = create_model(opt.Backbone, opt.num_classes)
    model = torch.nn.DataParallel(model)
    cudnn.benchmark = True

    optimizer = optim.SGD(model.parameters(),
                          lr=opt.lr,
                          momentum=0.9,
                          weight_decay=2e-5)
    # optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[80, 130, 170, 200, 230, 250], gamma=0.1)
    CEloss = nn.CrossEntropyLoss()

    best_acc = 0
    for epoch in range(opt.epoch_iter):
        model.train()
        epoch_loss = 0
        lr_scheduler.step()
        epoch_time = time.time()
        for i, (image, gt) in enumerate(data_loader):

            start_time = time.time()
            inputs, labels = image.cuda(), gt.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = CEloss(outputs, labels)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            logging.info('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format( \
                epoch + 1, opt.epoch_iter, i + 1, int(len(data_loader)), time.time() - start_time, loss.item()))

        if epoch > 1:
            validate(data_loader_val, model, CEloss)
            best_acc = test(epoch, model, data_loader_val, best_acc)
            model.train()
        logging.info(
            "----------------------------------------------------------")
        logging.info("            best_acc: {:.3f}".format(best_acc))
        logging.info("              lr: {:.3f}".format(
            optimizer.param_groups[0]['lr']))
        logging.info(
            "----------------------------------------------------------")

        logging.info('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format(
            epoch_loss / int(len(data_loader)),
            time.time() - epoch_time))
        logging.info(time.asctime(time.localtime(time.time())))
Example No. 15
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=5):
    start = time.time()

    #deepcopy needed for references
    best_model = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    train_losses = []
    train_acc = []

    dev_losses = []
    dev_acc = []

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'dev']:
            if phase == 'train':
                lr_scheduler.step()
                #Toggle 'train' mode for model.
                model.train()
            else:
                #Toggle 'eval' mode for model
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # reset gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # accumulate
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            if phase == 'train':
                train_losses.append(epoch_loss)
                train_acc.append(epoch_acc)
            elif phase == 'dev':
                dev_losses.append(epoch_loss)
                dev_acc.append(epoch_acc)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            #keep updating best model
            if phase == 'dev' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best dev Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model)
    return model, train_losses, train_acc, dev_losses, dev_acc
Example No. 16
            else:
                val_loss = evaluate(val_data, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} |lr {:5.2f}| time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                    epoch, lr, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'adam':
                    lr_scheduler.step(val_loss)

                if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (
                        len(best_val_loss) > args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = Sparse_ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay)
                    mask.optimizer = optimizer
                    mask.update_optimizer_mask()

                if args.sparse and ('t0' not in optimizer.param_groups[0]):
                    mask.at_end_of_epoch(epoch)

                best_val_loss.append(val_loss)

            print("PROGRESS: {}%".format((epoch / args.epochs) * 100))
Example No. 17
def train(model_ft,
          criterion,
          optimizer_ft,
          train_generator,
          val_generator,
          regularize=False,
          n_epochs=20,
          lr_scheduler=None):
    start_time = time.time()
    # Current time
    data_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Interesting metrics to keep
    loss_train = []
    acc_train = []
    loss_val = []
    acc_val = []

    best_val_acc = 0.

    # Main loop
    for epoch in range(n_epochs):
        # Train
        model_ft.train()
        cont = 0
        running_loss = 0.0
        running_corrects = 0

        if lr_scheduler:
            lr_scheduler.step()

        for rgbs, labels in train_generator:
            cont += 1
            # Get items from generator
            if torch.cuda.is_available():
                inputs = rgbs.cuda()
                labels = labels.cuda()

            else:
                inputs = rgbs

            # Clean grads
            optimizer_ft.zero_grad()

            #Forward
            outs = model_ft(inputs)
            _, preds = torch.max(outs, 1)
            loss = criterion(outs, labels)

            # Track losses + correct predictions
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            loss.backward()
            optimizer_ft.step()

        # Get avg loss + accuracies in %
        epoch_loss = running_loss / dataset.__len__()
        epoch_acc = running_corrects.double().detach() / dataset.__len__()
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('Train epoch ' + str(epoch),
                                                   epoch_loss, epoch_acc))
        loss_train.append(epoch_loss)  #.data.cpu().numpy()[0])
        acc_train.append(epoch_acc.data.cpu().numpy())

        # Val
        model_ft.eval()
        cont = 0
        running_loss = 0.0
        running_corrects = 0
        predicts = []
        val_labels = []

        for rgbs, labels in val_generator:
            cont += 1
            val_labels += list(labels.numpy())
            # Get items from generator
            if torch.cuda.is_available():
                inputs = rgbs.cuda()
                labels = labels.cuda()
            else:
                inputs = rgbs
            # Clean grads
            optimizer_ft.zero_grad()

            #Forward
            outs = model_ft(inputs)
            _, preds = torch.max(outs, 1)
            predicts += list(preds.cpu().numpy())
            loss = criterion(outs, labels)  # no backward/step during validation

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / dataset_val.__len__()
        epoch_acc = running_corrects.double().detach() / dataset_val.__len__()
        epoch_acc = epoch_acc.data.cpu().numpy()
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('Val epoch ' + str(epoch),
                                                   epoch_loss, epoch_acc))
        loss_val.append(epoch_loss)  #.data.cpu().numpy())
        acc_val.append(epoch_acc)

        # Save model and early stop?
        if epoch_acc > best_val_acc:
            best_model_wts = copy.deepcopy(model_ft.state_dict())
            best_predicts = predicts
            best_labels = val_labels
            torch.save(best_model_wts, 'attention_resnet_' + data_actual)

    results = {}
    loss = {}
    acc = {}
    # losses
    loss['train'] = np.array(loss_train)
    loss['val'] = np.array(loss_val)

    # accuracies
    acc['train'] = np.array(acc_train)
    acc['val'] = np.array(acc_val)

    results['losses'] = loss
    results['acc'] = acc
    print("--- %s seconds ---" % (time.time() - start_time))
    return results, best_labels, best_predicts, data_actual
Example No. 18
def main():

    # --- parse parameters ---
    for i in np.arange(1, len(sys.argv), 1):
        [key, val] = sys.argv[i].split('=', 1)
        if key in [
                'd', 'nIneq', 'randseed', 'nEpochs', 'Ktrain', 'Kval', 'Ktest'
        ]:
            args[key] = int(val)
        elif key in ['bound']:
            args[key] = float(val)
        elif key in ['nunits']:
            args[key] = [int(s) for s in val.split(',')]
        elif key in ['videofilename', 'datafilename']:
            if val == 'None':
                args[key] = None
            else:
                args[key] = val
        else:
            print('WARNING: invalid input option {0:s}'.format(key))

    if args['nunits'] is None:
        args['nunits'] = [args['d'], 4 * args['d'], 16 * args['d'], args['d']]

    # check if cuda available
    args['useCuda'] = False  #torch.cuda.is_available()
    print('CUDA enabled: {0:}'.format(args['useCuda']))

    # seed rng
    np.random.seed(args['randseed'])

    # --- generate inequalities to make convex set ---
    print('Making data...')
    #ineq = linearIneqTestData.makeRandomData(args['d'], args['nIneq'])
    #ineq = linearIneqTestData.makeSimplePolygonData()
    ineq = linearIneqTestData.makeSimpleTriangleData()
    print('done.')

    # --- generate point/projected point pairs ---
    print('Making training/validation/test sets:')
    print('Training set...')
    dataTrain = linearIneqTestData.makePointProjectionPairs(
        ineq, args['Ktrain'], args['bound'])
    trainDataset = ProjectionDataset(dataTrain['P'], dataTrain['Pproj'])
    print('Validation set...')
    dataVal = linearIneqTestData.makePointProjectionPairs(
        ineq, args['Kval'], args['bound'])
    valDataset = ProjectionDataset(dataVal['P'], dataVal['Pproj'])
    print('Test set...')
    dataTest = linearIneqTestData.makePointProjectionPairs(
        ineq, args['Ktest'], args['bound'])
    testDataset = ProjectionDataset(dataTest['P'], dataTest['Pproj'])
    print('done.')
    #linearIneqTestData.plot(ineq, P=dataTrain['P'], Pproj=dataTrain['Pproj'], Pproj_hat=None,
    #                        showplot=True, savefile="traindata.png")

    # --- train network ---
    print('Constructing network...')
    model = Network()
    if args['useCuda']:
        model.cuda()
    print(model)
    print('done.')

    print('Making optimizer...')
    optimizer = torch.optim.SGD(model.parameters(),
                                0.05,
                                momentum=0.9,
                                weight_decay=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=[225, 240])
    if args['useCuda']:
        criterion = torch.nn.SmoothL1Loss().cuda()  # huber loss
    else:
        criterion = torch.nn.SmoothL1Loss()
    print('done.')

    print('Training...')
    Pproj_hat = np.zeros((args['d'], args['Kval'], args['nEpochs']))
    errs = np.zeros((args['nEpochs']))
    losses = np.zeros((args['nEpochs']))
    for epoch in range(args['nEpochs']):

        # train one epoch
        currentLR = optimizer.param_groups[0]['lr']
        train(trainDataset, model, criterion, optimizer)
        lr_scheduler.step()

        # evaluate on validation set
        errs[epoch], Pproj_hat[:, :, epoch] = validate(valDataset, model,
                                                       criterion)
        #linearIneqTestData.plot(ineq, P=dataTrain['P'], Pproj=dataTrain['Pproj'], Pproj_hat=None,
        #                    showplot=False, savefile=None)
        print('Epoch {0:d}/{1:d}\tlr = {2:.5e}\tmean l2 err = {3:.7f}'.format(
            epoch + 1, args['nEpochs'], currentLR, errs[epoch]))

    print('Training ({0:d} epochs) complete!'.format(args['nEpochs']))

    # --- save results on training/eval set ---
    print('Generating output files:')
    if args['videofilename'] is None:
        print('Video creation disabled.')
    else:
        print('Making video...')
        linearIneqTestData.makevideo(ineq,
                                     dataTest['P'],
                                     dataTest['Pproj'],
                                     Pproj_hat,
                                     savefile=args['videofilename'] + '.mp4',
                                     errs=errs)
        print('done.')

    if args['datafilename'] is None:
        print('Data output disabled.')
    else:
        print('Saving results...')
        saveTestResults(trainDataset, model,
                        args['datafilename'] + '_train.mat')
        saveTestResults(valDataset, model, args['datafilename'] + '_val.mat')
        saveTestResults(testDataset, model, args['datafilename'] + '_test.mat')
        print('done.')
    print('Output complete!')
Example No. 19
    def _train_one_epoch(self,
                         train_loader,
                         batch_size=0,
                         epoch=0,
                         print_freq=1,
                         multi_scale=False,
                         img_size=(512, 512),
                         grid_min=None,
                         grid_max=None,
                         grid_size=32,
                         random_size=64,
                         device=torch.device('cuda'),
                         warmup=False):
        self.model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        lr_scheduler = None
        if epoch == 0 and warmup:  # use warmup training for the first epoch (epoch == 0)
            warmup_factor = 1.0 / 1000
            warmup_iters = min(1000, len(train_loader) - 1)

            lr_scheduler = utils.warmup_lr_scheduler(self.optimizer,
                                                     warmup_iters,
                                                     warmup_factor)
            random_size = 1

        enable_amp = 'cuda' in device.type
        scale = amp.GradScaler(enabled=enable_amp)

        lr_now = 0.
        loss_mean = torch.zeros(4).to(device)  # mean losses
        batch_size = len(train_loader)  # number of batches
        for i, (images, targets, paths, _, _) in enumerate(
                metric_logger.log_every(train_loader, print_freq, header)):
            # count_batch counts every batch since epoch 0
            count_batch = i + batch_size * epoch  # number integrated batches (since train start)
            images = images.to(device).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0
            targets = targets.to(device)

            # Multi-Scale
            # labels are already relative coordinates, so resizing the images does not change them
            # randomly change the input image size once every 64 training images
            if multi_scale:
                images, img_size = self.random_size(
                    images, img_size, count_batch % random_size == 0, grid_min,
                    grid_max, grid_size)

            # mixed-precision training context manager; a no-op when running on CPU
            with amp.autocast(enabled=enable_amp):
                # loss: compute_loss
                loss_dict = self.loss(self.model(images), targets)

                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purpose
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                loss_items = torch.cat((loss_dict_reduced["box_loss"],
                                        loss_dict_reduced["obj_loss"],
                                        loss_dict_reduced["class_loss"],
                                        losses_reduced)).detach()
                loss_mean = (loss_mean * i + loss_items) / (
                    i + 1)  # update mean losses

                if not torch.isfinite(losses_reduced):
                    print('WARNING: non-finite loss, ending training ',
                          loss_dict_reduced)
                    print("training image path: {}".format(",".join(paths)))
                    sys.exit(1)

                losses *= 1. / random_size  # scale loss

            # backward
            scale.scale(losses).backward()
            # optimize
            # update the weights once every 64 training images
            if count_batch % random_size == 0:
                scale.step(self.optimizer)
                scale.update()
                self.optimizer.zero_grad()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            lr_now = self.optimizer.param_groups[0]["lr"]
            metric_logger.update(lr=lr_now)

            if count_batch % random_size == 0 and lr_scheduler is not None:  # warmup schedule is used in the first epoch
                self.optimizer.step()
                lr_scheduler.step()

        return loss_mean, lr_now
Example No. 20
def train_cls(dataloader, val_dataloader, model_root, net, args):
    net.train()

    start_epoch = 1
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True,
                                weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=LambdaLR(args.maxepoch, start_epoch, args.decay_epoch).step)
    train_loss, step_cnt, batch_count = 0.0, 0, 0
    best_acc = 0.50
    for epoc_num in np.arange(start_epoch, args.maxepoch + 1):
        for batch_idx, (batch_data, gt_classes, true_num,
                        bboxes) in enumerate(dataloader):
            im_data = batch_data.cuda().float()
            im_label = gt_classes.cuda().long()
            num_data = true_num.cuda().long()

            im_label = im_label.view(-1, 1)
            train_pred, assignments = net(im_data, im_label, true_num=num_data)

            vecloss = net.loss
            loss = torch.mean(vecloss)
            n_data = im_data.size()[0]
            num_sample = im_data.size()[0]
            train_loss_val = loss.data.cpu().item()
            train_loss += train_loss_val
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step_cnt += 1
            batch_count += 1

            train_loss /= step_cnt
        print((' epoch {}, loss: {}, learning rate: {:.5f}'. \
                format(epoc_num, train_loss, optimizer.param_groups[0]['lr'])))

        net.eval()
        total_pred, total_gt = [], []
        for val_data, val_label, val_num, val_boxes in val_dataloader:
            val_data = val_data.cuda().float()
            val_num = val_num.cuda().long()
            val_pred_pro, assignments = net(val_data, true_num=val_num)
            val_pred_pro = val_pred_pro.cpu()
            _, cls_labels = torch.topk(val_pred_pro, 1, dim=1)
            cls_labels = cls_labels.data.cpu().numpy()[:, 0]

            total_pred.extend(cls_labels.tolist())
            total_gt.extend(val_label.tolist())
        precision, recall, fscore, support = score(total_gt, total_pred)
        con_mat = confusion_matrix(total_gt, total_pred)
        # print(' p:  {}\n r:  {}\n f1: {} \n'.format(precision, recall, fscore))
        # print('confusion matrix:')
        # print(con_mat)
        cls_acc = np.trace(con_mat) * 1.0 / np.sum(con_mat)
        print("\n Current classification accuracy is: {:.4f}".format(cls_acc))
        train_loss, step_cnt = 0, 0
        net.train()

        lr_scheduler.step()
        if epoc_num % args.save_freq == 0 and cls_acc >= best_acc and epoc_num >= args.maxepoch - 10:
            save_model_name = 'epoch-{}-acc-{:.3f}.pth'.format(
                str(epoc_num).zfill(3), cls_acc)
            torch.save(net.state_dict(),
                       os.path.join(model_root, save_model_name))
            print('Model saved as {}'.format(save_model_name))
            best_acc = cls_acc
Example No. 21
def main():
    for epoch in range(1, epochs + 1):
        scheduler.step()
        train(epoch)
        torch.save(model.state_dict(), '%sage_epoch_%d.pth' % (outf, epoch))
Example No. 22
for step in range(start_step, n_steps):
    if es.early_stop:
        break
    data, target, meta = next(iter(train_loader))
    step_loss, step_precision = train_triplet_step(data, target, model, device,
                                                   optimizer, miner)

    print('Train Step: {} Precision@1: {:.4f}\tLoss: {:.6f}'.format(
        step, step_precision, step_loss),
          flush=True)

    if step % args.val_freq == 0:
        total_loss, acc_dict, embedding_list, target_list = representation(
            model, device, validation_loader)
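        # metric-driven scheduler: step() takes the validation loss (ReduceLROnPlateau-style)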
        lr_scheduler.step(total_loss)
        es(total_loss, step, model.state_dict(), output_dir / 'model.pt')

        save_checkpoint(
            model, optimizer, lr_scheduler,
            train_loader.sampler.state_dict(train_loader._infinite_iterator),
            step + 1, es, torch.random.get_rng_state())

_, acc_dict, embedding_list, target_list = representation(
    model, device, test_loader)
_, acc_dict_aug, embedding_list_aug, target_list_aug = representation(
    model, device, test_loader_aug)

results = {}
acc_calc = AccuracyCalculator()
for m, embedding, target in zip(['unaug', 'aug'],
Example No. 23
		global_step = int(checkpoint["global_step"])

	global_start_time = time.time()



	if not PREDICT_ONLY:
		print("Training...")
		criterion = nn.CrossEntropyLoss()

		optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
		if CHECKPOINT_NAME != None:
			optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

		# model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
		lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=LR_STEP,
													   gamma=LR_FACTOR)

		if USE_PARALLEL:
			print("[Using all the available GPUs]")
			model = nn.DataParallel(model, device_ids=[0, 1])

		for epoch in range(epoch, NUM_EPOCHS + 1):
			print('-' * 50)
			train(train_loader, model, criterion, optimizer, epoch, lr_scheduler, tensorboard, label_encoder)
			eval(val_loader, train_loader, model, tensorboard, epoch)
			lr_scheduler.step()

			if has_time_run_out():
				break
Example No. 24
def main():
    global args, best_prec1
    args = parser.parse_args()

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    #model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            print(list(checkpoint.keys()))
            #args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            #print("=> loaded checkpoint '{}' (epoch {})"
            #     .format(args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    print('Making train_loader...')
    train_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
        root='./data',
        train=True,
        transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor(),
            normalize,
        ]),
        download=True),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    print('Making val_loader...')
    val_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
        root='./data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=128,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # define loss function (criterion) and optimizer
    #criterion = nn.CrossEntropyLoss().cuda()
    criterion = nn.CrossEntropyLoss()

    if args.half:
        model.half()
        criterion.half()

    print('Making optimizer...')
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

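    # last_epoch=start_epoch - 1 lets the milestone schedule resume from a checkpointed epoch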
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[100, 150], last_epoch=args.start_epoch - 1)

    if args.arch in ['resnet1202', 'resnet110']:
        # for resnet1202 original paper uses lr=0.01 for first 400 minibatches for warm-up
        # then switch back. In this implementation it will correspond for first epoch.
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr * 0.1

    if args.evaluate:
        print('Evaluating...')
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):

        # train for one epoch
        print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        train(train_loader, model, criterion, optimizer, epoch)
        lr_scheduler.step()

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if epoch > 0 and epoch % args.save_every == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                },
                is_best,
                filename=os.path.join(args.save_dir, 'checkpoint.th'))

    save_checkpoint(
        {
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        },
        is_best,
        filename=os.path.join(args.save_dir, 'model.th'))
Example No. 25
    def fit(self, dataset_train, nb_epochs=10, batch_size=64, optimizer=None, lr=0.001, lr_step_size=0,
            dataset_val=None):
        if self._crayon_exp is None and self.crayon_exp_name is not None:
            self._crayon_exp = get_crayon_experiment(self.crayon_exp_name)

        if optimizer is None:
            optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        lr_scheduler = None
        if lr_step_size != 0:
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=0.5)

        data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True,
                                                        num_workers=1, pin_memory=torch.cuda.is_available())

        phases = ['train', ]
        data_loaders = [data_loader_train]
        if dataset_val is not None:
            data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=False,
                                                          num_workers=1, pin_memory=torch.cuda.is_available())

            phases.append('val')
            data_loaders.append(data_loader_val)

        model = cuda(self.model)
        loss_fn = cuda(self.loss)

        j = 1
        loss_best = np.inf
        for epoch in range(nb_epochs):
            for phase, data_loader in zip(phases, data_loaders):
                if phase == 'train':
                    model.train(True)
                else:
                    model.train(False)

                if phase == 'train' and lr_scheduler is not None:
                    lr_scheduler.step()

                pbar_desc = f'Epoch {epoch}, {phase}'
                pbar = tqdm(total=len(data_loader.dataset), desc=pbar_desc, postfix={f'loss_{phase}': 0}, ncols=120)

                running_loss = 0.0
                for j, (inputs, targets) in enumerate(data_loader, 1):
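                    # legacy pre-0.4 PyTorch idiom: volatile Variables disable autograd for the validation phase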
                    volatile = phase == 'val'
                    inputs = variable(inputs, volatile=volatile)
                    targets = variable(targets, volatile=volatile)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward + backward + optimize
                    outputs = model(inputs)

                    loss = loss_fn(outputs, targets)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    batch_loss = loss.data[0]
                    running_loss += batch_loss

                    pbar.update(inputs.size(0))

                    pbar.set_postfix(**{f'loss_{phase}': batch_loss})

                    if self._crayon_exp is not None:
                        self._crayon_exp.add_scalar_value(f'loss_batch/{phase}', batch_loss)

                        lr = optimizer.param_groups[0]['lr']
                        self._crayon_exp.add_scalar_value(f'learning_rate', lr)

                    del loss
                    del outputs
                    del targets

                epoch_loss = running_loss / j
                pbar.set_postfix(**{f'loss_{phase}': epoch_loss})
                pbar.close()

                if self._crayon_exp is not None:
                    self._crayon_exp.add_scalar_value(f'loss_epoch/{phase}', epoch_loss)

                if phase == 'val' and epoch_loss < loss_best and self.checkpoint_filename is not None:
                    save_weights(model, self.checkpoint_filename)
                    loss_best = epoch_loss
Example No. 26
def train_model(output_path,
                model,
                dataloaders,
                dataset_sizes,
                criterion,
                optimizer,
                num_epochs=5,
                scheduler=None,
                lr=0.1):
    if not os.path.exists('models/' + str(output_path)):
        os.makedirs('models/' + str(output_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best = 0
    for epoch in range(num_epochs):
        top1 = AverageMeter()
        top5 = AverageMeter()
        losses = AverageMeter()
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                if scheduler != None:
                    scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)

                    acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))

                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        losses.update(loss.item(), inputs.size(0))
                        top1.update(acc1[0], inputs.size(0))
                        top5.update(acc5[0], inputs.size(0))
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                # print("\rIteration: {}/{}, Loss: {}.".format(i+1, len(dataloaders[phase]), loss.item() * inputs.size(0)), end="")
                sys.stdout.flush()

                print(
                    '\rLoss {loss.val:.4f} ({loss.avg:.4f}) Acc@1 {top1.val:.3f} ({top1.avg:.3f}) Acc@5 {top5.val:.3f} ({top5.avg:.3f})'
                    .format(loss=losses, top1=top1, top5=top5),
                    end="")

#                 print( (i+1)*100. / len(dataloaders[phase]), "% Complete" )

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            if phase == 'train':
                avg_loss = epoch_loss
                t_acc = epoch_acc
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc


            # print('{} Loss: {:.4f} Acc: {:.4f}'.format(
            #     phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best = epoch + 1
                best_model_wts = copy.deepcopy(model.state_dict())

        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print()
        # torch.save(model.state_dict(), './models/' + str(output_path) + '/model_{}_epoch.pth'.format(epoch+1))
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss,
            }, './models/' + str(output_path) +
            '/model_{}_epoch.pth'.format(epoch + 1))
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Accuracy: {}, Epoch: {}'.format(best_acc, best))
    # load the best validation weights back into the model before returning,
    # so best_model_wts is actually used by callers
    model.load_state_dict(best_model_wts)
    return model
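# A minimal usage sketch for the train_model example above. The FakeData
# dataset, resnet18 backbone and 'demo_run' output path are placeholder
# assumptions, and the AverageMeter / accuracy helpers used inside train_model
# are expected to be defined alongside it.
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms

if __name__ == '__main__':
    transform = transforms.ToTensor()
    # tiny synthetic datasets so the sketch runs without downloading data
    image_datasets = {
        phase: datasets.FakeData(size=64, image_size=(3, 224, 224),
                                 num_classes=10, transform=transform)
        for phase in ['train', 'val']
    }
    dataloaders = {
        phase: DataLoader(image_datasets[phase], batch_size=8,
                          shuffle=(phase == 'train'))
        for phase in ['train', 'val']
    }
    dataset_sizes = {phase: len(image_datasets[phase])
                     for phase in ['train', 'val']}

    net = models.resnet18(num_classes=10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

    trained = train_model('demo_run', net, dataloaders, dataset_sizes,
                          criterion, optimizer, num_epochs=1)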
def learning_process(train_loader,
                     network,
                     criterion,
                     test_loader,
                     all_outputs_test,
                     all_labels_test,
                     mode,
                     optimizer=None,
                     start_epoch=0,
                     lr_scheduler=lr_scheduler):
    vis = visdom.Visdom()
    r_loss = []
    iterations = []
    total_iteration = 0

    loss_plot = vis.line(Y=np.zeros(1), X=np.zeros(1))

    number_of_epochs = 0
    name_prefix_for_saved_model = ''
    if mode == params.mode_classification:
        number_of_epochs = params.number_of_epochs_for_classification
        name_prefix_for_saved_model = params.name_prefix_for_saved_model_for_classification
    if mode == params.mode_representation:
        number_of_epochs = params.number_of_epochs_for_representation
        name_prefix_for_saved_model = params.name_prefix_for_saved_model_for_representation

    for epoch in range(
            start_epoch,
            number_of_epochs):  # loop over the dataset multiple times
        pr = cProfile.Profile()
        pr.enable()

        lr_scheduler.step(epoch=epoch)
        print('current_learning_rate =', optimizer.param_groups[0]['lr'])
        print(datetime.datetime.now())
        running_loss = 0.0
        i = 0

        # for representation we need clever sampling which should change every epoch
        # if mode == params.mode_representation:
        #    train_loader, test_loader, \
        #    train_loader_for_classification, test_loader_for_classification = cifar.download_CIFAR100()

        for i, data in enumerate(train_loader, 0):
            # print('i = ', i)
            # get the inputs
            # inputs are [torch.FloatTensor of size 4x3x32x32]
            # labels are [torch.LongTensor of size 4]
            # here 4 is a batch size and 3 is a number of channels in the input images
            # 32x32 is a size of input image
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = network(inputs)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # print statistics
            current_batch_loss = loss.item()
            if i % params.skip_step == 0:  # print every skip_step mini-batches
                print('[epoch %d, iteration in the epoch %5d] loss: %.30f' %
                      (epoch + 1, i + 1, current_batch_loss))

                r_loss.append(current_batch_loss)
                iterations.append(total_iteration + i)

                options = dict(legend=['loss for ' + mode])
                loss_plot = vis.line(
                    Y=np.array(r_loss),
                    X=np.array(iterations),
                    # , update='append',
                    win=loss_plot,
                    opts=options)

        if epoch % 10 == 0:
            # print the train accuracy at every epoch
            # to see if it is enough to start representation training
            # or we should proceed with classification
            if mode == params.mode_classification:
                accuracy = test.test_for_classification(
                    test_loader=test_loader, network=network)
            if mode == params.mode_representation:
                # we should recalculate all outputs before the evaluation because our network changed during the training
                all_outputs_test, all_labels_test = metric_learning_utils.get_all_outputs_and_labels(
                    test_loader, network)
                recall_at_k = test.full_test_for_representation(
                    k=params.k_for_recall,
                    all_outputs=all_outputs_test,
                    all_labels=all_labels_test)
            utils.save_checkpoint(network=network,
                                  optimizer=optimizer,
                                  filename=name_prefix_for_saved_model +
                                  '-%d' % epoch,
                                  epoch=epoch)
        total_iteration = total_iteration + i
        print('total_iteration = ', total_iteration)

        pr.disable()
        # s = io.FileIO('profiler-statistic')
        s = io.StringIO()
        sortby = 'tottime'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        # ps.print_stats()
        # print(s.getvalue())

    print('Finished Training')
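# A short sketch of the incremental visdom update that learning_process above
# hints at with its commented-out update='append': rather than redrawing the
# whole curve from the accumulated r_loss/iterations lists on every log step,
# new points can be appended to the existing window. The loss value and
# iteration number below are placeholders.
import numpy as np
import visdom

vis = visdom.Visdom()
loss_plot = vis.line(Y=np.zeros(1), X=np.zeros(1))
# inside the logging branch of the training loop:
vis.line(Y=np.array([0.25]), X=np.array([100]),
         win=loss_plot, update='append')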
Exemplo n.º 28
0
def k_train_fcn(k_fold,
                model,
                batch_size,
                max_iterations,
                save_dir='./logs',
                eval_every=50,
                checkpoint_every=1000,
                mode='reg',
                config=None):
    train_weight = torch.tensor([1, 200, 3500, 20000],
                                dtype=torch.float).to(config['DEVICE'])
    mse_loss = torch.nn.MSELoss().to(config['DEVICE'])
    cls_loss = cross_entropy2d

    save_dir += datetime.now().strftime("_%m_%d_%H_%M")

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    data_gen = DataGenerator_FCN(global_config['DATA_PATH'],
                                 k_fold,
                                 batch_size,
                                 config['IN_LEN'],
                                 config['OUT_LEN'],
                                 config['IN_LEN'] + config['OUT_LEN'],
                                 config=config)
    writer = SummaryWriter(os.path.join(save_dir, 'train_logs'))

    for k in range(1, k_fold + 1):

        k_model, optimizer, lr_scheduler = model()

        data_gen.set_k(k)
        train_loss = 0.0
        train_acc = 0.0
        train_f1 = 0.0
        train_csi = np.zeros((len(global_config['LEVEL_BUCKET']) + 1, ),
                             dtype=np.float32)
        train_count = 0
        i_batch = 0
        best_val_loss = np.inf

        pbar = tqdm(range(1, max_iterations + 1))
        for itera in pbar:

            n_train_batch = data_gen.n_train_batch()
            pbar_b = tqdm(
                np.random.choice(n_train_batch,
                                 10000))  # range(data_gen.n_train_batch())
            for b in pbar_b:

                pbar.set_description("Fold %d Training at batch %d / %d" %
                                     (k, i_batch, n_train_batch))

                train_data, train_label = data_gen.get_train(b)
                #train_data, train_label, train_label_cat = data_gen.get_train(b)
                k_model.train()
                optimizer.zero_grad()
                output = k_model(train_data)
                #                 print(train_label.size())
                output = output[:, 0]
                #                 loss = None
                #                 if mode == 'reg':
                loss = mse_loss(output, train_label[:, 0])
                #                 elif mode == 'seg':
                #                     loss = cls_loss(output, train_label_cat, weight=train_weight)
                #                 elif mode == 'reg_multi':
                #                     loss = mse_loss(output, train_label)
                #                     loss += cls_loss(output, train_label_cat, weight=train_weight)
                #                 else:
                #                     raise Exception('wrong mode')

                loss.backward()
                # torch.nn.utils.clip_grad_value_(k_model.parameters(), clip_value=50.0)
                optimizer.step()
                lr_scheduler.step()
                train_loss += loss.item()

                #                 pred_numpy = output.cpu().max(1)[1].detach().numpy().flatten()
                #                 label_numpy = train_label_cat.cpu().numpy().flatten()

                #                 train_acc += accuracy_score(label_numpy, pred_numpy)
                #                 train_f1 += f1_score(label_numpy, pred_numpy, average='macro', zero_division=1)
                #                 train_csi += fp_fn_image_csi_muti_reg(pred_numpy, label_numpy)
                train_csi += fp_fn_image_csi_muti_reg(
                    dbz_mm(output.detach().cpu().numpy()),
                    dbz_mm(train_label[:, 0].detach().cpu().numpy()))
                train_count += 1

                if i_batch % eval_every == 0:

                    val_loss = 0.0
                    val_acc = 0.0
                    val_f1 = 0.0
                    val_csi = np.zeros(
                        (len(global_config['LEVEL_BUCKET']) + 1, ),
                        dtype=np.float32)
                    val_count = 0

                    with torch.no_grad():
                        k_model.eval()
                        n_val_batch = data_gen.n_val_batch()

                        for ib_val, b_val in enumerate(
                                np.random.choice(n_val_batch,
                                                 20)):  #range(n_val_batch)
                            val_data, val_label = data_gen.get_val(b_val)
                            #                             val_data, val_label, val_label_cat = data_gen.get_val(b_val)
                            output = k_model(val_data)
                            output = output[:, 0]
                            loss = mse_loss(output, val_label[:, 0])
                            #                             loss = None
                            #                             if mode == 'reg':
                            #                                 loss = mse_loss(output, val_label)
                            #                             elif mode == 'seg':
                            #                                 loss = cls_loss(output, val_label_cat, weight=train_weight)
                            #                             elif mode == 'reg_multi':
                            #                                 loss = mse_loss(output, val_label)
                            #                                 loss += cls_loss(output, val_label_cat, weight=train_weight)

                            val_loss += loss.item()

                            #                             pred_numpy = output.cpu().max(1)[1].detach().numpy().flatten()
                            #                             label_numpy = val_label_cat.cpu().numpy().flatten()

                            #                             val_acc += accuracy_score(label_numpy, pred_numpy)
                            #                             val_f1 += f1_score(label_numpy, pred_numpy, average='macro', zero_division=1)
                            val_csi += fp_fn_image_csi_muti_reg(
                                dbz_mm(output.detach().cpu().numpy()),
                                dbz_mm(val_label[:, 0].detach().cpu().numpy()))
                            val_count += 1
                            pbar.set_description(
                                "Fold %d Validating at batch %d / %d" %
                                (k, ib_val, 20))

                    train_loss /= train_count
                    train_f1 /= train_count
                    train_acc /= train_count
                    train_csi /= train_count
                    val_loss /= val_count
                    val_f1 /= val_count
                    val_acc /= val_count
                    val_csi /= val_count

                    writer.add_scalars('loss/' + str(k), {
                        'train': train_loss,
                        'valid': val_loss
                    }, i_batch)

                    #                     writer.add_scalars('f1/'+str(k), {
                    #                         'train': train_f1,
                    #                         'valid': val_f1
                    #                     }, i_batch)

                    #                     writer.add_scalars('acc/'+str(k), {
                    #                         'train': train_acc,
                    #                         'valid': val_acc
                    #                     }, i_batch)

                    for i in range(train_csi.shape[0]):
                        writer.add_scalars('csi_' + str(i) + '/' + str(k), {
                            'train': train_csi[i],
                            'valid': val_csi[i]
                        }, i_batch)

#                     writer.add_image('result/val', torch.tensor(cv2.cvtColor(np.array(output.cpu().max(1)[1].detach().numpy() / 4 * 255, dtype=np.uint8)[0,:,:,None], cv2.COLOR_GRAY2RGB).swapaxes(0,2)), i_batch)
#                     writer.add_image('result/gt', torch.tensor(cv2.cvtColor(np.array(val_label_cat.cpu().numpy()[0, 0] / 4 * 255, dtype=np.uint8)[:,:,None], cv2.COLOR_GRAY2RGB).swapaxes(0,2)), i_batch)

                    writer.add_image(
                        'result/val',
                        torch.tensor(
                            cv2.cvtColor(
                                np.array(
                                    dbz_mm(output.detach().cpu().numpy()) /
                                    60 * 255,
                                    dtype=np.uint8)[0, :, :, None],
                                cv2.COLOR_GRAY2RGB).swapaxes(0, 2)), i_batch)
                    writer.add_image(
                        'result/gt',
                        torch.tensor(
                            cv2.cvtColor(
                                np.array(
                                    dbz_mm(val_label[:, 0].cpu().numpy()) /
                                    60 * 255,
                                    dtype=np.uint8)[0, :, :, None],
                                cv2.COLOR_GRAY2RGB).swapaxes(0, 2)), i_batch)

                    train_loss = 0.0
                    train_acc = 0.0
                    train_f1 = 0.0
                    train_count = 0
                    train_csi = np.zeros(
                        (len(global_config['LEVEL_BUCKET']) + 1, ),
                        dtype=np.float32)

                    if val_loss <= best_val_loss:
                        torch.save(
                            k_model.state_dict(),
                            os.path.join(
                                save_dir,
                                'model_f{}_i{}_best.pth'.format(k, i_batch)))
                        best_val_loss = val_loss

                if i_batch % checkpoint_every == 0:
                    torch.save(
                        k_model.state_dict(),
                        os.path.join(save_dir,
                                     'model_f{}_i{}.pth'.format(k, i_batch)))

                i_batch += 1
        try:
            torch.save(
                k_model.state_dict(),
                os.path.join(save_dir, 'model_f{}_i{}.pth'.format(k, i_batch)))
        except Exception as e:
            print('cannot save model:', e)

    writer.close()
Exemplo n.º 29
0
def train(args):
    model, model_file = create_model(args.encoder_type,
                                     work_dir=args.work_dir,
                                     ckp=args.ckp)
    model = model.cuda()

    loaders = get_train_val_loaders(batch_size=args.batch_size)

    #optimizer = RAdam([
    #    {'params': model.decoder.parameters(), 'lr': args.lr},
    #    {'params': model.encoder.parameters(), 'lr': args.lr / 10.},
    #])
    if args.optim_name == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    elif args.optim_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=args.lr)
    else:
        raise ValueError('unsupported optim_name: {}'.format(args.optim_name))

    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

    if torch.cuda.device_count() > 1:
        model = DataParallel(model)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode='max',
                                         factor=args.factor,
                                         patience=args.patience,
                                         min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         args.t_max,
                                         eta_min=args.min_lr)

    best_metrics = 0.
    best_key = 'dice'

    print(
        'epoch |    lr    |      %        |  loss  |  avg   |   loss |  dice  |  best  | time |  save |'
    )

    if not args.no_first_val:
        val_metrics = validate(args, model, loaders['valid'])
        print(
            'val   |          |               |        |        | {:.4f} | {:.4f} | {:.4f} |        |        |'
            .format(val_metrics['loss'], val_metrics['dice'],
                    val_metrics['dice']))

        best_metrics = val_metrics[best_key]

    if args.val:
        return

    model.train()

    #if args.lrs == 'plateau':
    #    lr_scheduler.step(best_metrics)
    #else:
    #    lr_scheduler.step()
    train_iter = 0

    for epoch in range(args.num_epochs):
        train_loss = 0

        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(loaders['train']):
            train_iter += 1
            img, targets = data[0].cuda(), data[1].cuda()
            batch_size = img.size(0)

            outputs = model(img)
            loss = _reduce_loss(criterion(outputs, targets))
            loss.backward()

            #with amp.scale_loss(loss*batch_size, optimizer) as scaled_loss:
            #    scaled_loss.backward()

            if batch_idx % 4 == 0:
                optimizer.step()
                optimizer.zero_grad()

            train_loss += loss.item()
            print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]),
                args.batch_size * (batch_idx + 1), loaders['train'].num,
                loss.item(), train_loss / (batch_idx + 1)),
                  end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                save_model(model, model_file + '_latest')
                val_metrics = validate(args, model, loaders['valid'])

                _save_ckp = ''
                if val_metrics[best_key] > best_metrics:
                    best_metrics = val_metrics[best_key]
                    save_model(model, model_file)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.2f} |  {:4s} |'.format(
                    val_metrics['loss'], val_metrics['dice'], best_metrics,
                    (time.time() - bg) / 60, _save_ckp))

                model.train()

                if args.lrs == 'plateau':
                    lr_scheduler.step(best_metrics)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
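# A minimal sketch of the argument namespace that the train(args) example above
# expects; all values are placeholders, and create_model, get_train_val_loaders,
# validate, save_model, get_lrs, criterion and _reduce_loss are assumed to be
# provided by the surrounding project.
import argparse

args = argparse.Namespace(
    encoder_type='resnet34',   # backbone name handed to create_model
    work_dir='./work_dir',     # where create_model keeps its checkpoints
    ckp=None,                  # optional checkpoint to resume from
    batch_size=16,
    optim_name='Adam',         # one of 'RAdam', 'Adam', 'SGD'
    lr=1e-4,
    lrs='plateau',             # 'plateau' -> ReduceLROnPlateau, otherwise cosine
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    t_max=10,
    no_first_val=False,        # validate once before training starts
    val=False,                 # if True, only run validation and return
    num_epochs=10,
    iter_val=200,              # validate every iter_val training iterations
)
# train(args)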
def train(model, optimizer, lr_scheduler, dataloaders, device, epochs):
    generator = model[0]
    discriminator = model[1]
    optimizer_G = optimizer[0]
    optimizer_D = optimizer[1]

    for e in range(epochs):
        for x, y in tqdm(dataloaders['train']):
            generator.train()
            discriminator.train()

            valid = torch.ones((x.shape[0], 1), requires_grad=False)
            fake = torch.zeros((x.shape[0], 1), requires_grad=False)
            sampled_latent = torch.tensor(
                np.random.normal(0, 1, (x.shape[0], latent_dim)),
                dtype=torch.float32).to(device=device)
            x = x.to(device=device)
            valid = valid.to(device=device)
            fake = fake.to(device=device)

            generated_imgs = generator(sampled_latent)

            ge_ = discriminator(generated_imgs)
            gt_ = discriminator(x)

            gen_loss = nn.BCELoss()(ge_, valid)
            optimizer_G.zero_grad()
            gen_loss.backward()
            optimizer_G.step()

            dis_loss = (nn.BCELoss()(discriminator(generated_imgs.detach()),
                                     fake) + nn.BCELoss()(gt_, valid)) / 2

            optimizer_D.zero_grad()
            dis_loss.backward()
            optimizer_D.step()
            if lr_scheduler:
                lr_scheduler.step()
        print('epoch %d, gen loss = %f, dis loss = %f' %
              (e, gen_loss.item(), dis_loss.item()))
        logging.info('epoch %d, gen loss = %f, dis loss = %f' %
                     (e, gen_loss.item(), dis_loss.item()))

        sample(model, device, e)

        writer.add_scalars("loss", {
            "GEN": gen_loss.item(),
            "DIS": dis_loss.item()
        }, e)

        save_model(save_dir='model_checkpoint',
                   file_name="check_point_G",
                   model=generator,
                   optimizer=optimizer_G,
                   lr_scheduler=lr_scheduler)
        save_model(save_dir='model_checkpoint',
                   file_name="check_point_D",
                   model=discriminator,
                   optimizer=optimizer_D,
                   lr_scheduler=lr_scheduler)

    save_model(save_dir='model_checkpoint',
               file_name=task_name + "_G",
               model=generator,
               optimizer=optimizer_G,
               lr_scheduler=lr_scheduler)
    save_model(save_dir='model_checkpoint',
               file_name=task_name + "_D",
               model=discriminator,
               optimizer=optimizer_D,
               lr_scheduler=lr_scheduler)

    return model
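# A minimal sketch of wiring up the GAN train() example above. The small MLP
# generator/discriminator and the MNIST loader are placeholder choices, and the
# module-level names the function relies on (latent_dim, writer, task_name,
# sample, save_model, logging) are assumed to exist in the surrounding script.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

latent_dim = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

generator = nn.Sequential(
    nn.Linear(latent_dim, 256), nn.ReLU(),
    nn.Linear(256, 28 * 28), nn.Sigmoid()).to(device)
discriminator = nn.Sequential(
    nn.Flatten(), nn.Linear(28 * 28, 256), nn.LeakyReLU(0.2),
    nn.Linear(256, 1), nn.Sigmoid()).to(device)

optimizer_G = optim.Adam(generator.parameters(), lr=2e-4)
optimizer_D = optim.Adam(discriminator.parameters(), lr=2e-4)

dataloaders = {'train': DataLoader(
    datasets.MNIST('./data', train=True, download=True,
                   transform=transforms.ToTensor()),
    batch_size=64, shuffle=True)}

# trained = train((generator, discriminator), (optimizer_G, optimizer_D),
#                 lr_scheduler=None, dataloaders=dataloaders,
#                 device=device, epochs=1)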