Example #1
def validate(args, val_loader, model, criterion, epoch):
    """
    Run evaluation
    """
    top1 = utils.AverageMeter()

    # switch to evaluate mode
    model = flopscounter.add_flops_counting_methods(model)
    model.eval().start_flops_count()
    model.reset_flops_count()

    num_step = len(val_loader)
    with torch.no_grad():
        for input, target in tqdm.tqdm(val_loader,
                                       total=num_step,
                                       ascii=True,
                                       mininterval=5):
            input = input.to(device=device, non_blocking=True)
            target = target.to(device=device, non_blocking=True)

            # compute output
            meta = {
                'masks': [],
                'device': device,
                'gumbel_temp': 1.0,
                'gumbel_noise': False,
                'epoch': epoch
            }
            output, meta = model(input, meta)
            output = output.float()

            # measure accuracy and record loss
            prec1 = utils.accuracy(output.data, target)[0]
            top1.update(prec1.item(), input.size(0))

            if args.plot_ponder:
                viz.plot_image(input)
                viz.plot_ponder_cost(meta['masks'])
                viz.plot_masks(meta['masks'])
                plt.show()

    print(f'* Epoch {epoch} - Prec@1 {top1.avg:.3f}')
    print(
        f'* FLOPS (multiply-accumulates, MACs) per image:  {model.compute_average_flops_cost()[0]/1e6:.6f} MMac'
    )
    model.stop_flops_count()
    return top1.avg
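Every example on this page records running statistics with a utils.AverageMeter helper that is not shown in the snippets. A minimal sketch of a typical implementation, assuming the common PyTorch ImageNet-example style (including the named variant AverageMeter('Loss', ':.4e') used in Examples #7-#9; the exact class in each repository may differ):

class AverageMeter:
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self, name='', fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # val is the value for the latest batch, n the number of samples it covers
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        return ('{name} {val' + self.fmt + '} ({avg' + self.fmt + '})').format(**self.__dict__)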
Example #2
def train(net, epoch, criterion, optimizer, trainloader, args):
    loss_meter = utils.AverageMeter()
    net.train()

    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        inputs = inputs.cuda()
        labels = labels.cuda()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        loss_meter.update(loss.item())
        optimizer.step()

        #running_loss += loss.item()
        if i % 1 == 0 and i > 0:
            print('[Epoch %02d, Minibatch %05d] Loss: %.5f' %
                  (epoch, i, loss_meter.average()))
Example #3
    def evaluate_accuracy(self, data_iterator, net):
        """
        compute top-1 accuracy and average loss
        :param data_iterator: iterable yielding (data, label) batches
        :param net: the network to evaluate
        :return: (top-1 accuracy, average loss)
        """
        loss = utils.AverageMeter()
        acc = mx.metric.Accuracy()
        for idx, (d, l) in enumerate(data_iterator):
            data = d.as_in_context(self.ctx[0])
            label = l.as_in_context(self.ctx[0])
            output = net(data)
            _loss = self.get_loss(output, label)
            curr_loss = nd.mean(_loss).asscalar()
            loss.update(curr_loss, data.shape[0])
            predictions = nd.argmax(output, axis=1)
            acc.update(preds=predictions, labels=label)
            utils.view_bar(idx + 1, len(data_iterator))  # update the progress bar
        return acc.get()[1], loss.avg
Example #4
    def extract(self):
        batch_time = utils.AverageMeter()

        self.model.eval()
        end = time.time()

        for batch_idx, (imgs, target, img_files,
                        class_ids) in tqdm.tqdm(enumerate(self.val_loader),
                                                total=len(self.val_loader),
                                                desc='Extract',
                                                ncols=80,
                                                leave=False):

            gc.collect()

            if self.cuda:
                imgs = imgs.cuda()
            imgs = Variable(imgs, volatile=True)
            output = self.model(imgs)  # N C H W torch.Size([1, 1, 401, 600])
            if self.flatten_feature:
                output = output.view(output.size(0), -1)
            output = output.data.cpu().numpy()

            assert output.shape[0] == len(img_files)
            for i, img_file in enumerate(img_files):
                base_name = os.path.splitext(img_file)[0]
                feature_file = os.path.join(self.feature_dir,
                                            base_name + ".npy")
                utils.create_dir(os.path.dirname(feature_file))
                np.save(feature_file, output[i])

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if batch_idx % self.print_freq == 0:
                log_str = 'Extract: [{0}/{1}]\tTime: {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                    batch_idx, len(self.val_loader), batch_time=batch_time)
                print(log_str)
                self.print_log(log_str)
Example #5
    def train(self):
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        D1 = utils.AverageMeter()
        EPE = utils.AverageMeter()

        # switch to train mode
        self.model.train()

        time_end = time.time()
        transform = myTransforms.Stereo_color(same_group=True)
        nedge = 64 if self.lossfun.flag_mask else 0
        for i, (batch, filenames) in enumerate(self.dataloader_train):
            assert batch.shape[2] >= 6
            if (self.use_cuda):
                batch = batch.cuda()
            bn, c, h, w = batch.shape
            assert h > 2 * nedge and w > 2 * nedge
            batch1 = self.flip_lr_tensor(batch)
            tmp = batch[:, :6, nedge:h - nedge, nedge:w - nedge]
            batch_aug = torch.zeros(tmp.shape).type_as(tmp)
            batch_aug.copy_(tmp)
            batch_aug = myTransforms.Stereo_color_batch(batch_aug, transform)
            batch1_aug = self.flip_lr_tensor(batch_aug)
            imL_pre = Variable(batch_aug[:, :3],
                               volatile=False,
                               requires_grad=False)
            imR_pre = Variable(batch_aug[:, 3:6],
                               volatile=False,
                               requires_grad=False)
            imL1_pre = Variable(batch1_aug[:, 3:6],
                                volatile=False,
                                requires_grad=False)
            imR1_pre = Variable(batch1_aug[:, :3],
                                volatile=False,
                                requires_grad=False)
            # measure data loading time
            data_time.update(time.time() - time_end)

            # compute output
            scale_dispLs, dispLs = self.model(imL_pre, imR_pre)
            scale_dispL1s, dispL1s = self.model(imL1_pre, imR1_pre)

            # compute loss
            imL = Variable(batch[:, :3, nedge:h - nedge, nedge:w - nedge],
                           volatile=False,
                           requires_grad=False)
            imR_src = Variable(batch[:, 3:6],
                               volatile=False,
                               requires_grad=False)
            imL1 = Variable(batch1[:, 3:6, nedge:h - nedge, nedge:w - nedge],
                            volatile=False,
                            requires_grad=False)
            imR1_src = Variable(batch1[:, :3],
                                volatile=False,
                                requires_grad=False)
            argst = {
                "imR_src": imR_src,
                "imL": imL,
                "dispLs": dispLs,
                "scale_dispLs": scale_dispLs,
                "LeftTop": [nedge, nedge],
                "imR1_src": imR1_src,
                "imL1": imL1,
                "dispL1s": dispL1s,
                "scale_dispL1s": scale_dispL1s,
                "LeftTop1": [nedge, nedge],
            }
            loss = self.lossfun(argst)
            losses.update(loss.data[0], imL.size(0))

            #            if(i < 5):
            #                # visualize images
            #                import matplotlib.pyplot as plt
            #                row, col = 4, 4
            #                plt.subplot(row, col, 1); plt.imshow(imL[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 2); plt.imshow(imR_src[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 3); plt.imshow(imL1[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 4); plt.imshow(imR1_src[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 5); plt.imshow(imL_pre[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 6); plt.imshow(imR_pre[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 7); plt.imshow(imL1_pre[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 8); plt.imshow(imR1_pre[0].data.cpu().numpy().transpose(1, 2, 0))
            #                for i in range(len(dispLs)):
            #                    plt.subplot(row, col, 9+i); plt.imshow(dispLs[i][0, 0].data.cpu().numpy())
            #                plt.show()

            # compute gradient and do SGD step
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

            # measure accuracy
            if (batch.shape[1] >= 7):
                dispL = batch[:, 6:7, nedge:h - nedge, nedge:w - nedge]
                d1, epe = self.accuracy(dispLs[0].data, dispL)
            else:
                d1, epe = -1, -1
            D1.update(d1, imL.size(0))
            EPE.update(epe, imL.size(0))

            # measure elapsed time
            batch_time.update(time.time() - time_end)
            time_end = time.time()

            # log every print_freq steps
            if i % self.args.print_freq == 0:  # default=20
                print('Train: [{0}][{1}/{2}] | '
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) | '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) | '
                      'Loss {loss.val:.4f} ({loss.avg:.4f}) | '
                      'D1 {D1.val:.3f} ({D1.avg:.3f}) | '
                      'EPE {EPE.val:.3f} ({EPE.avg:.3f})'.format(
                          self.epoch,
                          i,
                          len(self.dataloader_train),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses,
                          D1=D1,
                          EPE=EPE))

        msg = 'mean train loss: %.3f | mean D1: %.3f | mean EPE: %.3f' % (
            losses.avg, D1.avg, EPE.avg)
        logging.info(msg)
        return losses.avg, EPE.avg, D1.avg
Example #6
def train(args, model, optimizer, criterion, dataloader_train, dataloader_val,
          writer, k_fold):
    best_pred, best_acc, best_jac, best_sen, best_spe = 0.0, 0.0, 0.0, 0.0, 0.0
    best_epoch = 0
    step = 0
    train_loss = u.AverageMeter()
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    with open("./logs/%s_%s.txt" % (args.net_work, args.net_index), "a") as f:
        print(current_time, file=f)
    for epoch in range(args.num_epochs):
        train_progressor = pb.Train_ProgressBar(mode='train',
                                                fold=k_fold,
                                                epoch=epoch,
                                                total_epoch=args.num_epochs,
                                                model_name=args.net_work,
                                                total=len(dataloader_train) *
                                                args.batch_size)
        lr = u.adjust_learning_rate(args, optimizer, epoch)
        model.train()

        for i, (data, label) in enumerate(dataloader_train):
            train_progressor.current = i * args.batch_size
            if torch.cuda.is_available() and args.use_gpu:
                data = data.cuda()
                label = label.cuda()
            main_out = model(data)
            # get weight_map
            weight_map = torch.zeros(args.num_classes).cuda()
            for t in range(args.num_classes):
                weight_map[t] = 1 / (torch.sum((label == t).float()) + 1.0)
            loss_aux = F.binary_cross_entropy_with_logits(main_out,
                                                          label,
                                                          weight=None)
            loss_main = criterion[1](main_out, label)
            loss = loss_main + loss_aux
            train_loss.update(loss.item(), data.size(0))
            train_progressor.current_loss = train_loss.avg
            train_progressor.current_lr = lr
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_progressor()
            step += 1
            if step % 10 == 0:
                writer.add_scalar('Train/loss_step_{}'.format(int(k_fold)),
                                  loss, step)
        train_progressor.done()
        writer.add_scalar('Train/loss_epoch_{}'.format(int(k_fold)),
                          float(train_loss.avg), epoch)
        Dice, Acc, jaccard, Sensitivity, Specificity = val(
            args, model, dataloader_val, k_fold, epoch)
        writer.add_scalar('Valid/Dice_val_{}'.format(int(k_fold)), Dice, epoch)
        writer.add_scalar('Valid/Acc_val_{}'.format(int(k_fold)), Acc, epoch)
        writer.add_scalar('Valid/Jac_val_{}'.format(int(k_fold)), jaccard,
                          epoch)
        writer.add_scalar('Valid/Sen_val_{}'.format(int(k_fold)), Sensitivity,
                          epoch)
        writer.add_scalar('Valid/Spe_val_{}'.format(int(k_fold)), Specificity,
                          epoch)

        is_best = Dice > best_pred
        if is_best:
            best_pred = max(best_pred, Dice)
            best_jac = max(best_jac, jaccard)
            best_acc = max(best_acc, Acc)
            best_sen = max(best_sen, Sensitivity)
            best_spe = max(best_spe, Specificity)
            best_epoch = epoch + 1
        checkpoint_dir = os.path.join(args.save_model_path, str(k_fold))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_latest_name = os.path.join(checkpoint_dir,
                                              'checkpoint_latest.path.tar')
        u.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_dice': best_pred
            },
            best_pred,
            epoch,
            is_best,
            checkpoint_dir,
            filename=checkpoint_latest_name)
    # Record all metrics from the epoch with the best segmentation result for this fold
    best_indicator_message = "f{} best pred in Epoch:{}\nDice={} Accuracy={} jaccard={} Sensitivity={} Specificity={}".format(
        k_fold, best_epoch, best_pred, best_acc, best_jac, best_sen, best_spe)
    with open("./logs/%s_%s_best_indicator.txt" %
              (args.net_work, args.net_index),
              mode='a') as f:
        print(best_indicator_message, file=f)
Example #7
def train_epoch(epoch,
                train_loader,
                model,
                criterion,
                optimizer,
                use_cuda=True):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(train_loader),
                                   batch_time,
                                   data_time,
                                   top1,
                                   top5,
                                   losses,
                                   prefix="Epoch: [{}]".format(epoch + 1))

    print_freq = len(train_loader) // 4 + 1
    all_preds = []
    all_labels = []
    model.train()
    end = time.time()
    for i, (paths, inputs, labels) in enumerate(train_loader):

        if use_cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        data_time.update(time.time() - end)

        # forward + backward + optimize
        if type(model).__name__ == 'Inception3' and model.aux_logits:
            outputs, aux_outputs = model(inputs)
            loss_aux = criterion(aux_outputs, labels)
            loss_final = criterion(outputs, labels)
            loss = loss_final + 0.4 * loss_aux
        else:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        acc1, acc5 = utils.accuracy(outputs, labels, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        # for confusion matrix calculation
        _, preds = outputs.topk(1, 1, True, True)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # zero the parameter gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print statistics
        if i % print_freq == 0 or i + 1 == len(train_loader):
            progress.print(i + 1)

    print(confusion_matrix(all_labels, all_preds))
    return top1.avg, top5.avg
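Examples #7 and #8 also pass their meters to a utils.ProgressMeter and measure top-k accuracy with utils.accuracy. A sketch of both helpers, assuming the common PyTorch ImageNet-example versions (Example #9 instead passes the meters as a list and calls progress.display(i), so its ProgressMeter differs slightly):

import torch

class ProgressMeter:
    """Prints the batch counter followed by every formatted meter."""

    def __init__(self, num_batches, *meters, prefix=""):
        num_digits = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(num_digits) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def print(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

def accuracy(output, target, topk=(1,)):
    """Computes precision@k for each requested k; returns a list of 1-element tensors."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res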
Example #8
def run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                    args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(train_dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    end = time.time()
    # train_dataloader is an iterable; each iteration yields one minibatch
    for i, data in enumerate(train_dataloader, 0):
        feat = data["x"]
        label = data["y"]
        num_frs = data["num_frs"]
        utt_ids = data["utt_ids"]

        x = feat.to(th.float32)
        y = label.squeeze(2).long()

        if th.cuda.is_available():
            x = x.cuda()
            y = y.cuda()

        x = x.transpose(0, 1)
        key_padding_mask = th.ones((x.size(1), x.size(0)))

        for utt in range(len(num_frs)):
            key_padding_mask[utt, :num_frs[utt]] = 0

        src_mask = None
        if (args.look_ahead > -1):
            src_mask = th.tril(th.ones(x.size(0), x.size(0)),
                               diagonal=args.look_ahead)
            src_mask = src_mask.float().masked_fill(src_mask == 0,
                                                    float('-inf')).masked_fill(
                                                        src_mask == 1,
                                                        float(0.0))
            src_mask = src_mask.cuda()

        key_padding_mask = key_padding_mask.bool().cuda()
        prediction = model(x, src_mask, key_padding_mask)
        prediction = prediction.transpose(0, 1).contiguous()
        loss = criterion(prediction.view(-1, prediction.size(2)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        #update lr
        step = len(train_dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_step, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        grad_norm.update(norm)

        # update loss
        losses.update(loss.item(), x.size(1))

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % args.print_freq == 0:
            #        if not args.hvd or hvd.rank() == 0:
            progress.print(i)
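Example #8 anneals the learning rate with utils.noam_decay(step, warmup_step, lr). A minimal sketch, assuming the standard Noam schedule (linear warmup, then inverse square-root decay) scaled so that the peak learning rate equals the configured base lr; the actual helper may normalize differently:

def noam_decay(step, warmup_step, base_lr):
    """Noam learning-rate schedule: linear warmup followed by step**-0.5 decay."""
    # the two branches meet at step == warmup_step, where the returned lr equals base_lr
    return base_lr * (warmup_step ** 0.5) * min(step ** -0.5, step * warmup_step ** -1.5)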
Example #9
def train_model(trainloader, testloader, net, device):
    if torch.cuda.device_count() > 1:
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        print("Activate multi GPU support.")
        net = nn.DataParallel(net)
    net.to(device)
    # define the loss function
    criterion = (nn.CrossEntropyLoss().cuda()
                 if torch.cuda.is_available() else nn.CrossEntropyLoss())
    # Scale the lr linearly with the batch size.
    # Should be 0.1 when batch_size=128
    initial_lr = 0.1 * batch_size / 128
    # initialize the optimizer
    optimizer = optim.SGD(net.parameters(),
                          lr=initial_lr,
                          momentum=0.9,
                          weight_decay=_WEIGHT_DECAY)
    # multiply the lr by 0.1 at 1/2 and 3/4 of the total number of epochs
    div = num_epoch // 4
    lr_decay_milestones = [div * 2, div * 3]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=lr_decay_milestones,
                                               gamma=0.1,
                                               last_epoch=_LAST_EPOCH)

    for epoch in range(num_epoch):  # loop over the dataset multiple times

        # set printing functions
        batch_time = util.AverageMeter('Time/batch', ':.3f')
        losses = util.AverageMeter('Loss', ':6.2f')
        top1 = util.AverageMeter('Acc', ':6.2f')
        progress = util.ProgressMeter(len(trainloader),
                                      [losses, top1, batch_time],
                                      prefix="Epoch: [{}]".format(epoch + 1))

        # switch the model to the training mode
        net.train()

        print('current learning rate = {}'.format(
            optimizer.param_groups[0]['lr']))

        # each epoch
        end = time.time()
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            for name, param in net.named_parameters():
                if 'threshold' in name:
                    loss += args.sigma * torch.norm(param - args.gtarget)
            loss.backward()
            optimizer.step()

            # measure accuracy and record loss
            _, batch_predicted = torch.max(outputs.data, 1)
            batch_accu = 100.0 * (batch_predicted
                                  == labels).sum().item() / labels.size(0)
            losses.update(loss.item(), labels.size(0))
            top1.update(batch_accu, labels.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 50 == 49:
                # print statistics every 50 mini-batches
                progress.display(i)  # i = batch id in the epoch

        # update the learning rate
        scheduler.step()

        # print test accuracy every few epochs
        if epoch % 10 == 9:
            print('epoch {}'.format(epoch + 1))
            test_accu(testloader, net, device)

    # save the model if required
    if args.save:
        print("Saving the trained model.")
        util.save_models(net.state_dict(), save_folder, suffix=_ARCH)

    print('Finished Training')
Example #10
def train(opt, logging):

    ## Data Prepare ##
    if opt.main_proc:
        logging.info("Building dataset")
                                           
    train_dataset = DeepSpeakerDataset(opt, os.path.join(opt.dataroot, 'dev'))
    train_loader = DeepSpeakerDataLoader(train_dataset, batch_size=1, num_workers=opt.num_workers, shuffle=True, pin_memory=True)
             
    val_dataset = DeepSpeakerTestDataset(opt, os.path.join(opt.dataroot, 'test'))
    val_loader = DeepSpeakerTestDataLoader(val_dataset, batch_size=1, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
    
    opt.in_size = train_dataset.in_size
    opt.out_size = train_dataset.class_nums  
    print('opt.in_size {} opt.out_size {}'.format(opt.in_size, opt.out_size))  
                                           
    if opt.main_proc:
        logging.info("Building dataset Sucessed")
    
    ##  Building Model ##
    if opt.main_proc:
        logging.info("Building Model")
    
    opt.model_type = opt.model_type_1
    model_1 = model_select(opt, seq_training=False) ## rnn ge2e
    opt.model_type = opt.model_type_2
    model_2 = model_select(opt, seq_training=False) ## cnn class
    embedding_size = opt.embedding_size
    opt.embedding_size = 2 * embedding_size
    margin = margin_select(opt)
    opt.embedding_size = embedding_size
    
    if opt.resume_1:
        model_1, opt.total_iters = load(model_1, opt.resume_1, 'state_dict')    
    if opt.resume_2:
        model_2, opt.total_iters = load(model_2, opt.resume_2, 'state_dict')
        margin, opt.total_iters = load(margin, opt.resume_2, 'margin_state_dict')
        
    if opt.resume:
        model_1, opt.total_iters = load(model_1, opt.resume, 'state_dict_1')
        model_2, opt.total_iters = load(model_2, opt.resume, 'state_dict_2')
        margin, opt.total_iters = load(margin, opt.resume, 'margin_state_dict')
        
    # define optimizers for different layer
    criterion = torch.nn.CrossEntropyLoss().to(opt.device)
    if opt.optim_type == 'sgd':
        optimizer = optim.SGD([
            {'params': model_1.parameters(), 'weight_decay': 5e-4},
            {'params': model_2.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4},
        ], lr=opt.lr, momentum=0.9, nesterov=True)
    elif opt.optim_type == 'adam':
        optimizer = optim.Adam([
            {'params': model_1.parameters(), 'weight_decay': 5e-4},
            {'params': model_2.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4},
        ], lr=opt.lr, betas=(opt.beta1, 0.999))
        
    scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=opt.lr_reduce_step, gamma=opt.lr_reduce_factor, last_epoch=-1)
        
    model_1.to(opt.device)
    model_2.to(opt.device)
    margin.to(opt.device)
    
    if opt.distributed:
        model_1 = DistributedDataParallel(model_1, device_ids=[opt.local_rank], output_device=opt.local_rank)
        model_2 = DistributedDataParallel(model_2, device_ids=[opt.local_rank], output_device=opt.local_rank)
        margin  = DistributedDataParallel(margin, device_ids=[opt.local_rank], output_device=opt.local_rank)
    if opt.main_proc:
        print(model_1)
        print(model_2)
        print(margin)
        logging.info("Building Model Sucessed") 
        
    best_perform_acc = 1.0
    
    losses = utils.AverageMeter()
    class_losses = utils.AverageMeter()
    embedding_losses = utils.AverageMeter()
    penalty_losses = utils.AverageMeter()

    # Initial performance
    if opt.main_proc:
        EER = union_evaluate(opt, model_1, model_2, val_loader, logging)
        best_perform_acc = EER
        print('>>Start performance: EER = {}<<'.format(best_perform_acc))
    
    save_model = model_1
    if isinstance(model_1, DistributedDataParallel):
        save_model = model_1.module
                            
    # Start Training
    total_iters = opt.total_iters
    for epoch in range(1, opt.total_epoch + 1):
        while True:
            model_1.train()
            model_2.train()
            margin.train()
            for i, (data) in enumerate(train_loader, start=0):
                if i == len(train_loader):
                    break

                optimizer.zero_grad()

                # Forward pass and compute the loss
                feature_input, spk_ids = data               
                feature_input = feature_input.to(opt.device)
                label = spk_ids.to(opt.device).squeeze(0)
                
                output_1, attn_1, w_1, b_1 = model_1(feature_input)                                
                output_2, attn_2, w_2, b_2 = model_2(feature_input)                
                margin_input = torch.cat((output_1, output_2), dim=1)
                margin_output = margin(margin_input, label)
                
                output_1 = save_model.normalize(output_1)  
                sim_matrix_out = save_model.similarity(output_1, w_1, b_1)  
                embedding_loss = opt.embedding_loss_lamda / (opt.speaker_num * opt.utter_num) * save_model.loss_cal(sim_matrix_out) 
                if opt.att_type == 'multi_attention' and attn_1 is not None:
                    penalty_loss = opt.penalty_loss_lamda * save_model.penalty_loss_cal(attn_1)
                else:
                    penalty_loss = 0
                class_loss = opt.class_loss_lamda * criterion(margin_output, label)
                loss = embedding_loss + penalty_loss + class_loss
                
                loss_dict_reduced = reduce_loss_dict(opt, {'embedding_loss': embedding_loss, 'penalty_loss': penalty_loss, 'class_loss': class_loss})                
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                loss_value = losses_reduced.item()
                embedding_loss_value = loss_dict_reduced['embedding_loss'].item()
                penalty_loss_value = loss_dict_reduced['penalty_loss'].item()
                class_loss_value = loss_dict_reduced['class_loss'].item()

                # Check the loss and skip invalid (inf) values
                inf = float("inf")
                if loss_value == inf or loss_value == -inf:
                    print("WARNING: received an inf loss, setting loss value to 0")
                    loss_value = 0
                    embedding_loss_value = 0
                    penalty_loss_value = 0
                    class_loss_value = 0
                    continue

                # Backward pass; check the gradients before updating
                loss.backward()
                if utils.check_grad(model_1.parameters(), opt.clip_grad, opt.ignore_grad) or utils.check_grad(model_2.parameters(), opt.clip_grad, opt.ignore_grad):
                    if opt.main_proc:
                        logging.info('Not a finite gradient or too big, ignoring')
                    optimizer.zero_grad()
                    continue
                optimizer.step()
    
                total_iters += opt.num_gpus

                # Update the loss for logging
                losses.update(loss_value)
                embedding_losses.update(embedding_loss_value)
                penalty_losses.update(penalty_loss_value)
                class_losses.update(class_loss_value)

                # Print the performance on the training dataset
                if total_iters % opt.print_freq == 0:
                    scheduler.step(total_iters)
                    if opt.main_proc:
                        lr = scheduler.get_lr()
                        if isinstance(lr, list):
                            lr = max(lr)
                        logging.info('==> Train set steps {} lr: {:.6f}, loss: {:.4f} [ class: {:.4f}, embedding: {:.4f}, penalty_loss {:.4f}]'.format(
                                     total_iters, lr, losses.avg, class_losses.avg, embedding_losses.avg, penalty_losses.avg))
        
                        if opt.distributed:
                            model_state_dict_1 = model_1.module.state_dict()
                            model_state_dict_2 = model_2.module.state_dict()
                            margin_state_dict = margin.module.state_dict()
                        else:
                            model_state_dict_1 = model_1.state_dict()
                            model_state_dict_2 = model_2.state_dict()
                            margin_state_dict = margin.state_dict()
                        state = {'state_dict_1': model_state_dict_1, 'total_iters': total_iters,
                                 'state_dict_2': model_state_dict_2, 'margin_state_dict': margin_state_dict}
                        filename = 'newest_model.pth'
                        if os.path.isfile(os.path.join(opt.model_dir, filename)):
                            shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'newest_model.pth_bak'))
                        utils.save_checkpoint(state, opt.model_dir, filename=filename)

                # Validate the trained model
                if total_iters % opt.validate_freq == 0:
                    EER = union_evaluate(opt, model_1, model_2, val_loader, logging)
                    ##scheduler.step(EER)
                    
                    if opt.main_proc and EER < best_perform_acc:
                        best_perform_acc = EER
                        print("Found better validated model (EER = %.3f), saving to model_best.pth" % (best_perform_acc))
                        
                        if opt.distributed:
                            model_state_dict_1 = model_1.module.state_dict()
                            model_state_dict_2 = model_2.module.state_dict()
                            margin_state_dict = margin.module.state_dict()
                        else:
                            model_state_dict_1 = model_1.state_dict()
                            model_state_dict_2 = model_2.state_dict()
                            margin_state_dict = margin.state_dict()
                        state = {'state_dict_1': model_state_dict_1, 'total_iters': total_iters,
                                 'state_dict_2': model_state_dict_2, 'margin_state_dict': margin_state_dict}
                        
                        filename = 'model_best.pth'
                        if os.path.isfile(os.path.join(opt.model_dir, filename)):
                            shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'model_best.pth_bak'))
                        utils.save_checkpoint(state, opt.model_dir, filename=filename)                             
    
                    model_1.train()
                    model_2.train()
                    margin.train()
                    losses.reset()
                    class_losses.reset()
                    embedding_losses.reset()
                    penalty_losses.reset()
    
                if total_iters > opt.max_iters and opt.main_proc:
                    logging.info('finish training, steps is  {}'.format(total_iters))
                    return model_1
Example #11
def validate(loader,
             ds_rd,
             model,
             criterion,
             n_iter=-1,
             logger=None,
             opts=None,
             if_svVis=False,
             visualizer=None):
    '''
    Loop through the loader, collect all results, and get predictions, ground truths and normalized distances.
    Uses flip testing for higher accuracy.
    From the predicted heatmaps, bounding boxes, original joints and joint weights, recover preds_ori, dists_nmd
    and PCKh (filtered by distance and joints_vis), print them, and save everything when if_svVis is set.
    :param loader:
    :param ds_rd: the dataset reader; gives the length and the flip pairs
    :param model:
    :param criterion:
    :param n_iter: limit on the number of iterations (-1 means no limit)
    :param logger:
    :param opts:
    :param if_svVis: whether to save visualization results
    :param visualizer:
    :return: dict with preds_ori, joints_ori, l_std_ori_all, err_nmd and pck
    '''
    batch_time = ut.AverageMeter()
    losses = ut.AverageMeter()
    acc = ut.AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = ds_rd.n_smpl
    n_jt = ds_rd.joint_num_ori

    # to accum rst
    preds_hm = []
    bbs = []
    li_joints_ori = []
    li_joints_vis = []
    li_l_std_ori = []
    with torch.no_grad():
        end = time.time()
        for i, inp_dct in enumerate(loader):
            # compute output
            input = inp_dct['pch']
            target = inp_dct['hms']
            target_weight = inp_dct['joints_vis']
            bb = inp_dct['bb']
            joints_ori = inp_dct['joints_ori']
            l_std_ori = inp_dct['l_std_ori']
            if i >= n_iter and n_iter > 0:  # limiting iters
                break
            outputs = model(input)
            if isinstance(outputs, list):
                output = outputs[-1]
            else:
                output = outputs
            output_ori = output.clone()  # original output of original image
            if opts.if_flipTest:
                input_flipped = input.flip(3).clone()  # flipped input
                outputs_flipped = model(input_flipped)  # flipped output
                if isinstance(outputs_flipped, list):
                    output_flipped = outputs_flipped[-1]
                else:
                    output_flipped = outputs_flipped
                output_flipped_ori = output_flipped.clone(
                )  # hm only head changed? not possible??
                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           ds_rd.flip_pairs)
                output_flipped = torch.from_numpy(
                    output_flipped.copy()).cuda()  # N x n_jt xh x w tch

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if_shiftHM = True  # no idea why
                if if_shiftHM:  # check original
                    # print('run shift flip')
                    output_flipped[:, :, :, 1:] = \
                     output_flipped.clone()[:, :, :, 0:-1]

                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)
            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred_hm = accuracy(output.cpu().numpy(),
                                                target.cpu().numpy())
            acc.update(avg_acc, cnt)

            # preds can be further refined with a subpixel trick, but this is already good enough.
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # keep rst
            preds_hm.append(pred_hm)  # already numpy, 2D
            bbs.append(bb.numpy())
            li_joints_ori.append(joints_ori.numpy())
            li_joints_vis.append(target_weight.cpu().numpy())
            li_l_std_ori.append(l_std_ori.numpy())

            if if_svVis and 0 == i % opts.svVis_step:
                sv_dir = opts.vis_test_dir  # exp/vis/Human36M
                # batch version
                mod0 = opts.mod_src[0]
                mean = ds_rd.means[mod0]
                std = ds_rd.stds[mod0]
                img_patch_vis = ut.ts2cv2(input[0], mean, std)  # to CV BGR
                img_patch_vis_flipped = ut.ts2cv2(input_flipped[0], mean,
                                                  std)  # to CV BGR
                # pseudo change
                cm = getattr(cv2, ds_rd.dct_clrMap[mod0])
                img_patch_vis = cv2.applyColorMap(img_patch_vis, cm)
                img_patch_vis_flipped = cv2.applyColorMap(
                    img_patch_vis_flipped, cm)

                # original version get img from the ds_rd , different size , plot ing will vary from each other
                # warp preds to ori
                # draw and save  with index.

                idx_test = i * opts.batch_size  # image index
                skels_idx = ds_rd.skels_idx
                # get pred2d_patch
                pred2d_patch = np.ones((n_jt, 3))  # 3rd for  vis
                pred2d_patch[:, :2] = pred_hm[0] / opts.out_shp[
                    0] * opts.sz_pch[1]  # only first
                vis.save_2d_skels(
                    img_patch_vis,
                    pred2d_patch,
                    skels_idx,
                    sv_dir,
                    suffix='-' + mod0,
                    idx=idx_test
                )  # make sub dir if needed, recover to test set index by indexing.
                # save the hm images. save flip test
                hm_ori = ut.normImg(
                    output_ori[0].cpu().numpy().sum(axis=0))  # rgb one
                hm_flip = ut.normImg(
                    output_flipped[0].cpu().numpy().sum(axis=0))
                hm_flip_ori = ut.normImg(
                    output_flipped_ori[0].cpu().numpy().sum(axis=0))
                # subFd = mod0+'_hmFlip_ori'
                # vis.save_img(hm_flip_ori, sv_dir, idx_test, sub=subFd)

                # combined
                # img_cb = vis.hconcat_resize([img_patch_vis, hm_ori, img_patch_vis_flipped, hm_flip_ori])        # flipped hm
                # subFd = mod0+'_cbFlip'
                # vis.save_img(img_cb, sv_dir, idx_test, sub=subFd)

            if i % opts.print_freq == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                 i, len(loader), batch_time=batch_time,
                 loss=losses, acc=acc)
                logger.info(msg)

    preds_hm = np.concatenate(preds_hm, axis=0)  # N x n_jt  x 2
    bbs = np.concatenate(bbs, axis=0)
    joints_ori = np.concatenate(li_joints_ori, axis=0)
    joints_vis = np.concatenate(li_joints_vis, axis=0)
    l_std_ori_all = np.concatenate(li_l_std_ori, axis=0)

    preds_ori = ut.warp_coord_to_original(preds_hm, bbs, sz_out=opts.out_shp)
    err_nmd = ut.distNorm(preds_ori, joints_ori, l_std_ori_all)
    ticks = np.linspace(0, 0.5, 11)  # 11 ticks
    pck_all = ut.pck(err_nmd, joints_vis, ticks=ticks)

    # save to plain format for easy processing
    rst = {
        'preds_ori': preds_ori.tolist(),
        'joints_ori': joints_ori.tolist(),
        'l_std_ori_all': l_std_ori_all.tolist(),
        'err_nmd': err_nmd.tolist(),
        'pck': pck_all.tolist()
    }

    return rst
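The flip test in Example #11 depends on a flip_back helper that mirrors the flipped heatmaps and swaps left/right joint channels. A sketch assuming the widely used HRNet-style implementation (the repository's own version may differ):

import numpy as np

def flip_back(output_flipped, matched_parts):
    """output_flipped: numpy array of shape (batch, joints, height, width)."""
    # undo the horizontal flip of the heatmaps
    output_flipped = output_flipped[:, :, :, ::-1]
    # swap the channels of each left/right joint pair
    for pair in matched_parts:
        tmp = output_flipped[:, pair[0], :, :].copy()
        output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
        output_flipped[:, pair[1], :, :] = tmp
    return output_flipped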
Example #12
    criterion = nn.CrossEntropyLoss()
    if cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optim = torch.optim.SGD(model.parameters(),
                            lr=train_cfg['lr'],
                            momentum=train_cfg['momentum'],
                            weight_decay=train_cfg['weight_decay'])

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optim, train_cfg['step_size'], gamma=train_cfg['gamma'], last_epoch=-1)
    lr_scheduler = None
    log = utils.print_log(configure['log_dir'], [net_cfg['type'], timestamp])
    log.write(str(net_cfg))
    log.write(str(train_cfg))
    epoch_time = utils.AverageMeter()
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    top1 = utils.AverageMeter()

    # --------------------train & validation & save checkpoint---------------- #
    epoch = 0
    last_iteration = 0
    print_freq = 1
    best_top1 = 0
    max_epoch = train_cfg['max_epoch']
    print("train max epoch {0}".format(max_epoch))
    for epoch in tqdm.trange(epoch, max_epoch, desc='Train',
                             ncols=80):  # set the progress bar width to 80 columns
        # self.epoch = epoch
Example #13
def train(opt, logging):
    
    ## Data Prepare ##
    if opt.main_proc:
        logging.info("Building dataset")
                                           
    train_dataset = DeepSpeakerUttDataset(opt, os.path.join(opt.dataroot, 'train'))
    if not opt.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=opt.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=opt.batch_size,
                                                    num_replicas=opt.num_gpus, rank=opt.local_rank)
    train_loader = DeepSpeakerUttDataLoader(train_dataset, num_workers=opt.num_workers, batch_sampler=train_sampler)
             
    val_dataset = DeepSpeakerTestDataset(opt, os.path.join(opt.dataroot, 'test'))
    val_loader = DeepSpeakerTestDataLoader(val_dataset, batch_size=1, num_workers=opt.num_workers, shuffle=False, pin_memory=True)
    
    opt.in_size = train_dataset.in_size
    opt.out_size = train_dataset.class_nums  
    print('opt.in_size {} opt.out_size {}'.format(opt.in_size, opt.out_size))  
                                           
    if opt.main_proc:
        logging.info("Building dataset Sucessed")
    
    ##  Building Model ##
    if opt.main_proc:
        logging.info("Building Model")
    
    model = model_select(opt)
    margin = margin_select(opt)
    
    if opt.resume:
        model, opt.total_iters = load(model, opt.resume, 'state_dict')
        margin, opt.total_iters = load(margin, opt.resume, 'margin_state_dict')
    
    # define optimizers for different layer
    criterion = torch.nn.CrossEntropyLoss().to(opt.device)
    if opt.optim_type == 'sgd':
        optimizer = optim.SGD([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr, momentum=0.9, nesterov=True)
    elif opt.optim_type == 'adam':
        optimizer = optim.Adam([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr, betas=(opt.beta1, 0.999))
    elif opt.optim_type == 'radam':
        optimizer = RAdam([
            {'params': model.parameters(), 'weight_decay': 5e-4},
            {'params': margin.parameters(), 'weight_decay': 5e-4}
        ], lr=opt.lr)
        
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 40], gamma=0.1)
        
    model.to(opt.device)
    margin.to(opt.device)
    
    if opt.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[opt.local_rank],
                                                          output_device=opt.local_rank)
        margin = torch.nn.parallel.DistributedDataParallel(margin, device_ids=[opt.local_rank],
                                                           output_device=opt.local_rank)
    if opt.main_proc:
        print(model)
        print(margin)
        logging.info("Building Model Sucessed") 
        
    best_perform_eer = 1.0
    
    losses = utils.AverageMeter()
    acc = utils.AverageMeter()

    # Initial performance
    if opt.main_proc:
        EER = evaluate(opt, model, val_loader, logging)
        best_perform_eer = EER
        print('>>Start performance: EER = {}<<'.format(best_perform_eer))
    
    total_iters = opt.total_iters
    for epoch in range(1, opt.total_epoch + 1):
        train_sampler.shuffle(epoch)
        scheduler.step()
        # train model
        if opt.main_proc:
            logging.info('Train Epoch: {}/{} ...'.format(epoch, opt.total_epoch))
        model.train()
        margin.train()

        since = time.time()
        for i, (data) in enumerate(train_loader, start=0):
            utt_ids, inputs, targets = data
            inputs, label = inputs.to(opt.device), targets.to(opt.device)
            optimizer.zero_grad()
            
            raw_logits, attn, w, b = model(inputs)
            output = margin(raw_logits, label)
            #loss = criterion(output, label)
            loss = cal_loss(output, label, criterion, smoothing=opt.smoothing)
            loss_dict_reduced = reduce_loss_dict(opt, {'loss': loss})
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()
            
            # Check the loss and skip invalid (inf) values
            inf = float("inf")
            if loss_value == inf or loss_value == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
                continue
                    
            loss.backward()
            if utils.check_grad(model.parameters(), opt.clip_grad, opt.ignore_grad):
                if opt.main_proc:
                    logging.info('Not a finite gradient or too big, ignoring')
                optimizer.zero_grad()
                continue
            optimizer.step()

            total_iters += opt.num_gpus
            losses.update(loss_value)
            
            # print train information
            if total_iters % opt.print_freq == 0 and opt.main_proc:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                correct = (np.array(predict.cpu()) == np.array(label.data.cpu())).sum()
                time_cur = (time.time() - since) / 100
                since = time.time()
                logging.info("Iters: {:0>6d}/[{:0>2d}], loss: {:.4f} ({:.4f}), train_accuracy: {:.4f}, time: {:.2f} s/iter, learning rate: {}".format(total_iters, epoch, loss_value, losses.avg, correct/total, time_cur, scheduler.get_lr()[0]))
              
            # save model
            if total_iters % opt.save_freq == 0 and opt.main_proc:
                logging.info('Saving checkpoint: {}'.format(total_iters))
                if opt.distributed:
                    model_state_dict = model.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    model_state_dict = model.state_dict()
                    margin_state_dict = margin.state_dict()
                state = {'state_dict': model_state_dict, 'margin_state_dict': margin_state_dict, 'total_iters': total_iters,}
                filename = 'newest_model.pth'
                if os.path.isfile(os.path.join(opt.model_dir, filename)):
                    shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'newest_model.pth_bak'))
                utils.save_checkpoint(state, opt.model_dir, filename=filename)
                    
            # Validate the trained model
            if total_iters % opt.validate_freq == 0:
                EER = evaluate(opt, model, val_loader, logging)
                ##scheduler.step(EER)
                
                if opt.main_proc and EER < best_perform_eer:
                    best_perform_eer = EER
                    logging.info("Found better validated model (EER = %.3f), saving to model_best.pth" % (best_perform_eer))
                    if opt.distributed:
                        model_state_dict = model.module.state_dict()
                        margin_state_dict = margin.module.state_dict()
                    else:
                        model_state_dict = model.state_dict()
                        margin_state_dict = margin.state_dict()
                    state = {'state_dict': model_state_dict, 'margin_state_dict': margin_state_dict, 'total_iters': total_iters,}  
                    filename = 'model_best.pth'
                    if os.path.isfile(os.path.join(opt.model_dir, filename)):
                        shutil.copy(os.path.join(opt.model_dir, filename), os.path.join(opt.model_dir, 'model_best.pth_bak'))                   
                    utils.save_checkpoint(state, opt.model_dir, filename=filename)

                model.train()
                margin.train()
                losses.reset()
                   
Example #14
def train(train_loader, model, optimizer, epoch, writer, logger, config):
    device = torch.device("cuda")
    if config.label_smooth > 0:
        criterion = CrossEntropyLabelSmooth(config.n_classes,
                                            config.label_smooth).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    losses = utils.AverageMeter()

    step_num = len(train_loader)
    cur_step = epoch * step_num
    cur_lr = optimizer.param_groups[0]['lr']
    if config.local_rank == 0:
        logger.info("Train Epoch {} LR {}".format(epoch, cur_lr))
        writer.add_scalar('train/lr', cur_lr, cur_step)

    model.train()

    for step, (X, y) in enumerate(train_loader):
        X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
        N = X.size(0)

        X, target_a, target_b, lam = data_utils.mixup_data(X,
                                                           y,
                                                           config.mixup_alpha,
                                                           use_cuda=True)

        optimizer.zero_grad()
        logits, logits_aux = model(X)
        # loss = criterion(logits, y)
        loss = data_utils.mixup_criterion(criterion, logits, target_a,
                                          target_b, lam)
        if config.aux_weight > 0:
            # loss_aux = criterion(logits_aux, y)
            loss_aux = data_utils.mixup_criterion(criterion, logits_aux,
                                                  target_a, target_b, lam)
            loss = loss + config.aux_weight * loss_aux

        if config.use_amp:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.module.parameters(), config.grad_clip)
        optimizer.step()

        prec1, prec5 = utils.accuracy(logits, y, topk=(1, 5))
        if config.distributed:
            reduced_loss = utils.reduce_tensor(loss.data, config.world_size)
            prec1 = utils.reduce_tensor(prec1, config.world_size)
            prec5 = utils.reduce_tensor(prec5, config.world_size)
        else:
            reduced_loss = loss.data

        losses.update(reduced_loss.item(), N)
        top1.update(prec1.item(), N)
        top5.update(prec5.item(), N)

        torch.cuda.synchronize()
        if config.local_rank == 0 and (step % config.print_freq == 0
                                       or step == step_num):
            logger.info(
                "Train: Epoch {:2d}/{} Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
                    epoch + 1,
                    config.epochs,
                    step,
                    step_num,
                    losses=losses,
                    top1=top1,
                    top5=top5))

        if config.local_rank == 0:
            writer.add_scalar('train/loss', reduced_loss.item(), cur_step)
            writer.add_scalar('train/top1', prec1.item(), cur_step)
            writer.add_scalar('train/top5', prec5.item(), cur_step)
            cur_step += 1

    if config.local_rank == 0:
        logger.info("Train: Epoch {:2d}/{} Final Prec@1 {:.4%}".format(
            epoch + 1, config.epochs, top1.avg))
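Example #14 mixes training samples with data_utils.mixup_data and data_utils.mixup_criterion. A sketch of the standard mixup helpers (following the reference implementation of Zhang et al.; the repository's own data_utils is assumed to behave the same way):

import numpy as np
import torch

def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Returns mixed inputs, the two target batches, and the mixing coefficient lambda."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size).cuda() if use_cuda else torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """The loss is the convex combination of the losses against both target batches."""
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)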
Example #15
def train(args, model, optimizer, criterion, dataloader_train, dataloader_val,
          writer, k_fold):
    best_pred, best_pre, best_rec, best_f1 = 0.0, 0.0, 0.0, 0.0
    best_epoch = 0
    step = 0
    train_loss = u.AverageMeter()
    top1_m = u.AverageMeter()
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    with open("./logs/%s.txt" % (args.model_name), "a") as f:
        print(current_time, file=f)
    for epoch in range(args.num_epochs):
        train_progressor = pb.Train_ProgressBar(mode='train',
                                                fold=k_fold,
                                                epoch=epoch,
                                                total_epoch=args.num_epochs,
                                                model_name=args.model_name,
                                                total=len(dataloader_train) *
                                                args.batch_size)
        lr = u.adjust_learning_rate(args, optimizer, epoch)
        model.train()
        for i, (data, label) in enumerate(dataloader_train):
            train_progressor.current = i * args.batch_size
            if torch.cuda.is_available() and args.use_gpu:
                data = data.cuda()
                label = label.cuda()
            pred = model(data)
            loss = criterion(pred, label)
            top1 = u.accuracy(pred, label)
            top1_m.update(top1[0], data.size(0))
            train_loss.update(loss.item(), data.size(0))
            train_progressor.current_loss = train_loss.avg
            train_progressor.current_lr = lr
            train_progressor.top1 = top1_m.avg
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_progressor()
            step += 1
            if step % 10 == 0:
                writer.add_scalar('Train/loss_step_{}'.format(int(k_fold)),
                                  loss, step)
        train_progressor.done()
        writer.add_scalar('Train/loss_epoch_{}'.format(int(k_fold)),
                          float(train_loss.avg), epoch)
        Accuracy, Precision, Recall, F1 = val(args, model, criterion,
                                              dataloader_val, epoch, k_fold)
        writer.add_scalar('Valid/Accuracy_val_{}'.format(int(k_fold)),
                          Accuracy, epoch)
        writer.add_scalar('Valid/Precision_val_{}'.format(int(k_fold)),
                          Precision, epoch)
        writer.add_scalar('Valid/Recall_val_{}'.format(int(k_fold)), Recall,
                          epoch)
        writer.add_scalar('Valid/F1_val_{}'.format(int(k_fold)), F1, epoch)

        is_best = Accuracy > best_pred
        if is_best:
            best_pred = max(best_pred, Accuracy)
            best_pre = max(best_pre, Precision)
            best_rec = max(best_rec, Recall)
            best_f1 = max(best_f1, F1)
            best_epoch = epoch + 1
        checkpoint_dir = os.path.join(args.save_model_path, str(k_fold))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_latest_name = os.path.join(checkpoint_dir,
                                              'checkpoint_latest.path.tar')
        # print(best_pred)
        u.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_dice': best_pred
            },
            best_pred,
            epoch,
            is_best,
            checkpoint_dir,
            filename=checkpoint_latest_name)
    # Record all metrics of the epoch with the best classification result for this fold
    best_indicator_message = "fold {} best pred in Epoch:{}\nAccuracy={} Precision={} Recall={} F1={}".format(
        k_fold, best_epoch, best_pred, best_pre, best_rec, best_f1)
    with open("./logs/%s_%s_best_indicator.txt" % (args.model_name, k_fold),
              mode='a') as f:
        print(best_indicator_message, file=f)
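
The u.accuracy / utils.accuracy helpers called above are also not part of these snippets; they return a list of precision@k values. A sketch of the common top-k recipe that matches the call sites here, assuming logits of shape (N, C) and integer targets of shape (N,):

import torch


def accuracy(output, target, topk=(1,)):
    """Compute precision@k for the specified values of k (a common recipe)."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the top-k predictions, shape (maxk, N)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
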
def main():
    global args
    args = get_parser().parse_args()
    LOGGER.info(args)

    # Get input image size and save name list.
    # Each line of data_list should contain
    # image_0, image_1, (optional) ground truth, (optional) ground truth mask.
    with open(args.data_list, 'r') as file_list:
        fnames = file_list.readlines()
        assert len(
            fnames[0].strip().split(' ')
        ) == 2 + args.evaluate + args.evaluate * args.additional_flow_masks
        input_size = cv2.imread(
            os.path.join(args.data_root, fnames[0].split(' ')[0])).shape
        if args.visualize or args.save_inputs or args.save_refined:
            names = [l.strip().split(' ')[0].split('/')[-1] for l in fnames]
            sub_folders = [
                l.strip().split(' ')[0][:-len(names[i])]
                for i, l in enumerate(fnames)
            ]
            names = [l.split('.')[0] for l in names]

    # Prepare data.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    target_height, target_width = get_target_size(input_size[0], input_size[1])
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=mean, std=std)])
    data = hd3data.HD3Data(
        mode='flow',
        data_root=args.data_root,
        data_list=args.data_list,
        label_num=args.evaluate + args.evaluate * args.additional_flow_masks,
        transform=transform,
        out_size=True)
    data_loader = torch.utils.data.DataLoader(
        data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Setup models.
    model_hd3 = hd3model.HD3Model('flow', args.encoder, args.decoder,
                                  [4, 4, 4, 4, 4], args.context).cuda()
    model_hd3 = torch.nn.DataParallel(model_hd3).cuda()
    model_hd3.eval()

    refinement_network = PPacNet(
        args.kernel_size_preprocessing, args.kernel_size_joint,
        args.conv_specification, args.shared_filters, args.depth_layers_prob,
        args.depth_layers_guidance, args.depth_layers_joint)
    model_refine = refinement_models.EpeNet(refinement_network).cuda()
    model_refine = torch.nn.DataParallel(model_refine).cuda()
    model_refine.eval()

    # Load indicated models.
    name_hd3_model = args.model_hd3_path
    if os.path.isfile(name_hd3_model):
        checkpoint = torch.load(name_hd3_model)
        model_hd3.load_state_dict(checkpoint['state_dict'])
        LOGGER.info("Loaded HD3 checkpoint '{}'".format(name_hd3_model))
    else:
        LOGGER.info("No checkpoint found at '{}'".format(name_hd3_model))

    name_refinement_model = args.model_refine_path
    if os.path.isfile(name_refinement_model):
        checkpoint = torch.load(name_refinement_model)
        model_refine.load_state_dict(checkpoint['state_dict'])
        LOGGER.info(
            "Loaded refinement checkpoint '{}'".format(name_refinement_model))
    else:
        LOGGER.info(
            "No checkpoint found at '{}'".format(name_refinement_model))

    if args.evaluate:
        epe_hd3 = utils.AverageMeter()
        outliers_hd3 = utils.AverageMeter()
        epe_refined = utils.AverageMeter()
        outliers_refined = utils.AverageMeter()

    if args.visualize:
        visualization_folder = os.path.join(args.save_folder, 'visualizations')
        utils.check_makedirs(visualization_folder)

    if args.save_inputs:
        input_folder = os.path.join(args.save_folder, 'hd3_inputs')
        utils.check_makedirs(input_folder)

    if args.save_refined:
        refined_folder = os.path.join(args.save_folder, 'refined_flow')
        utils.check_makedirs(refined_folder)

    # Start inference.
    with torch.no_grad():
        for i, (img_list, label_list, img_size) in enumerate(data_loader):
            if i % 10 == 0:
                LOGGER.info('Done with {}/{} batches'.format(
                    i, len(data_loader)))

            img_size = img_size.cpu().numpy()
            img_list = [img.to(torch.device("cuda")) for img in img_list]
            label_list = [
                label.to(torch.device("cuda")) for label in label_list
            ]

            # Resize input images.
            resized_img_list = [
                torch.nn.functional.interpolate(
                    img, (target_height, target_width),
                    mode='bilinear',
                    align_corners=True) for img in img_list
            ]

            # Get HD3 flow.
            output = model_hd3(
                img_list=resized_img_list,
                label_list=label_list,
                get_full_vect=True,
                get_full_prob=True,
                get_epe=args.evaluate)

            # Upscale flow to full resolution.
            for level, level_flow in enumerate(output['full_vect']):
                scale_factor = 1 / 2**(6 - level)
                output['full_vect'][level] = resize_dense_vector(
                    level_flow * scale_factor, img_size[0, 1], img_size[0, 0])
            hd3_flow = output['full_vect'][-1]

            # Evaluate HD3 output if required.
            if args.evaluate:
                epe_hd3.update(
                    losses.endpoint_error(hd3_flow, label_list[0]).mean().data,
                    hd3_flow.size(0))
                outliers_hd3.update(
                    losses.outlier_rate(hd3_flow, label_list[0]).mean().data,
                    hd3_flow.size(0))

            # Upscale and interpolate flow probabilities.
            probabilities = prob_utils.get_upsampled_probabilities_hd3(
                output['full_vect'], output['full_prob'])

            if args.save_inputs:
                save_hd3_inputs(
                    hd3_flow, probabilities, input_folder,
                    sub_folders[i * args.batch_size:(i + 1) * args.batch_size],
                    names[i * args.batch_size:(i + 1) * args.batch_size])
                continue

            # Refine flow with PPAC network.
            log_probabilities = prob_utils.safe_log(probabilities)
            output_refine = model_refine(
                hd3_flow,
                log_probabilities,
                img_list[0],
                label_list=label_list,
                get_loss=args.evaluate,
                get_epe=args.evaluate,
                get_outliers=args.evaluate)

            # Evaluate refined output if required
            if args.evaluate:
                epe_refined.update(output_refine['epe'].mean().data,
                                   hd3_flow.size(0))
                outliers_refined.update(output_refine['outliers'].mean().data,
                                        hd3_flow.size(0))

            # Save visualizations of optical flow if required.
            if args.visualize:
                refined_flow = output_refine['flow']
                ground_truth = None
                if args.evaluate:
                    ground_truth = label_list[0][:, :2]
                save_visualizations(
                    hd3_flow, refined_flow, ground_truth, visualization_folder,
                    sub_folders[i * args.batch_size:(i + 1) * args.batch_size],
                    names[i * args.batch_size:(i + 1) * args.batch_size])

            # Save refined optical flow if required.
            if args.save_refined:
                refined_flow = output_refine['flow']
                save_refined_flow(
                    refined_flow, refined_folder,
                    sub_folders[i * args.batch_size:(i + 1) * args.batch_size],
                    names[i * args.batch_size:(i + 1) * args.batch_size])

    if args.evaluate:
        LOGGER.info(
            'Accuracy of HD3 optical flow:      '
            'AEE={epe_hd3.avg:.4f}, Outliers={outliers_hd3.avg:.4f}'.format(
                epe_hd3=epe_hd3, outliers_hd3=outliers_hd3))
        if not args.save_inputs:
            LOGGER.info(
                'Accuracy of refined optical flow:  '
                'AEE={epe_refined.avg:.4f}, Outliers={outliers_refined.avg:.4f}'
                .format(
                    epe_refined=epe_refined,
                    outliers_refined=outliers_refined))
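
losses.endpoint_error and losses.outlier_rate above are project-specific, but the metrics themselves are standard optical-flow measures. A rough sketch, assuming flow and ground-truth tensors of shape (N, 2, H, W) and ignoring any validity mask the real implementation may apply:

import torch


def endpoint_error(flow, gt):
    """Per-sample average end-point error (AEE): mean L2 distance in pixels."""
    epe = torch.norm(flow - gt, p=2, dim=1)      # (N, H, W)
    return epe.flatten(1).mean(dim=1)            # (N,)


def outlier_rate(flow, gt, tau_abs=3.0, tau_rel=0.05):
    """KITTI-style outlier fraction: EPE > 3 px and > 5% of the GT magnitude."""
    epe = torch.norm(flow - gt, p=2, dim=1)
    mag = torch.norm(gt, p=2, dim=1).clamp(min=1e-6)
    outliers = (epe > tau_abs) & (epe / mag > tau_rel)
    return outliers.float().flatten(1).mean(dim=1)
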
Пример #17
0
def run_train_epoch(model, optimizer, dataloader, epoch, trans_model, tree,
                    supervision_opts, aligner, den, chain_opts, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    criterion = ops.ChainObjtiveFunction.apply
    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #word labels for se loss

        frame_shift = (epoch % supervision_opts.frame_subsampling_factor) * -1

        x = feat.to(th.float32)
        x = th.roll(x, frame_shift, 1)
        x = x.unfold(1, 1,
                     supervision_opts.frame_subsampling_factor).squeeze(-1)
        x = x.cuda()
        y = label.squeeze(2)

        loss = 0.0
        prediction = model(x)
        for j in range(len(num_frs)):
            trans_ids = y[j, :num_frs[j]].tolist()
            phone_ali = aligner.to_phone_alignment(trans_ids)

            phones = list()
            durations = list()
            for item in phone_ali:
                phones.append(item[0])
                durations.append(item[2])

            proto_supervision = kaldi_chain.alignment_to_proto_supervision(
                supervision_opts, phones, durations)
            supervision = kaldi_chain.proto_supervision_to_supervision(
                tree, trans_model, proto_supervision, True)

            loglike_j = prediction[j, :supervision.frames_per_sequence, :]
            loss += criterion(loglike_j, den, supervision, chain_opts)

        optimizer.zero_grad()
        loss.backward()

        #update lr
        step = len(dataloader) * epoch + i + 1
        lr = utils.noam_decay(step, args.warmup_steps, args.lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update the loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure the elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/chain.model.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
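
utils.noam_decay(step, args.warmup_steps, args.lr) is not shown either; the name points to the Noam schedule from the Transformer paper, which warms the learning rate up linearly for warmup_steps and then decays it as step**-0.5. A hypothetical implementation consistent with that call signature (the exact scaling in the authors' utils may differ):

def noam_decay(step, warmup_steps, base_lr):
    """Noam schedule: linear warm-up followed by inverse-square-root decay.

    Scaled so the peak learning rate equals base_lr at step == warmup_steps.
    """
    step = max(step, 1)
    scale = warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return base_lr * scale
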
Пример #18
0
def main():
    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    # np.random.seed(args.seed)

    saver = Saver(args)
    # set log
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p',
                        filename=os.path.join(saver.experiment_dir, 'log.txt'),
                        filemode='w')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)

    saver.create_exp_dir(scripts_to_save=glob.glob('*.py') +
                         glob.glob('*.sh') + glob.glob('*.yml'))
    saver.save_experiment_config()
    summary = TensorboardSummary(saver.experiment_dir)
    writer = summary.create_summary()
    best_pred = 0

    logging.info(args)

    device = torch.device('cuda')
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    #
    # ''' Compute FLOPs and Params '''
    # maml = Meta(args, criterion)
    # flops, params = get_model_complexity_info(maml.model, (84, 84), as_strings=False, print_per_layer_stat=True)
    # logging.info('FLOPs: {} MMac Params: {}'.format(flops / 10 ** 6, params))
    #
    # maml = Meta(args, criterion).to(device)
    # tmp = filter(lambda x: x.requires_grad, maml.parameters())
    # num = sum(map(lambda x: np.prod(x.shape), tmp))
    # #logging.info(maml)
    # logging.info('Total trainable tensors: {}'.format(num))

    # batch_size here means total episode number
    mini = MiniImagenet(args.data_path,
                        mode='train',
                        n_way=args.n_way,
                        k_shot=args.k_spt,
                        k_query=args.k_qry,
                        batch_size=args.batch_size,
                        resize=args.img_size,
                        task_id=None)
    mini_test = MiniImagenet(args.data_path,
                             mode='test',
                             n_way=args.n_way,
                             k_shot=args.k_spt,
                             k_query=args.k_qry,
                             batch_size=args.test_batch_size,
                             resize=args.img_size,
                             task_id=args.task_id)
    train_loader = DataLoader(mini,
                              args.meta_batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True)
    test_loader = DataLoader(mini_test,
                             args.meta_test_batch_size,
                             shuffle=True,
                             num_workers=args.num_workers,
                             pin_memory=True)
    ''' Decoding '''
    model = Network(args,
                    args.init_channels,
                    args.n_way,
                    args.layers,
                    criterion,
                    pretrained=True).cuda()
    inner_optimizer_theta = torch.optim.SGD(model.arch_parameters(),
                                            lr=args.update_lr_theta)
    #inner_optimizer_theta = torch.optim.SGD(model.arch_parameters(), lr=100)
    inner_optimizer_w = torch.optim.SGD(model.parameters(),
                                        lr=args.update_lr_w)

    # load state dict
    pretrained_path = '/data2/dongzelian/NAS/meta_nas/run_meta_nas/mini-imagenet/meta-nas/experiment_21/model_best.pth.tar'
    pretrain_dict = torch.load(pretrained_path)['state_dict_w']
    model_dict = {}
    state_dict = model.state_dict()
    for k, v in pretrain_dict.items():
        if k[6:] in state_dict:
            model_dict[k[6:]] = v
        else:
            print(k)
    state_dict.update(model_dict)
    model.load_state_dict(state_dict)
    #model._arch_parameters = torch.load(pretrained_path)['state_dict_theta']

    for step, (x_spt, y_spt, x_qry, y_qry) in enumerate(test_loader):
        x_spt, y_spt, x_qry, y_qry = x_spt.squeeze(0).to(device), y_spt.squeeze(0).to(device), \
                                     x_qry.squeeze(0).to(device), y_qry.squeeze(0).to(device)
        for k in range(args.update_step_test):
            logits = model(x_spt, alphas=model.arch_parameters())
            loss = criterion(logits, y_spt)

            inner_optimizer_w.zero_grad()
            inner_optimizer_theta.zero_grad()
            loss.backward()
            inner_optimizer_w.step()
            inner_optimizer_theta.step()

        genotype = model.genotype()
        logging.info(genotype)
        maml = Meta_decoding(args, criterion, genotype).to(device)
        #exit()
        #print(step)
        #print(genotype)

    for epoch in range(args.epoch):
        logging.info('--------- Epoch: {} ----------'.format(epoch))
        accs_all_train = []
        # # TODO: how to choose batch data to update theta?
        # valid_iterator = iter(train_loader)
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        update_w_time = utils.AverageMeter()
        end = time.time()
        for step, (x_spt, y_spt, x_qry, y_qry) in enumerate(train_loader):
            data_time.update(time.time() - end)
            x_spt, y_spt, x_qry, y_qry = x_spt.to(device), y_spt.to(
                device), x_qry.to(device), y_qry.to(device)
            # (x_search_spt, y_search_spt, x_search_qry, y_search_qry), valid_iterator = infinite_get(valid_iterator, train_loader)
            # x_search_spt, y_search_spt, x_search_qry, y_search_qry = x_search_spt.to(device), y_search_spt.to(device), x_search_qry.to(device), y_search_qry.to(device)
            accs, update_w_time = maml(x_spt, y_spt, x_qry, y_qry,
                                       update_w_time)
            accs_all_train.append(accs)
            batch_time.update(time.time() - end)
            end = time.time()
            writer.add_scalar('train/acc_iter', accs[-1],
                              step + len(train_loader) * epoch)
            if step % args.report_freq == 0:
                logging.info(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'W {update_w_time.val:.3f} ({update_w_time.avg:.3f})\t'
                    'training acc: {accs}'.format(epoch,
                                                  step,
                                                  len(train_loader),
                                                  batch_time=batch_time,
                                                  data_time=data_time,
                                                  update_w_time=update_w_time,
                                                  accs=accs))

            if step % args.test_freq == 0:
                test_accs, test_stds, test_ci95 = meta_test(
                    train_loader, test_loader, maml, device, epoch, writer)
                logging.info(
                    '[Epoch: {}]\t Test acc: {}\t Test ci95: {}'.format(
                        epoch, test_accs, test_ci95))

                # Save the best meta model.
                new_pred = test_accs[-1]
                if new_pred > best_pred:
                    is_best = True
                    best_pred = new_pred
                else:
                    is_best = False
                saver.save_checkpoint(
                    {
                        'epoch':
                        epoch + 1,
                        'state_dict':
                        maml.module.state_dict() if isinstance(
                            maml, nn.DataParallel) else maml.state_dict(),
                        'best_pred':
                        best_pred,
                    }, is_best)
Пример #19
0
def do_train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    top5 = utils.AverageMeter('Acc@5', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [
        batch_time, data_time, throughputs, 'images/s', losses, losses_id,
        losses_mag, top1, top5, learning_rate
    ]

    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output, x_norm = model(input, target)

        loss_id, loss_g, one_hot = criterion(output, target, x_norm)
        loss = loss_id + args.lambda_g * loss_g

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(args, output[0], target, topk=(1, 5))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        # compute gradient and do solver step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        duration = time.time() - end
        batch_time.update(duration)
        end = time.time()
        throughputs.update(args.batch_size / duration)

        if i % args.print_freq == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin,
                       args.u_margin)

        if args.vis_mag:
            if (i > 10000) and (i % 100 == 0):
                x_norm = x_norm.detach().cpu().numpy()
                cos_theta = torch.masked_select(
                    output[0], one_hot.bool()).detach().cpu().numpy()
                logit = torch.masked_select(
                    F.softmax(output[0], dim=1),
                    one_hot.bool()).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}'.format(args.pth_save_fold, epoch,
                                                    i), x_norm, logit,
                    cos_theta)
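
utils.ProgressMeter appears in two flavours in these snippets: built from individual meters and read with .print(i), or built from a list of meters and read with .display(i). A minimal sketch of the list-based variant, modelled on the widely copied PyTorch ImageNet example (the authors' version may differ in detail); it only requires each entry to have a sensible __str__, which is why a plain string like 'images/s' can sit in the meter list:

class ProgressMeter(object):
    """Print a batch-indexed prefix followed by every registered meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    @staticmethod
    def _get_batch_fmtstr(num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
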
Пример #20
0
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, aligner, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    if args.criterion == "mmi":
        criterion = ops.MMIFunction.apply
    else:
        criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #word labels for se loss

        x = feat.to(th.float32)
        y = label.long()
        x = x.cuda()
        y = y.cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))
        loss = args.ce_ratio * ce_loss

        for j in range(len(num_frs)):
            loglike = prediction[j, :, :]
            loglike_j = loglike[:num_frs[j], :]
            loglike_j = loglike_j - log_prior

            text = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            #text = ' '.join(str(k) for k in text)
            try:
                align_in = kaldi_matrix.Matrix(
                    loglike_j.detach().cpu().numpy())
                align_out = aligner.align(align_in, text)
                trans_ids = align_out["alignment"]

                if args.criterion == "mmi":
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids)
                else:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids, args.criterion, silence_ids)
                loss += se_loss.cuda()
            except Exception:
                print(
                    "Warning: failed to align utterance {}, skipping it for the SE loss"
                    .format(utt_ids[j]))

        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
Пример #21
0
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [batch_time, data_time, throughputs, 'images/s',
                         losses, losses_id, losses_mag, 
                         top1, learning_rate]

    progress = utils.ProgressMeter(
        len(train_loader),
        progress_template,
        prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)
        
        # x_norm does not need to be gathered, since the feature x stays on each rank
        target = ts.distributed.gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, 
                                                 target,
                                                 x_norm)
        loss = loss_id + args.lambda_g * loss_g
        # compute gradient and do solver step
        optimizer.zero_grad()

        # backward
        grad_scaler.scale(loss).backward()
        # update weights
        grad_scaler.step(optimizer)
        grad_scaler.update() 

        # syn for logging
        torch.cuda.synchronize()   

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        output = ts.distributed.gather(output[0], dim=-1)
        acc1, acc5 = accuracy(output, target, topk=(1, 5))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g*loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a,
                           args.l_margin, args.u_margin)
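
Stripped of the distributed gather and the logging, the mixed-precision part of this snippet is the standard torch.cuda.amp recipe: run the forward pass under autocast, scale the loss before backward so fp16 gradients do not underflow, and let the scaler perform (or skip) the optimizer step. A self-contained toy sketch of that pattern (the linear model and random batches are placeholders):

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler

model = nn.Linear(128, 10).cuda()            # toy model standing in for the real one
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = GradScaler()

for _ in range(10):                          # toy "loader": random batches
    input = torch.randn(32, 128, device='cuda')
    target = torch.randint(0, 10, (32,), device='cuda')

    optimizer.zero_grad()
    with autocast():                         # forward pass in mixed precision
        output = model(input)
        loss = criterion(output, target)

    scaler.scale(loss).backward()            # scale to avoid fp16 underflow
    scaler.step(optimizer)                   # unscales grads, skips step on inf/NaN
    scaler.update()                          # adapt the scale factor
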
Пример #22
0
def train(train_loader, val_loader, model, criterion, optimizer, epoch,
          converter):
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    # Switch to train mode
    for p in model.parameters():
        p.requires_grad = True
    model.train()

    end = time.time()
    for i, sample in enumerate(
            tqdm(train_loader, desc='Train Epoch {}'.format(epoch + 1))):

        # Adjust learning rate
        #scheduler.step()

        # Measure data loading time
        data_time.update(time.time() - end)

        # Zero out gradients so we can accumulate new ones over batches
        optimizer.zero_grad()

        # step 2. Get our inputs and targets ready for the network.
        images, targets = sample

        batch_size = images.size(0)
        encoded_targets, target_lengths = converter.encode(targets)

        # step 3. Run our forward pass.
        images = images.to(device)
        log_probs = model(images)
        input_lengths = torch.full((batch_size, ),
                                   log_probs.size(0),
                                   dtype=torch.int)

        # step 4. Compute the loss, gradients, and update the parameters
        loss = criterion(log_probs, encoded_targets, input_lengths,
                         target_lengths) / batch_size
        losses.update(loss.item())
        model.zero_grad()
        loss.backward()

        # Do one optimizer step; gradients accumulated over multiple batches are used
        optimizer.step()

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (i + 1) % args.display_interval == 0 or i == 0 or (
                i + 1) == len(train_loader):
            print('\nTrain: [{}/{}]\t'
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t'
                  'Load Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t'
                  'Loss {loss.val:.6f} ({loss.avg:.6f})'.format(
                      i + 1,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

        # Evaluate on validation set
        val_acc = 0.0
        val_loss = 100000.0
        if (i + 1) % args.val_interval == 0 or (i + 1) == len(train_loader):
            with torch.no_grad():
                val_acc, val_loss = validate(val_loader, model, criterion,
                                             epoch, converter)
            for p in model.parameters():
                p.requires_grad = True
            model.train()

        # Remember best accuracy and save checkpoint
        global is_best, best_accuracy
        is_best = val_acc > 0.0 and val_acc >= best_accuracy
        best_accuracy = max(val_acc, best_accuracy)

        if (i + 1) % args.save_interval == 0 or (i + 1) == len(train_loader):
            save_checkpoint(
                {
                    'arch': args.arch,
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_accuracy': best_accuracy,
                    'loss': val_loss,
                    'optimizer': optimizer.state_dict(),
                }, i + 1, is_best, args.checkpoint)

    return losses.avg
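
The CTC criterion used above expects log-probabilities of shape (T, N, C), padded integer targets, and explicit per-sample input/target lengths; dividing by batch_size matches a criterion built with reduction='sum'. A self-contained toy example of those shape conventions (class count, blank index and lengths are made-up values):

import torch
import torch.nn as nn

T, N, C, S = 50, 4, 37, 10                    # time steps, batch, classes (incl. blank), target length
log_probs = torch.randn(T, N, C).log_softmax(2)           # model output, (T, N, C)
targets = torch.randint(1, C, (N, S), dtype=torch.long)   # padded targets, labels 1..C-1
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), S, dtype=torch.long)

criterion = nn.CTCLoss(blank=0, reduction='sum')
loss = criterion(log_probs, targets, input_lengths, target_lengths) / N
print(loss.item())
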
Пример #23
0
def train(loader,
          ds_rd,
          model,
          criterion,
          optimizer,
          epoch,
          n_iter=-1,
          logger=None,
          opts=None,
          visualizer=None):
    '''
	Iterate through one epoch; return rst = {'losses', 'accs'}, each a list that can be used outside for updating.
	:param loader:
	:param model:
	:param criterion:
	:param optimizer:
	:param epoch: epoch index, used for printing info
	:param n_iter: number of iterations wanted, -1 for all iterations
	:param opts: keeps some additional controls
	:param visualizer: for visualization
	:return:
	'''
    batch_time = ut.AverageMeter()
    data_time = ut.AverageMeter()
    losses = ut.AverageMeter()
    acc = ut.AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()
    li_loss = []
    li_acc = []
    for i, inp_dct in enumerate(loader):
        # get items
        if i >= n_iter and n_iter > 0:  # break if iter is set and i is greater than that
            break
        input = inp_dct['pch']
        target = inp_dct['hms']  # 14 x 64 x 1??
        target_weight = inp_dct['joints_vis']  # per-joint weight: visible or not

        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        outputs = model(input)  # no need to cuda it?

        target = target.cuda(non_blocking=True)
        target_weight = target_weight.cuda(non_blocking=True)

        if isinstance(outputs, list):  # list multiple stage version
            loss = criterion(outputs[0], target, target_weight)
            for output in outputs[1:]:
                loss += criterion(output, target, target_weight)
        else:
            output = outputs
            loss = criterion(output, target, target_weight)

        # compute gradient and do update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))
        _, avg_acc, cnt, pred = accuracy(
            output.detach().cpu().numpy(),
            target.detach().cpu().numpy()
        )  # on heatmaps directly; distances normalized by 1/10 of the heatmap size (PCK@0.5); cnt: n_samples; pred: predicted coords
        acc.update(avg_acc, cnt)  # keep average acc

        if visualizer and 0 == i % opts.update_html_freq:  # update current result, get vis dict
            n_jt = ds_rd.joint_num_ori
            mod0 = opts.mod_src[0]
            mean = ds_rd.means[mod0]
            std = ds_rd.stds[mod0]
            img_patch_vis = ut.ts2cv2(
                input[0], mean,
                std)  # to CV BGR, mean std control channel detach inside
            # pseudo change
            cm = getattr(cv2, ds_rd.dct_clrMap[mod0])
            img_patch_vis = cv2.applyColorMap(img_patch_vis,
                                              cm)[..., ::-1]  # RGB

            # get pred
            pred2d_patch = np.ones((n_jt, 3))  # 3rd for  vis
            pred2d_patch[:, :2] = pred[0] / opts.out_shp[0] * opts.sz_pch[1]
            img_skel = vis.vis_keypoints(img_patch_vis, pred2d_patch,
                                         ds_rd.skels_idx)

            hm_gt = target[0].cpu().detach().numpy().sum(axis=0)  # HXW
            hm_gt = ut.normImg(hm_gt)

            hm_pred = output[0].detach().cpu().numpy().sum(axis=0)
            hm_pred = ut.normImg(hm_pred)
            img_cb = vis.hconcat_resize([img_skel, hm_gt, hm_pred])
            vis_dict = {'img_cb': img_cb}
            visualizer.display_current_results(vis_dict, epoch, False)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % opts.print_freq == 0:
            msg = 'Epoch: [{0}][{1}/{2}]\t' \
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                  'Speed {speed:.1f} samples/s\t' \
                  'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
                  'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
             epoch, i, len(loader), batch_time=batch_time,
             speed=input.size(0) / batch_time.val,
             data_time=data_time, loss=losses, acc=acc)
            logger.info(msg)
            li_loss.append(losses.val)  # the current loss
            li_acc.append(acc.val)

    return {'losses': li_loss, 'accs': li_acc}
Пример #24
0
def train(args, model, optimizer, criterion, dataloader_train, dataloader_val, writer=None):
    best_pred, best_acc, best_jac, best_sen, best_spe = 0.0, 0.0, 0.0, 0.0, 0.0
    best_epoch = 0
    end_epoch = None
    step = 0         # tensorboard-related step counter
    end_index = None   # can be set to 1 to jump straight into the val loop, for debugging
    current_time = datetime.now().strftime('%b%d %H:%M:%S')
    with open("./logs/%s.txt" % (args.net_work), "a") as f:
        print(current_time, file=f)
    for epoch in range(args.num_epochs):
        if epoch == end_epoch:
            break
        train_loss = u.AverageMeter()
        train_progressor = pb.Train_ProgressBar(mode='train',
                                                epoch=epoch,
                                                total_epoch=args.num_epochs,
                                                model_name=args.net_work,
                                                total=len(dataloader_train) *
                                                args.batch_size)
        lr = u.adjust_learning_rate(args, optimizer, epoch)
        model.train()

        for i, (data, label) in enumerate(dataloader_train):
            if i == end_index:
                break
            train_progressor.current = i * args.batch_size
            if torch.cuda.is_available() and args.use_gpu:
                data = data.cuda()
                label = label.cuda()
            output = model(data)  

            output = torch.sigmoid(output)
            loss_aux = criterion[0](output, label)
            loss_main = criterion[1](output, label)
            loss = loss_main + loss_aux
            train_loss.update(loss.item(), data.size(0))
            train_progressor.current_loss = train_loss.avg
            train_progressor.current_lr = lr
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_progressor()
            step += 1
            if step % 10 == 0:
                writer.add_scalar('Train/loss_step', loss, step)
        train_progressor.done()
        writer.add_scalar('Train/loss_epoch', float(train_loss.avg), epoch)
        Dice, Acc, jaccard, Sensitivity, Specificity = val(args, model, dataloader_val)
        writer.add_scalar('Valid/Dice_val', Dice, epoch)
        writer.add_scalar('Valid/Acc_val', Acc, epoch)
        writer.add_scalar('Valid/Jac_val', jaccard, epoch)
        writer.add_scalar('Valid/Sen_val', Sensitivity, epoch)
        writer.add_scalar('Valid/Spe_val', Specificity, epoch)

        is_best = Dice > best_pred
        if is_best:
            best_pred = max(best_pred, Dice)
            best_jac = max(best_jac, jaccard)
            best_acc = max(best_acc, Acc)
            best_sen = max(best_sen, Sensitivity)
            best_spe = max(best_spe, Specificity)
            best_epoch = epoch+1
        checkpoint_dir = os.path.join(args.save_model_path)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_latest_name = os.path.join(checkpoint_dir, 'checkpoint_latest.path.tar')
        u.save_checkpoint({
            'epoch': best_epoch,
            'state_dict': model.state_dict(),
            'best_dice': best_pred
        }, best_pred, epoch, is_best, checkpoint_dir, filename=checkpoint_latest_name)
    # Record all metrics of the epoch with the best segmentation result
    best_indicator_message = "best pred in Epoch:{}\nDice={:.4f} Accuracy={:.4f} jaccard={:.4f} Sensitivity={:.4f} Specificity={:.4f}".format(
        best_epoch, best_pred, best_acc, best_jac, best_sen, best_spe)
    end_time = datetime.now().strftime('%b%d %H:%M:%S')
    with open("./logs/%s_best_indicator.txt" % (args.net_work), mode='a') as f:
        print("end time: "+end_time, file=f)
        print(best_indicator_message, file=f)
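
The val() routine that produces Dice, Acc, jaccard, Sensitivity and Specificity is not part of this snippet. The Dice score tracked as best_pred is the standard overlap metric 2|A∩B| / (|A| + |B|); a sketch under the assumption of sigmoid probabilities and binary masks of shape (N, 1, H, W):

import torch


def dice_coefficient(pred, target, threshold=0.5, eps=1e-7):
    """Mean Dice score over a batch of binary segmentation masks."""
    pred = (pred > threshold).float()                    # binarize probabilities
    inter = (pred * target).sum(dim=(1, 2, 3))
    denom = pred.sum(dim=(1, 2, 3)) + target.sum(dim=(1, 2, 3))
    return ((2.0 * inter + eps) / (denom + eps)).mean()
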
Пример #25
0
                print('Test: [{0}][{1}/{2}] - loss = {3} , acc = {4}'.format(
                    epoch, i, len(val_loader), test_loss.avg, test_acc.avg))

        net.train()
        print('Test finished.')
        return test_acc.avg, test_loss.avg

    ### main training loop ###

    best_accuracy = 0
    best_epoch = 0
    step = 0

    for epoch in range(0, args.num_epochs):

        train_loss = utils.AverageMeter()
        train_acc = utils.AverageMeter()
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()

        # learning rate decay
        scheduler.step()

        end = time.time()

        # train for one epoch
        for i, data in enumerate(train_loader):

            states = net.init_hidden(is_train=True)

            if args.arch == 'Video' or args.arch == 'Audio':  # single modality
Пример #26
0
def do_train(train_loader, model, criterion, optimizer, grad_scaler, epoch,
             args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    data_time = utils.AverageMeter('Data', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.3f')
    top1 = utils.AverageMeter('Acc@1', ':6.2f')
    learning_rate = utils.AverageMeter('LR', ':.4f')
    throughputs = utils.AverageMeter('ThroughPut', ':.2f')

    losses_id = utils.AverageMeter('L_ID', ':.3f')
    losses_mag = utils.AverageMeter('L_mag', ':.6f')
    progress_template = [
        batch_time, data_time, throughputs, 'images/s', losses, losses_id,
        losses_mag, top1, learning_rate
    ]

    progress = utils.ProgressMeter(len(train_loader),
                                   progress_template,
                                   prefix="Epoch: [{}]".format(epoch))
    end = time.time()

    # update lr
    learning_rate.update(current_lr)

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        global iters
        iters += 1

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with autocast(enabled=args.amp_mode):
            output, x_norm = model(input, target)

        # x_norm does not need to be gathered, since the feature x stays on each rank
        target = mpu._gather(target, dim=0)

        # loss
        with autocast(enabled=args.amp_mode):
            loss_id, loss_g, one_hot = criterion(output, target, x_norm)
        loss = loss_id + args.lambda_g * loss_g * args.world_size
        # compute gradient and do solver step
        optimizer.zero_grad()

        # backward
        grad_scaler.scale(loss).backward()
        # update weights
        grad_scaler.step(optimizer)
        grad_scaler.update()

        # syn for logging
        torch.cuda.synchronize()

        # measure elapsed time
        if args.rank == 0:
            duration = time.time() - end
            end = time.time()
            batch_time.update(duration)
            bs = args.batch_size
            throughputs.update(args.world_size * bs / duration)

        # measure accuracy and record loss
        acc1, _ = mpu.accuracy(args, output, target, topk=(1, 1))

        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        losses_id.update(loss_id.item(), input.size(0))
        losses_mag.update(args.lambda_g * loss_g.item(), input.size(0))

        if i % args.print_freq == 0 and args.rank == 0:
            progress.display(i)
            debug_info(x_norm, args.l_a, args.u_a, args.l_margin,
                       args.u_margin)

        if args.vis_mag:
            if (epoch == args.epochs - 1) and (i % 1000 == 0):
                one_hot = one_hot.bool()
                mask = torch.sum(one_hot, dim=1).bool()
                x_norm_cur_rank = torch.masked_select(
                    x_norm.squeeze(), mask).detach().cpu().numpy()
                cos_theta_cur_rank = torch.masked_select(
                    output[0], one_hot).detach().cpu().numpy()
                np.savez(
                    '{}/vis/epoch_{}_iter{}_rank_{}'.format(
                        args.pth_save_fold, epoch, i, args.rank),
                    x_norm_cur_rank, cos_theta_cur_rank)
Пример #27
0
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, args):
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')

    if args.criterion == "mmi":
        se_criterion = ops.MMIFunction.apply
    else:
        se_criterion = ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader, 0):
        feat = batch["x"]
        label = batch["y"]  #pdf-ids for ce loss
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  #trans_ids for se loss

        x = feat.to(th.float32)
        y = label.long()
        x = x.cuda()
        y = y.cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))

        se_loss = 0.0
        for j in range(len(num_frs)):
            log_like_j = prediction[j, :, :]
            log_like_j = log_like_j[:num_frs[j], :]
            log_like_j = log_like_j - log_prior
            #trans_id = label[j, :num_frs[j], 0].tolist()
            trans_id = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            #    print(len(trans_id), num_frs[j])

            if args.criterion == "mmi":
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id)
            else:
                se_loss += se_criterion(log_like_j, asr_decoder, trans_model,
                                        trans_id, args.criterion, silence_ids)

        loss = se_loss.cuda() + args.ce_ratio * ce_loss
        optimizer.zero_grad()
        loss.backward()

        # Gradient Clipping (th 5.0)
        norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()

        grad_norm.update(norm)

        # update loss
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # measure elapsed time
        batch_time.update(time.time() - end)

        # save model
        if hvd.rank() == 0 and i % args.save_freq == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if hvd.rank() == 0 and i % args.print_freq == 0:
            progress.print(i)
    def train_epoch(self):
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        top5 = utils.AverageMeter()

        self.model.train()
        self.optim.zero_grad()

        end = time.time()
        for batch_idx, (imgs, target, img_files, class_ids) in tqdm.tqdm(
                enumerate(self.train_loader),
                total=len(self.train_loader),
                desc='Train epoch={}, iter={}'.format(self.epoch,
                                                      self.iteration),
                ncols=80,
                leave=False):
            iteration = batch_idx + self.epoch * len(self.train_loader)
            data_time.update(time.time() - end)

            gc.collect()

            if self.iteration != 0 and (iteration - 1) != self.iteration:
                continue  # for resuming
            self.iteration = iteration

            if (self.iteration + 1) % self.interval_validate == 0:
                self.validate()

            if self.cuda:
                imgs, target = imgs.cuda(), target.cuda(non_blocking=True)
            # pdb.set_trace()
            imgs, target = Variable(imgs), Variable(target)

            output, out1, out2, out3, out4 = self.model(imgs)
            loss = self.criterion(output, target)
            if np.isnan(float(loss.data[0])):
                raise ValueError('loss is nan while training')

            # measure accuracy and record loss
            prec1, prec5 = utils.accuracy(output.data,
                                          target.data,
                                          topk=(1, 5))
            losses.update(loss.data[0], imgs.size(0))
            top1.update(prec1[0], imgs.size(0))
            top5.update(prec5[0], imgs.size(0))

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if self.iteration % self.print_freq == 0:
                log_str = 'Train: [{0}/{1}/{top1.count:}]\tepoch: {epoch:}\titer: {iteration:}\t' \
                      'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Data: {data_time.val:.3f} ({data_time.avg:.3f})\t' \
                      'Loss: {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Prec@1: {top1.val:.3f} ({top1.avg:.3f})\t' \
                      'Prec@5: {top5.val:.3f} ({top5.avg:.3f})\tlr {lr:.6f}'.format(
                    batch_idx, len(self.train_loader), epoch=self.epoch, iteration=self.iteration,
                    lr=self.optim.param_groups[0]['lr'],
                    batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5)
                torch.save(self.model.state_dict(), self.model_dict)
                print(log_str)
                self.print_log(log_str)

            if self.lr_scheduler is not None:
                self.lr_scheduler.step()  # update lr


        log_str = 'Train_summary: [{0}/{1}/{top1.count:}]\tepoch: {epoch:}\titer: {iteration:}\t' \
                      'Time: {batch_time.avg:.3f}\tData: {data_time.avg:.3f}\t' \
                      'Loss: {loss.avg:.4f}\tPrec@1: {top1.avg:.3f}\tPrec@5: {top5.avg:.3f}\tlr {lr:.6f}'.format(
                    batch_idx, len(self.train_loader), epoch=self.epoch, iteration=self.iteration,
                    lr=self.optim.param_groups[0]['lr'],
                    batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5)
        print(log_str)
        self.print_log(log_str)
Пример #29
0
    def train(self):
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        D1 = utils.AverageMeter()
        EPE = utils.AverageMeter()

        # switch to train mode
        self.model.train()

        time_end = time.time()
        for i, (batch, filenames) in enumerate(self.dataloader_train):
            # measure data loading time
            assert batch.shape[1] >= 7
            if (self.use_cuda):
                batch = batch[:, :7].cuda()
            imL = batch[:, :3]
            imR = batch[:, 3:6]
            dispL = batch[:, 6:7]
            imL = Variable(imL, volatile=False, requires_grad=False)
            imR = Variable(imR, volatile=False, requires_grad=False)
            dispL = Variable(dispL, volatile=False, requires_grad=False)
            data_time.update(time.time() - time_end)

            # compute output
            scale_dispLs, dispLs = self.model(imL, imR)

            # compute loss
            argst = {
                "disp_gt": dispL,
                "disps": dispLs,
                "scale_disps": scale_dispLs,
                "flag_smooth": True,
            }
            loss = self.lossfun(argst)
            losses.update(loss.data[0], imL.size(0))

            #            if(i < 5):
            #                # visualize images
            #                import matplotlib.pyplot as plt
            #                row, col = 4, 3
            #                plt.subplot(row, col, 1); plt.imshow(imL[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 2); plt.imshow(imR[0].data.cpu().numpy().transpose(1, 2, 0))
            #                plt.subplot(row, col, 3); plt.imshow(dispL[0, 0].data.cpu().numpy())
            #                for i in range(len(dispLs)):
            #                    plt.subplot(row, col, 4+i); plt.imshow(dispLs[i][0, 0].data.cpu().numpy())
            #                plt.show()

            # compute gradient and do SGD step
            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

            # measure accuracy
            d1, epe = self.accuracy(dispLs[0].data, dispL.data)
            D1.update(d1, imL.size(0))
            EPE.update(epe, imL.size(0))

            # measure elapsed time
            batch_time.update(time.time() - time_end)
            time_end = time.time()

            # print progress every print_freq steps
            if i % self.args.print_freq == 0:  # default=20
                print('Train: [{0}][{1}/{2}] | '
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) | '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) | '
                      'Loss {loss.val:.4f} ({loss.avg:.4f}) | '
                      'D1 {D1.val:.3f} ({D1.avg:.3f}) | '
                      'EPE {EPE.val:.3f} ({EPE.avg:.3f})'.format(
                          self.epoch,
                          i,
                          len(self.dataloader_train),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses,
                          D1=D1,
                          EPE=EPE))

        msg = 'mean train loss: %.3f | mean D1: %.3f | mean EPE: %.3f' % (
            losses.avg, D1.avg, EPE.avg)
        logging.info(msg)
        return losses.avg, EPE.avg, D1.avg
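
self.accuracy(dispLs[0].data, dispL.data) returns the D1 and EPE values logged above, but the method itself is not shown. Both follow common stereo conventions: EPE is the mean absolute disparity error over valid pixels, and D1 (KITTI style) counts a pixel as erroneous when its error exceeds both 3 px and 5% of the ground-truth disparity. A sketch under those assumptions:

import torch


def disparity_metrics(disp_pred, disp_gt, tau_abs=3.0, tau_rel=0.05):
    """Return (D1, EPE) over pixels with valid (positive) ground truth."""
    valid = disp_gt > 0
    err = (disp_pred - disp_gt).abs()[valid]
    gt = disp_gt[valid]
    epe = err.mean()                                     # end-point error in px
    d1 = ((err > tau_abs) & (err > tau_rel * gt)).float().mean()
    return d1, epe
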
    def validate(self):
        batch_time = utils.AverageMeter()
        losses = utils.AverageMeter()
        top1 = utils.AverageMeter()
        top5 = utils.AverageMeter()

        training = self.model.training
        self.model.eval()

        end = time.time()
        for batch_idx, (imgs, target, img_files, class_ids) in tqdm.tqdm(
                enumerate(self.val_loader),
                total=len(self.val_loader),
                desc='Valid iteration={} epoch={}'.format(
                    self.iteration, self.epoch),
                ncols=80,
                leave=False):
            with torch.no_grad():
                gc.collect()
                if self.cuda:
                    imgs, target = imgs.cuda(), target.cuda(non_blocking=True)

                # pdb.set_trace()
                imgs = Variable(imgs, volatile=True)
                target = Variable(target, volatile=True)

                output, out1, out2, out3, out4 = self.model(imgs)
                loss = self.criterion(output, target)

                if np.isnan(float(loss.data[0])):
                    raise ValueError('loss is nan while validating')

                # measure accuracy and record loss
                prec1, prec5 = utils.accuracy(output.data,
                                              target.data,
                                              topk=(1, 5))
                losses.update(loss.data[0], imgs.size(0))
                top1.update(prec1[0], imgs.size(0))
                top5.update(prec5[0], imgs.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()
                if batch_idx % self.print_freq == 0:
                    log_str = 'Test: [{0}/{1}/{top1.count:}]\tepoch: {epoch:}\titer: {iteration:}\t' \
                          'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                          'Loss: {loss.val:.4f} ({loss.avg:.4f})\t' \
                          'Prec@1: {top1.val:.3f} ({top1.avg:.3f})\t' \
                          'Prec@5: {top5.val:.3f} ({top5.avg:.3f})\t'.format(
                        batch_idx, len(self.val_loader), epoch=self.epoch, iteration=self.iteration,
                        batch_time=batch_time, loss=losses, top1=top1, top5=top5)
                    print(log_str)
                    self.print_log(log_str)

        if self.cmd == 'train':
            is_best = top1.avg > self.best_top1
            self.best_top1 = max(top1.avg, self.best_top1)
            self.best_top5 = max(top5.avg, self.best_top5)

            log_str = 'Test_summary: [{0}/{1}/{top1.count:}] epoch: {epoch:} iter: {iteration:}\t' \
                  'BestPrec@1: {best_top1:.3f}\tBestPrec@5: {best_top5:.3f}\t' \
                  'Time: {batch_time.avg:.3f}\tLoss: {loss.avg:.4f}\t' \
                  'Prec@1: {top1.avg:.3f}\tPrec@5: {top5.avg:.3f}\t'.format(
                batch_idx, len(self.val_loader), epoch=self.epoch, iteration=self.iteration,
                best_top1=self.best_top1, best_top5=self.best_top5,
                batch_time=batch_time, loss=losses, top1=top1, top5=top5)
            print(log_str)
            self.print_log(log_str)

            checkpoint_file = os.path.join(self.checkpoint_dir,
                                           'checkpoint.pth.tar')
            torch.save(
                {
                    'epoch': self.epoch,
                    'iteration': self.iteration,
                    'arch': self.model.__class__.__name__,
                    'optim_state_dict': self.optim.state_dict(),
                    'model_state_dict': self.model.state_dict(),
                    'best_top1': self.best_top1,
                    'batch_time': batch_time,
                    'losses': losses,
                    'top1': top1,
                    'top5': top5,
                }, checkpoint_file)
            if is_best:
                shutil.copy(
                    checkpoint_file,
                    os.path.join(self.checkpoint_dir, 'model_best.pth.tar'))
            if (self.epoch + 1) % 10 == 0:  # save each 10 epoch
                shutil.copy(
                    checkpoint_file,
                    os.path.join(self.checkpoint_dir,
                                 'checkpoint-{}.pth.tar'.format(self.epoch)))

            if training:
                self.model.train()