    def train(self):
        # Setting the variables before starting the training
        avg_train_loss = AverageMeter()
        avg_train_acc = AverageMeter()
        best_val_acc = -np.inf

        for epoch in range(self.num_epochs):
            avg_train_loss.reset()
            avg_train_acc.reset()

            # Mini batch loop
            for batch_idx, batch in enumerate(tqdm(self.train_loader)):
                step = epoch * len(self.train_loader) + batch_idx

                # Get the model output for the batch and update the loss and accuracy meters
                train_loss, train_acc = self.train_step(batch)
                if self.args.scheduler == 'cycle':
                    self.scheduler.step()
                avg_train_loss.update([train_loss.item()])
                avg_train_acc.update([train_acc])

                # Save the step checkpoint if needed
                # if step % self.save_every == 0:
                #     step_chkpt_path = os.path.join(self.model_dir,
                #                                    'step_chkpt_{}_{}.pth'.format(epoch, step))
                #     print("Saving the model checkpoint for epoch {} at step {}".format(epoch, step))
                #     torch.save(self.model.state_dict(), step_chkpt_path)

                # Logging and validation check
                if step % self.print_every == 0:
                    print(
                        'Epoch {}, batch {}, step {}, '
                        'loss = {:.4f}, acc = {:.4f}, '
                        'running averages: loss = {:.4f}, acc = {:.4f}'.format(
                            epoch, batch_idx, step,
                            train_loss.item(), train_acc, avg_train_loss.get(),
                            avg_train_acc.get()))

                if step % self.val_every == 0:
                    val_loss, val_acc = self.val()
                    print('Val acc = {:.4f}, Val loss = {:.4f}'.format(
                        val_acc, val_loss))
                    if self.visualize:
                        self.writer.add_scalar('Val/loss', val_loss, step)
                        self.writer.add_scalar('Val/acc', val_acc, step)

                    # Update and save the best validation checkpoint if needed
                    if val_acc > best_val_acc:
                        best_val_acc = val_acc
                        best_chkpt_path = os.path.join(self.model_dir,
                                                       'best_ckpt.pth')
                        torch.save(self.model.state_dict(), best_chkpt_path)
                    if self.args.scheduler == 'plateau':
                        self.scheduler.step(val_acc)

                if self.visualize:
                    # Log training metrics to TensorBoard
                    self.writer.add_scalar('Train/loss', train_loss.item(),
                                           step)
                    self.writer.add_scalar('Train/acc', train_acc, step)
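Every snippet on this page leans on an AverageMeter helper for running statistics, but its definition is not shown and its interface even differs between examples (this first one calls update([value]) and get(); most of the later ones call update(value) and read .avg). A minimal sketch of the common val/avg/sum/count variant, offered only as an assumption about what the helper roughly looks like:

class AverageMeter:
    """Track the latest value and the running average of a metric."""

    def __init__(self, name=''):
        self.name = name
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # val is the newest measurement, n the number of samples it covers
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count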
Example #2
def evaluate(loader, model):
    print("Evaluate")

    # Set model to eval
    model.eval()

    accuracy = AverageMeter()
    positive_accuracy = AverageMeter()
    negative_accuracy = AverageMeter()
    y_true = None
    y_scores = None

    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(loader):
            x = x.to(device=device).to(torch.float32)
            y = y.to(device=device).to(torch.float32)

            scores = model(x)
            scores = torch.squeeze(scores, 2)

            y = torch.unsqueeze(y, 1)
            loss = criterion(scores, y)

            scores = torch.squeeze(scores, 1)
            y = torch.squeeze(y, 1)

            if y_true is None:
                y_true = y
                y_scores = scores
            else:
                y_true = torch.cat((y_true, y))
                y_scores = torch.cat((y_scores, scores))

            acc = get_accuracy(y, scores)
            # neg_acc, pos_acc = get_accuracy_per_class(y.cpu(), scores.cpu())

            accuracy.update(acc)

            # positive_accuracy.update(pos_acc)
            # negative_accuracy.update(neg_acc)

    auc = roc_auc_score(y_true.cpu(), y_scores.cpu())

    wandb.log({
        "valid_acc": accuracy.avg,
        #    "positive_acc": positive_accuracy.avg,
        #    "negative_acc": negative_accuracy.avg,
        "valid_loss": loss.item(),
        "AUC": auc
    })

    accuracy.reset()

    # Set model back to train
    model.train()
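evaluate above also depends on module-level device, criterion, wandb, and a get_accuracy helper defined elsewhere. Since the raw scores are fed straight into roc_auc_score, they are presumably logits for a binary task; under that assumption, a hedged sketch of what get_accuracy might look like:

def get_accuracy(y_true, scores, threshold=0.5):
    # Assumes binary labels in {0, 1} and raw logits as scores.
    preds = (torch.sigmoid(scores) >= threshold).float()
    return (preds == y_true).float().mean().item()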
Example #3
def test(cfg, model, logger, writer, metrics, tid_done):
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    test_loaders = [(tid, get_loader(cfg, False, tid)) for tid in range(tid_done+1)]
    avg_meter = AverageMeter()
    for tid, test_loader in test_loaders:
        avg_meter.reset()
        for idx, data in enumerate(test_loader):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            test_loss = criterion(output, y)
            pred = output.argmax(dim=1, keepdim=True)
            acc = metrics.accuracy(tid, tid_done, pred, y)
        metrics.avg_accuracy(tid, tid_done, len(test_loader.dataset))
        metrics.forgetting(tid, tid_done)
    logger.info(f'Task Done:{tid_done},\
                  Test Acc:{metrics.acc_task(tid_done)},\
                  Test Forgetting:{metrics.forgetting_task(tid_done)}')
Example #4
    def train(self):
        # Setting the variables before starting the training
        avg_train_loss = AverageMeter()
        avg_train_acc = AverageMeter()
        text_avg_train_acc = AverageMeter()

        best_val_acc = -np.inf

        for epoch in range(self.num_epochs):

            self.model.print_frozen()

            avg_train_loss.reset()
            avg_train_acc.reset()
            text_avg_train_acc.reset()

            # Mini batch loop
            for batch_idx, batch in enumerate(tqdm(self.train_loader)):
                step = epoch * len(self.train_loader) + batch_idx

                # Get the model output for the batch and update the loss and accuracy meters
                train_loss, train_acc, text_train_acc = self.train_step(batch)
                if self.args.scheduler == 'cycle':
                    self.scheduler.step()
                avg_train_loss.update([train_loss.item()])
                avg_train_acc.update([train_acc])
                text_avg_train_acc.update([text_train_acc])

                # Logging and validation check
                if step % self.print_every == 0:
                    print(
                        'Epoch {}, batch {}, step {}, '
                        'loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}, '
                        'running averages: loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}'
                        .format(epoch, batch_idx, step,
                                train_loss.item(), train_acc, text_train_acc,
                                avg_train_loss.get(), avg_train_acc.get(),
                                text_avg_train_acc.get()))

                if step % self.val_every == 0:
                    val_loss, val_acc, text_val_acc = self.val()
                    print(
                        'Val acc (audio) = {:.4f}, Val acc (text) = {:.4f}, Val loss = {:.4f}'
                        .format(val_acc, text_val_acc, val_loss))

                    # Update and save the best validation checkpoint if needed

                    audio_text_avg_acc = (val_acc + text_val_acc) / 2

                    if audio_text_avg_acc > best_val_acc:
                        best_val_acc = audio_text_avg_acc
                        #print('Start saving best checkpoint...')
                        best_chkpt_path = os.path.join(self.model_dir,
                                                       'best_ckpt.pth')
                        torch.save(self.model.state_dict(), best_chkpt_path)
                        #print('Done saving best checkpoint!!!')
                    if self.args.scheduler == 'plateau':
                        self.scheduler.step(audio_text_avg_acc)

            self.model.unfreeze_one_layer()
Example #5
def train_net(param, model, train_data, valid_data, plot=False, device='cuda'):
    # Initialize parameters
    model_name      = param['model_name']
    epochs          = param['epochs']
    batch_size      = param['batch_size']
    lr              = param['lr']
    gamma           = param['gamma']
    step_size       = param['step_size']
    momentum        = param['momentum']
    weight_decay    = param['weight_decay']

    disp_inter      = param['disp_inter']
    save_inter      = param['save_inter']
    min_inter       = param['min_inter']
    iter_inter      = param['iter_inter']

    save_log_dir    = param['save_log_dir']
    save_ckpt_dir   = param['save_ckpt_dir']
    load_ckpt_dir   = param['load_ckpt_dir']

    #
    scaler = GradScaler() 

    # Network parameters
    train_data_size = train_data.__len__()
    valid_data_size = valid_data.__len__()
    c, y, x = train_data.__getitem__(0)['image'].shape
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, num_workers=1)
    valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False, num_workers=1)
    optimizer = optim.AdamW(model.parameters(), lr=3e-4 ,weight_decay=weight_decay)
    #optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=momentum, weight_decay=weight_decay)
    #scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3, T_mult=2, eta_min=1e-5, last_epoch=-1)
    #criterion = nn.CrossEntropyLoss(reduction='mean').to(device)
    DiceLoss_fn=DiceLoss(mode='multiclass')
    SoftCrossEntropy_fn=SoftCrossEntropyLoss(smooth_factor=0.1)
    criterion = L.JointLoss(first=DiceLoss_fn, second=SoftCrossEntropy_fn,
                              first_weight=0.5, second_weight=0.5).cuda()
    logger = inial_logger(os.path.join(save_log_dir, time.strftime("%m-%d %H:%M:%S", time.localtime()) +'_'+model_name+ '.log'))

    # Main loop
    train_loss_total_epochs, valid_loss_total_epochs, epoch_lr = [], [], []
    train_loader_size = train_loader.__len__()
    valid_loader_size = valid_loader.__len__()
    best_iou = 0
    best_epoch=0
    best_mode = copy.deepcopy(model)
    epoch_start = 0
    if load_ckpt_dir is not None:
        ckpt = torch.load(load_ckpt_dir)
        epoch_start = ckpt['epoch']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])

    logger.info('Total Epoch:{} Image_size:({}, {}) Training num:{}  Validation num:{}'.format(epochs, x, y, train_data_size, valid_data_size))
    #
    for epoch in range(epoch_start, epochs):
        epoch_start = time.time()
        # Training phase
        model.train()
        train_epoch_loss = AverageMeter()
        train_iter_loss = AverageMeter()
        for batch_idx, batch_samples in enumerate(train_loader):
            data, target = batch_samples['image'], batch_samples['label']
            data, target = Variable(data.to(device)), Variable(target.to(device))
            with autocast(): #need pytorch>1.6
                pred = model(data)
                loss = criterion(pred, target)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            scheduler.step(epoch + batch_idx / train_loader_size) 
            image_loss = loss.item()
            train_epoch_loss.update(image_loss)
            train_iter_loss.update(image_loss)
            if batch_idx % iter_inter == 0:
                spend_time = time.time() - epoch_start
                logger.info('[train] epoch:{} iter:{}/{} {:.2f}% lr:{:.6f} loss:{:.6f} ETA:{}min'.format(
                    epoch, batch_idx, train_loader_size, batch_idx/train_loader_size*100,
                    optimizer.param_groups[-1]['lr'],
                    train_iter_loss.avg,spend_time / (batch_idx+1) * train_loader_size // 60 - spend_time // 60))
                train_iter_loss.reset()

        # Validation phase
        model.eval()
        valid_epoch_loss = AverageMeter()
        valid_iter_loss = AverageMeter()
        iou=IOUMetric(10)
        with torch.no_grad():
            for batch_idx, batch_samples in enumerate(valid_loader):
                data, target = batch_samples['image'], batch_samples['label']
                data, target = Variable(data.to(device)), Variable(target.to(device))
                pred = model(data)
                loss = criterion(pred, target)
                pred=pred.cpu().data.numpy()
                pred= np.argmax(pred,axis=1)
                iou.add_batch(pred,target.cpu().data.numpy())
                #
                image_loss = loss.item()
                valid_epoch_loss.update(image_loss)
                valid_iter_loss.update(image_loss)
                # if batch_idx % iter_inter == 0:
                #     logger.info('[val] epoch:{} iter:{}/{} {:.2f}% loss:{:.6f}'.format(
                #         epoch, batch_idx, valid_loader_size, batch_idx / valid_loader_size * 100, valid_iter_loss.avg))
            val_loss=valid_iter_loss.avg
            acc, acc_cls, iu, mean_iu, fwavacc=iou.evaluate()
            logger.info('[val] epoch:{} miou:{:.2f}'.format(epoch,mean_iu))
                

        # Save loss and lr
        train_loss_total_epochs.append(train_epoch_loss.avg)
        valid_loss_total_epochs.append(valid_epoch_loss.avg)
        epoch_lr.append(optimizer.param_groups[0]['lr'])
        # Save the model checkpoint
        if epoch % save_inter == 0 and epoch > min_inter:
            state = {'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
            filename = os.path.join(save_ckpt_dir, 'checkpoint-epoch{}.pth'.format(epoch))
            torch.save(state, filename)  # PyTorch >= 1.6 saves in a zipped format that older versions cannot load
        # Save the best model
        if mean_iu > best_iou:  # train_loss_per_epoch valid_loss_per_epoch
            state = {'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
            filename = os.path.join(save_ckpt_dir, 'checkpoint-best.pth')
            torch.save(state, filename)
            best_iou = mean_iu
            best_mode = copy.deepcopy(model)
            logger.info('[save] Best Model saved at epoch:{} ============================='.format(epoch))
        #scheduler.step()
        # Display loss
    # Training loss curve
    if plot:
        x = [i for i in range(epochs)]
        fig = plt.figure(figsize=(12, 4))
        ax = fig.add_subplot(1, 2, 1)
        ax.plot(x, smooth(train_loss_total_epochs, 0.6), label='train loss')
        ax.plot(x, smooth(valid_loss_total_epochs, 0.6), label='val loss')
        ax.set_xlabel('Epoch', fontsize=15)
        ax.set_ylabel('CrossEntropy', fontsize=15)
        ax.set_title('train curve', fontsize=15)
        ax.grid(True)
        plt.legend(loc='upper right', fontsize=15)
        ax = fig.add_subplot(1, 2, 2)
        ax.plot(x, epoch_lr,  label='Learning Rate')
        ax.set_xlabel('Epoch', fontsize=15)
        ax.set_ylabel('Learning Rate', fontsize=15)
        ax.set_title('lr curve', fontsize=15)
        ax.grid(True)
        plt.legend(loc='upper right', fontsize=15)
        plt.show()
            
    return best_mode, model
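train_net reads its whole configuration from a flat param dictionary. A hypothetical call could look like the following; every value here is illustrative, not taken from the original project:

param = {
    'model_name': 'unet', 'epochs': 30, 'batch_size': 8,
    'lr': 3e-4, 'gamma': 0.5, 'step_size': 10,
    'momentum': 0.9, 'weight_decay': 1e-4,
    'disp_inter': 1, 'save_inter': 5, 'min_inter': 10, 'iter_inter': 50,
    'save_log_dir': './logs', 'save_ckpt_dir': './ckpt', 'load_ckpt_dir': None,
}
# best_model, last_model = train_net(param, model, train_data, valid_data, plot=True)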
Example #6
def train_model(epoch, model, optimizer, lr_scheduler, loader, test_loader):
    global GLOBAL_STEP

    test_loader_it = iter(test_loader)

    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    val_acc_meter = AverageMeter()

    model.train()
    model.to(device)

    print('=' * 20 + "Model Training" + '=' * 20)

    loss_func = nn.CrossEntropyLoss()

    for i, batch in tqdm(enumerate(loader)):
        start = time.time()
        model.train()
        optimizer.zero_grad()
        model.zero_grad()
        sentence1, sentence2, label = batch
        label = label.to(device)
        pred = model((sentence1, sentence2))
        loss = loss_func(pred, label)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_bound)
        optimizer.step()
        acc = torch.mean((torch.max(pred, 1)[1] == label).type(torch.float))
        loss_meter.update(loss.item())
        acc_meter.update(acc.item())

        end = time.time()
        used_time = end - start

        if (GLOBAL_STEP) % args.log_every == 0:
            try:
                batch = next(test_loader_it)
            except StopIteration:
                test_loader_it = iter(test_loader)
                batch = next(test_loader_it)
            eval_loss, eval_acc = batch_eval(batch, model)
            val_loss_meter.update(eval_loss.item())
            val_acc_meter.update(eval_acc.item())
            lr = optimizer.param_groups[0]['lr']
            display = 'epoch=' + str(epoch) + \
                      '\tglobal_step=%d' % (GLOBAL_STEP) + \
                      '\tloss=%.4f' % (loss_meter.val) + \
                      '\tloss_avg=%.4f' % (loss_meter.avg) + \
                      '\tval_loss=%.4f' % (val_loss_meter.avg) + \
                      '\tacc=%.4f' % (acc_meter.avg) + \
                      '\tval_acc=%.4f' % (val_acc_meter.avg) + \
                      '\tlr=%.6f' % (lr) + \
                      '\t|g|=%.4f' % (grad_norm) + \
                      '\ttime=%.2fit/s' % (1. / used_time)

            tb_writer.add_scalar('Training/training_loss', loss_meter.avg,
                                 GLOBAL_STEP)
            tb_writer.add_scalar('Training/training_acc', acc_meter.avg,
                                 GLOBAL_STEP)
            tb_writer.add_scalar('Training/dev_loss', val_loss_meter.avg,
                                 GLOBAL_STEP)
            tb_writer.add_scalar('Training/dev_acc', val_acc_meter.avg,
                                 GLOBAL_STEP)

            tqdm.write(display)
            loss_meter.reset()
            acc_meter.reset()
            val_loss_meter.reset()
            val_acc_meter.reset()

        if (GLOBAL_STEP) % (args.log_every * 20) == 0:
            save_mode(epoch=epoch,
                      model=model,
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler)

        GLOBAL_STEP += 1
    return
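train_model relies on a batch_eval helper that is not shown here. A minimal sketch, assuming it simply mirrors the training forward pass on one held-out batch and returns tensor-valued loss and accuracy (both are read with .item() above):

def batch_eval(batch, model):
    # Score one validation batch without tracking gradients.
    model.eval()
    with torch.no_grad():
        sentence1, sentence2, label = batch
        label = label.to(device)
        pred = model((sentence1, sentence2))
        loss = nn.CrossEntropyLoss()(pred, label)
        acc = torch.mean((torch.max(pred, 1)[1] == label).type(torch.float))
    return loss, acc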
Example #7
def validate(args):
    setup_dllogger(0, filename=args.dllogger_file)

    if args.checkpoint != '':
        args.pretrained = True
    args.prefetcher = not args.no_prefetcher
    if args.waymo:
        assert args.waymo_val is not None

    memory_format = (torch.channels_last if args.memory_format == "nhwc" else
                     torch.contiguous_format)
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        torch.cuda.manual_seed_all(args.seed)
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()

        # Set device limit on the current device
        # cudaLimitMaxL2FetchGranularity = 0x05
        pValue = ctypes.cast((ctypes.c_int * 1)(),
                             ctypes.POINTER(ctypes.c_int))
        _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
        _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
        assert pValue.contents.value == 128
    assert args.rank >= 0

    # create model
    bench = create_model(args.model,
                         input_size=args.input_size,
                         num_classes=args.num_classes,
                         bench_task='predict',
                         pretrained=args.pretrained,
                         redundant_bias=args.redundant_bias,
                         checkpoint_path=args.checkpoint,
                         checkpoint_ema=args.use_ema,
                         soft_nms=args.use_soft_nms,
                         strict_load=False)
    input_size = bench.config.image_size
    data_config = bench.config

    param_count = sum([m.numel() for m in bench.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = bench.cuda().to(memory_format=memory_format)

    if args.distributed > 1:
        raise ValueError(
            "Evaluation is supported only on single GPU. args.num_gpu must be 1"
        )
        bench = DDP(
            bench, device_ids=[args.device]
        )  # torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))

    if args.waymo:
        annotation_path = args.waymo_val_annotation
        image_dir = args.waymo_val
    else:
        if 'test' in args.anno:
            annotation_path = os.path.join(args.data, 'annotations',
                                           f'image_info_{args.anno}.json')
            image_dir = 'test2017'
        else:
            annotation_path = os.path.join(args.data, 'annotations',
                                           f'instances_{args.anno}.json')
            image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir),
                            annotation_path, data_config)

    evaluator = COCOEvaluator(dataset.coco,
                              distributed=args.distributed,
                              waymo=args.waymo)

    loader = create_loader(dataset,
                           input_size=input_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           fill_color=args.fill_color,
                           num_workers=args.workers,
                           distributed=args.distributed,
                           pin_mem=args.pin_mem,
                           memory_format=memory_format)

    img_ids = []
    results = []
    dllogger_metric = {}
    bench.eval()
    batch_time = AverageMeter()
    throughput = AverageMeter()
    end = time.time()
    total_time_start = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            with torch.cuda.amp.autocast(enabled=args.amp):
                output = bench(input, target['img_scale'], target['img_size'])
            batch_time.update(time.time() - end)
            throughput.update(input.size(0) / batch_time.val)
            evaluator.add_predictions(output, target)
            torch.cuda.synchronize()

            # measure elapsed time
            if i == 9:
                batch_time.reset()
                throughput.reset()

            if args.rank == 0 and i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}]  '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s)  '
                    .format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                    ))
            end = time.time()

    dllogger_metric['total_inference_time'] = time.time() - total_time_start
    dllogger_metric['inference_throughput'] = throughput.avg
    dllogger_metric['inference_time'] = 1000 / throughput.avg
    total_time_start = time.time()
    mean_ap = 0.
    if not args.inference:
        if 'test' not in args.anno:
            mean_ap = evaluator.evaluate()
        else:
            evaluator.save_predictions(args.results)
        dllogger_metric['map'] = mean_ap
        dllogger_metric['total_eval_time'] = time.time() - total_time_start
    else:
        evaluator.save_predictions(args.results)

    if not args.distributed or args.rank == 0:
        dllogger.log(step=(), data=dllogger_metric, verbosity=0)

    return results
Example #8
    def train(self):
        criterion = nn.CrossEntropyLoss().to(self.device)
        optimizer = torch.optim.Adam(self.net.parameters(),
                                     lr=self.learning_rate)
        total_step = len(self.train_loader)
        scheduler = StepLR(optimizer, self.decay_epoch, gamma=0.5)

        color_avg = AverageMeter('color')
        season_avg = AverageMeter('season')
        #    part_avg = AverageMeter('part')
        style_avg = AverageMeter('style')
        category_avg = AverageMeter('category')

        color_plot = create_vis_plot('Epoch', 'Loss', 'Color')
        season_plot = create_vis_plot('Epoch', 'Loss', 'Season')
        #      part_plot = create_vis_plot('Epoch', 'Loss', 'Part')
        style_plot = create_vis_plot('Epoch', 'Loss', 'Style')
        category_plot = create_vis_plot('Epoch', 'Loss', 'Category')

        for epoch in range(self.epoch, self.num_epoch):
            color_avg.reset()
            season_avg.reset()
            #    part_avg.reset()
            style_avg.reset()
            category_avg.reset()

            correct_color = 0
            correct_style = 0
            #      correct_part = 0
            correct_season = 0
            correct_category = 0

            for step, (images, color, style, season,
                       category) in enumerate(self.train_loader):
                images = images.to(self.device)
                color = color.to(self.device)
                style = style.to(self.device)
                #        part = part.to(self.device)
                season = season.to(self.device)
                category = category.to(self.device)

                color_prediction, style_prediction, season_prediction, category_prediction = self.net(
                    images)

                color_loss = criterion(color_prediction, color)
                style_loss = criterion(style_prediction, style)
                #        part_loss = criterion(part_prediction, part)
                season_loss = criterion(season_prediction, season)
                category_loss = criterion(category_prediction, category)

                loss = color_loss + 2 * style_loss + season_loss + category_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                color_avg.update(color_loss.item())
                style_avg.update(style_loss.item())
                #     part_avg.update(part_loss.item())
                season_avg.update(season_loss.item())
                category_avg.update(category_loss.item())

                correct_color += color_prediction.argmax(
                    dim=1).eq(color).sum().item()
                correct_style += style_prediction.argmax(
                    dim=1).eq(style).sum().item()
                #      correct_part += part_prediction.argmax(dim=1).eq(part).sum().item()
                correct_season += season_prediction.argmax(
                    dim=1).eq(season).sum().item()
                correct_category += category_prediction.argmax(
                    dim=1).eq(category).sum().item()

                if step % 10 == 1:
                    print(
                        f'Epoch [{epoch}/{self.num_epoch}], Step: [{step}/{total_step}], Color Loss: {color_avg.avg:.4f},'
                        f'Season Loss: {season_avg.avg:.4f}, Style Loss: {style_avg.avg:.4f}, '
                        f'Category Loss: {category_avg.avg:.4f}')
                    print(
                        f'Color: {correct_color/((step+1)*self.batch_size)*100:.4f}%, Style: {correct_style/((step+1)*self.batch_size)*100:.4f}%, '
                        f'Season:{correct_season/((step+1)*self.batch_size)*100:.4f}%, Category: {correct_category/((step+1)*self.batch_size)*100:.4f}%'
                    )

            torch.save(
                self.net.state_dict(),
                f'{self.checkpoint_dir}/{self.backbone}_checkpoint-{epoch}.pth'
            )

            input_tensor = torch.rand(64, 3, 224, 224).to(self.device)

            #base_network = mobilenet_v2(pretrained=True)
            script_module = torch.jit.trace(self.net, input_tensor)
            # Saving deep learning models: Keras stores the model structure plus the parameters // PyTorch (Python) stores only the parameters (weights); LibTorch (C++) stores parameters plus structure. The .pt file here holds the traced (scripted) model.
            script_module.save("jyson_classification0116.pt")

            scheduler.step()

            update_vis_plot(epoch, color_avg.avg, color_plot, 'append')
            update_vis_plot(epoch, season_avg.avg, season_plot, 'append')
            #     update_vis_plot(epoch, part_avg.avg, part_plot, 'append')
            update_vis_plot(epoch, style_avg.avg, style_plot, 'append')
            update_vis_plot(epoch, category_avg.avg, category_plot, 'append')
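This example (and Example #12 below) plots through create_vis_plot / update_vis_plot wrappers around Visdom that are not shown. A rough sketch of what they might look like, assuming a module-level viz = visdom.Visdom() client:

import visdom

viz = visdom.Visdom()

def create_vis_plot(xlabel, ylabel, title):
    # Start a one-point line plot that later updates append to.
    return viz.line(X=[0], Y=[0],
                    opts=dict(xlabel=xlabel, ylabel=ylabel, title=title))

def update_vis_plot(x, y, window, update_type):
    # update_type is typically 'append'.
    viz.line(X=[x], Y=[y], win=window, update=update_type)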
Example #9
def train_val(model, args):

    train_dir = args.train_dir
    val_dir = args.val_dir

    config = Config(args.config)
    cudnn.benchmark = True

    #lspet dataset contains 10000 images, lsp dataset contains 2000 images.

    # train
    train_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
        'lspet', train_dir, 8,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(368),
            Mytransforms.RandomHorizontalFlip(),
        ])),
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)

    # val
    if args.val_dir is not None and config.test_interval != 0:
        # val
        val_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
            'lsp', val_dir, 8,
            Mytransforms.Compose([
                Mytransforms.TestResized(368),
            ])),
                                                 batch_size=config.batch_size,
                                                 shuffle=True,
                                                 num_workers=config.workers,
                                                 pin_memory=True)

    criterion = nn.MSELoss().cuda()

    params, multiple = get_parameters(model, config, False)

    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]
    end = time.time()
    iters = config.start_iters
    best_model = config.best_model

    heat_weight = 46 * 46 * 15 / 1.0

    while iters < config.max_iter:
        # Each iteration over train_loader increments i by 1; input holds 16 samples at a time
        for i, (input, heatmap, centermap,
                img_path) in enumerate(train_loader):

            learning_rate = adjust_learning_rate(
                optimizer,
                iters,
                config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter,
                multiple=multiple)
            data_time.update(time.time() - end)

            heatmap = heatmap.cuda(non_blocking=True)
            #print(heatmap)
            #sys.exit(1)
            centermap = centermap.cuda(non_blocking=True)

            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)
            centermap_var = torch.autograd.Variable(centermap)

            heat1, heat2, heat3, heat4, heat5, heat6 = model(
                input_var, centermap_var)

            loss1 = criterion(heat1, heatmap_var) * heat_weight
            loss2 = criterion(heat2, heatmap_var) * heat_weight
            loss3 = criterion(heat3, heatmap_var) * heat_weight
            loss4 = criterion(heat4, heatmap_var) * heat_weight
            loss5 = criterion(heat5, heatmap_var) * heat_weight
            loss6 = criterion(heat6, heatmap_var) * heat_weight

            loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
            losses.update(loss.item(), input.size(0))
            for cnt, l in enumerate([loss1, loss2, loss3, loss4, loss5,
                                     loss6]):
                losses_list[cnt].update(l.item(), input.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            #print(i,'\n')
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters,
                        config.display,
                        learning_rate,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
                for cnt in range(0, 6):
                    print(
                        'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'.
                        format(cnt + 1, loss1=losses_list[cnt]))

                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))
                #############    image write  ##################
                for cnt in range(config.batch_size):
                    kpts = get_kpts(heat6[cnt], img_h=368.0, img_w=368.0)
                    draw_paint(img_path[cnt], kpts, i, cnt)
                #######################################################
                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(6):
                    losses_list[cnt].reset()

            save_checkpoint({
                'iter': iters,
                'state_dict': model.state_dict(),
            }, 0, args.model_name)

            # val
            if args.val_dir is not None and config.test_interval != 0 and iters % config.test_interval == 0:

                model.eval()
                for j, (input, heatmap, centermap) in enumerate(val_loader):
                    heatmap = heatmap.cuda(non_blocking=True)
                    centermap = centermap.cuda(non_blocking=True)

                    input_var = torch.autograd.Variable(input)
                    heatmap_var = torch.autograd.Variable(heatmap)
                    centermap_var = torch.autograd.Variable(centermap)

                    heat1, heat2, heat3, heat4, heat5, heat6 = model(
                        input_var, centermap_var)

                    loss1 = criterion(heat1, heatmap_var) * heat_weight
                    loss2 = criterion(heat2, heatmap_var) * heat_weight
                    loss3 = criterion(heat3, heatmap_var) * heat_weight
                    loss4 = criterion(heat4, heatmap_var) * heat_weight
                    loss5 = criterion(heat5, heatmap_var) * heat_weight
                    loss6 = criterion(heat6, heatmap_var) * heat_weight

                    loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
                    losses.update(loss.item(), input.size(0))
                    for cnt, l in enumerate(
                        [loss1, loss2, loss3, loss4, loss5, loss6]):
                        losses_list[cnt].update(l.item(), input.size(0))

                    batch_time.update(time.time() - end)
                    end = time.time()
                    is_best = losses.avg < best_model
                    best_model = min(best_model, losses.avg)
                    save_checkpoint(
                        {
                            'iter': iters,
                            'state_dict': model.state_dict(),
                        }, is_best, args.model_name)

                    if j % config.display == 0:
                        print(
                            'Test Iteration: {0}\t'
                            'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                            'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                            'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                            format(j,
                                   config.display,
                                   batch_time=batch_time,
                                   data_time=data_time,
                                   loss=losses))
                        for cnt in range(0, 6):
                            print(
                                'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                                .format(cnt + 1, loss1=losses_list[cnt]))

                        print(
                            time.strftime(
                                '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                                time.localtime()))
                        batch_time.reset()
                        losses.reset()
                        for cnt in range(6):
                            losses_list[cnt].reset()

                model.train()
Example #10
    def train(self, train_loader, eval_loader):
        losses_1 = AverageMeter()
        losses_2 = AverageMeter()
        data_time = AverageMeter()
        batch_time = AverageMeter()
        end_time = time.time()
        if self.iter > self.max_iter:
            logging.info("Optimization is done !")
            sys.exit(0)
        for data in train_loader:
            self.model.train()
            # forward
            data_time.update(time.time() - end_time)
            data = self._read_inputs(data)
            # get loss
            loss, train_prec = self._forward(data)
            if isinstance(loss, tuple):
                losses_1.update(loss[0].item())
                losses_2.update(loss[1].item())
                total_loss = sum(loss)
            else:
                losses_1.update(loss.item())
                total_loss = loss
            # optimization
            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            # time for training(forward & loss computation & optimization) on one batch
            batch_time.update(time.time() - end_time)

            # log avg loss
            if self.iter > 0 and self.iter % self.cfg.TRAIN.PRINT_FREQ == 0:
                if isinstance(loss, tuple):
                    self.writer.add_scalar('loss/cls', losses_1.avg, self.iter)
                    self.writer.add_scalar('loss/box', losses_2.avg, self.iter)
                    loss_msg = f'avg_cls_loss:{losses_1.avg:.04f} avg_box_loss:{losses_2.avg:.04f}'
                else:
                    if self.replace_model_name:
                        self.writer.add_scalar(f'{self.model_name}_loss',
                                               losses_1.avg, self.iter)
                        loss_msg = f'avg_{self.model_name}_loss:{losses_1.avg:.04f}'
                    else:
                        self.writer.add_scalar('loss', losses_1.avg, self.iter)
                        loss_msg = f'avg_loss:{losses_1.avg:.04f}'

                logging.info(
                    f'epoch:{self.epoch:03d} '
                    f'{loss_msg:s} '
                    f'io_rate:{data_time.avg / batch_time.avg:.04f} '
                    f'samples/(gpu*s):{self.cfg.DATASET.IMG_NUM_PER_GPU / batch_time.avg:.02f}'
                )

                self.writer.add_scalar(
                    'speed/samples_per_second_per_gpu',
                    self.cfg.DATASET.IMG_NUM_PER_GPU / batch_time.avg,
                    self.iter)
                self.writer.add_scalar('speed/io_rate',
                                       data_time.avg / batch_time.avg,
                                       self.iter)
                if train_prec is not None:
                    logging.info(f'train precision: {train_prec}')
                losses_1.reset()
                losses_2.reset()

            # save checkpoint
            if self.iter > 0 and self.iter % self.cfg.TRAIN.SAVE_INTERVAL == 0:
                # evaluation
                if self.cfg.TRAIN.VAL_WHEN_TRAIN:
                    self.model.eval()
                    performance = self.evaluate(eval_loader)
                    self.writer.add_scalar(self.PI, performance, self.iter)
                    if self.PI == 'triplet_loss' and performance < self.best_performance:
                        self.is_best = True
                        self.best_performance = performance
                    elif performance > self.best_performance:
                        self.is_best = True
                        self.best_performance = performance
                    else:
                        self.is_best = False
                    logging.info(
                        f'Now: best {self.PI} is {self.best_performance}')
                else:
                    performance = -1

                # save checkpoint
                try:
                    state_dict = self.model.module.state_dict(
                    )  # remove prefix of multi GPUs
                except AttributeError:
                    state_dict = self.model.state_dict()

                if self.rank == 0:
                    if self.cfg.TRAIN.SAVE_EVERY_CHECKPOINT:
                        filename = f"{self.model_name}_epoch{self.epoch:03d}_iter{self.iter:06d}_checkpoint.pth"
                    else:
                        filename = "checkpoint.pth"
                    save_checkpoint(
                        {
                            'iter': self.iter,
                            'model': self.model_name,
                            f'performance/{self.PI}': performance,
                            'state_dict': state_dict,
                            'optimizer': self.optimizer.state_dict(),
                        },
                        self.is_best,
                        self.log_dir,
                        filename=filename)

            self.iter += 1
            end_time = time.time()
        self.epoch += 1
Example #11
    def train(self):
        criterion = nn.CrossEntropyLoss().to(self.device)
        optimizer = torch.optim.Adam(self.net.parameters(),
                                     lr=self.learning_rate)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            LambdaLR(self.num_epoch, self.epoch, self.decay_epoch).step)

        total_step = len(self.train_loader)
        losses = AverageMeter()
        accuracy = AverageMeter()
        accuracy_set, loss_set, lr_set, epoch_set = self.read_loss_info()

        loss_window = self.visdom.line(Y=[1])
        lr_window = self.visdom.line(Y=[1])
        accuracy_window = self.visdom.line(Y=[1])

        for epoch in range(self.epoch, self.num_epoch):
            losses.reset()
            for step, (images, labels) in enumerate(self.train_loader):
                images = images.to(self.device)
                labels = labels.to(self.device)

                outputs = self.net(images)
                loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                _, predicted = torch.max(outputs.data, 1)
                predicted = (predicted == labels).sum().item()

                losses.update(loss.item(), self.batch_size)
                accuracy.update(predicted / self.batch_size, self.batch_size)

                if step % 10 == 0:
                    print(
                        f'Epoch [{epoch}/{self.num_epoch}], Step [{step}/{total_step}], Loss: {losses.avg:.4f}, '
                        f'Accuracy: {accuracy.avg:.4f}')

            accuracy_set += [accuracy.avg]
            loss_set += [losses.avg]
            lr_set += [optimizer.param_groups[0]['lr']]
            epoch_set += [epoch]
            loss_window = self.visdom.line(Y=loss_set,
                                           X=epoch_set,
                                           win=loss_window,
                                           update='replace')
            lr_window = self.visdom.line(Y=lr_set,
                                         X=epoch_set,
                                         win=lr_window,
                                         update='replace')
            accuracy_window = self.visdom.line(Y=accuracy_set,
                                               X=epoch_set,
                                               win=accuracy_window,
                                               update='replace')

            self.save_loss_info(accuracy_set, loss_set, lr_set, epoch_set)
            torch.save(self.net.state_dict(),
                       '%s/vgg16-%d.pth' % (self.checkpoint_dir, epoch))
            lr_scheduler.step()
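Both this trainer and the next one build their schedule from a LambdaLR helper class (distinct from torch.optim.lr_scheduler.LambdaLR, which it is passed into). A sketch of the usual CycleGAN-style linear-decay version this appears to follow; treat it as an assumption rather than the project's actual code:

class LambdaLR:
    def __init__(self, num_epochs, offset, decay_start_epoch):
        self.num_epochs = num_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # Multiplier: 1.0 until decay_start_epoch, then linear decay to 0.
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.num_epochs - self.decay_start_epoch)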
Example #12
    def train(self):
        optimizer_ae = Adam(chain(self.Encoder.parameters(),
                                  self.Decoder.parameters()),
                            self.lr,
                            betas=(self.b1, self.b2),
                            weight_decay=self.weight_decay)
        optimizer_discriminator = Adam(self.Disciminator.parameters(),
                                       self.lr,
                                       betas=(self.b1, self.b2),
                                       weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer_ae,
            LambdaLR(self.num_epoch, self.epoch, self.decay_epoch).step)
        total_step = len(self.data_loader)

        perceptual_criterion = PerceptualLoss().to(self.device)
        content_criterion = nn.L1Loss().to(self.device)
        adversarial_criterion = nn.BCELoss().to(self.device)

        self.Encoder.train()
        self.Decoder.train()
        content_losses = AverageMeter()
        generator_losses = AverageMeter()
        perceptual_losses = AverageMeter()
        discriminator_losses = AverageMeter()
        ae_losses = AverageMeter()

        lr_window = create_vis_plot('Epoch', 'Learning rate', 'Learning rate')
        loss_window = create_vis_plot('Epoch', 'Loss', 'Total Loss')
        generator_loss_window = create_vis_plot('Epoch', 'Loss',
                                                'Generator Loss')
        discriminator_loss_window = create_vis_plot('Epoch', 'Loss',
                                                    'Discriminator Loss')
        content_loss_window = create_vis_plot('Epoch', 'Loss', 'Content Loss')
        perceptual_loss_window = create_vis_plot('Epoch', 'Loss',
                                                 'Perceptual Loss')

        if not os.path.exists(self.sample_dir):
            os.makedirs(self.sample_dir)
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        for epoch in range(self.epoch, self.num_epoch):
            content_losses.reset()
            perceptual_losses.reset()
            generator_losses.reset()
            ae_losses.reset()
            discriminator_losses.reset()
            for step, images in enumerate(self.data_loader):
                images = images.to(self.device)

                real_labels = torch.ones((images.size(0), 1)).to(self.device)
                fake_labels = torch.zeros((images.size(0), 1)).to(self.device)

                encoded_image = self.Encoder(images)

                binary_decoded_image = paq.compress(
                    encoded_image.cpu().detach().numpy().tobytes())
                # encoded_image = paq.decompress(binary_decoded_image)
                #
                # encoded_image = torch.from_numpy(np.frombuffer(encoded_image, dtype=np.float32)
                #                                  .reshape(-1, self.storing_channels, self.image_size // 8,
                #                                           self.image_size // 8)).to(self.device)

                decoded_image = self.Decoder(encoded_image)

                content_loss = content_criterion(images, decoded_image)
                perceptual_loss = perceptual_criterion(images, decoded_image)
                generator_loss = adversarial_criterion(
                    self.Disciminator(decoded_image), real_labels)
                # generator_loss = -self.Disciminator(decoded_image).mean()

                ae_loss = content_loss * self.content_loss_factor + perceptual_loss * self.perceptual_loss_factor + \
                          generator_loss * self.generator_loss_factor

                content_losses.update(content_loss.item())
                perceptual_losses.update(perceptual_loss.item())
                generator_losses.update(generator_loss.item())
                ae_losses.update(ae_loss.item())

                optimizer_ae.zero_grad()
                ae_loss.backward(retain_graph=True)
                optimizer_ae.step()

                interpolated_image = self.eta * images + (
                    1 - self.eta) * decoded_image
                gravity_penalty = self.Disciminator(interpolated_image).mean()
                real_loss = adversarial_criterion(self.Disciminator(images),
                                                  real_labels)
                fake_loss = adversarial_criterion(
                    self.Disciminator(decoded_image), fake_labels)
                discriminator_loss = (real_loss + fake_loss) * self.discriminator_loss_factor / 2 +\
                                     gravity_penalty * self.penalty_loss_factor

                # discriminator_loss = self.Disciminator(decoded_image).mean() - self.Disciminator(images).mean() + \
                #                      gravity_penalty * self.penalty_loss_factor

                optimizer_discriminator.zero_grad()
                discriminator_loss.backward(retain_graph=True)
                optimizer_discriminator.step()
                discriminator_losses.update(discriminator_loss.item())

                if step % 100 == 0:
                    print(
                        f"[Epoch {epoch}/{self.num_epoch}] [Batch {step}/{total_step}] [Learning rate {get_lr(optimizer_ae)}] "
                        f"[Content {content_loss:.4f}] [Perceptual {perceptual_loss:.4f}] [Gan {generator_loss:.4f}]"
                        f"[Discriminator {discriminator_loss:.4f}]")

                    save_image(
                        torch.cat([images, decoded_image], dim=2),
                        os.path.join(self.sample_dir,
                                     f"Sample-epoch-{epoch}-step-{step}.png"))

            update_vis_plot(epoch, ae_losses.avg, loss_window, 'append')
            update_vis_plot(epoch, generator_losses.avg, generator_loss_window,
                            'append')
            update_vis_plot(epoch, discriminator_losses.avg,
                            discriminator_loss_window, 'append')
            update_vis_plot(epoch, content_losses.avg, content_loss_window,
                            'append')
            update_vis_plot(epoch, perceptual_losses.avg,
                            perceptual_loss_window, 'append')
            update_vis_plot(epoch, get_lr(optimizer_ae), lr_window, 'append')

            lr_scheduler.step()

            torch.save(
                self.Encoder.state_dict(),
                os.path.join(self.checkpoint_dir, f"Encoder-{epoch}.pth"))
            torch.save(
                self.Decoder.state_dict(),
                os.path.join(self.checkpoint_dir, f"Decoder-{epoch}.pth"))
            torch.save(
                self.Disciminator.state_dict(),
                os.path.join(self.checkpoint_dir,
                             f"Discriminator-{epoch}.pth"))
Example #13
def val(model, args, val_loader, criterion, config):
    global e
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]
    end = time.time()
    iters = config.start_iters
    best_model = config.best_model

    heat_weight = 46 * 46 * 15 / 1.0

    # model.eval()
    model.detector.eval()
    model.flownet.eval()
    # model.eval()
    for j, (input, heatmap, centermap) in enumerate(val_loader):
        heatmap = heatmap.cuda(non_blocking=True)
        centermap = centermap.cuda(non_blocking=True)

        input_var = torch.autograd.Variable(input)
        heatmap_var = torch.autograd.Variable(heatmap)
        centermap_var = torch.autograd.Variable(centermap)

        output = model(input_var, centermap_var)

        loss_ = [criterion(ht, heatmap_var) * heat_weight for ht in output]

        loss = sum(loss_)
        losses.update(loss.item(), input.size(0))
        for cnt, l in enumerate(loss_):
            losses_list[cnt].update(l.item(), input.size(0))


        if j % config.display == 0:
            print('Valepoch: {2}/{3}\t'
                    'Test Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:3f})\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                j, config.display, e, args.n_epochs, batch_time=batch_time,
                data_time=data_time, loss=losses))
            for cnt in range(0, 6):
                print('Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                        .format(cnt + 1, loss1=losses_list[cnt]))

            print(time.strftime(
                '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                time.localtime()))
            batch_time.reset()
            losses.reset()
            for cnt in range(6):
                losses_list[cnt].reset()
        




def main(args):
    
    # build train and val set
    train_dir = args.train_dir
    val_dir = args.val_dir

    config = Config(args.config)
    cudnn.benchmark = True

    # train
    train_loader = torch.utils.data.DataLoader(
        lsp_lspet_data.LSP_Data('lspet', train_dir, 8,
                Mytransforms.Compose([Mytransforms.RandomResized(),
                Mytransforms.RandomRotate(40),
                Mytransforms.RandomCrop(368),
                Mytransforms.RandomHorizontalFlip(),
            ])),
            batch_size=config.batch_size, shuffle=True,
            num_workers=config.workers, pin_memory=True)
    # val
    val_loader = None
    if args.val_dir is not None and config.test_interval != 0:
        # val
        val_loader = torch.utils.data.DataLoader(
            lsp_lspet_data.LSP_Data('lsp', val_dir, 8,
                              Mytransforms.Compose([Mytransforms.TestResized(368),
                                                    ])),
            batch_size=config.batch_size, shuffle=False,
            num_workers=config.workers, pin_memory=True)
    
    # build model
    model = MSBR(config=config, args=args, k=14, stages=config.stages)

    model.build_nets()


    return model, train_loader, val_loader


    


if __name__ == '__main__':

    # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    args = parse()
    model, train_loader, val_loader = main(args)

    if args.pretrained_d != 'None' and args.val_dir is not None and config.test_interval != 0:
        val_loss_d, val_loss_f = val(model, args, val_loader, criterion, config)
    
    for e in range(args.n_epochs):  # e is already a module-level global, read inside val()
        train_loss_d, train_loss_f = train(model, args, train_loader)

        if args.val_dir is not None and config.test_interval != 0:
            with torch.no_grad():
                val_loss_d, val_loss_f = val(model, args, val_loader, criterion)


            is_best_d = val_loss_d.avg < config.best_model_d
            is_best_f = val_loss_f.avg < config.best_model_f
            config.best_model_d = min(config.best_model_d, val_loss_d.avg)
            config.best_model_f = min(config.best_model_f, val_loss_f.avg)
            save_checkpoint({
                'epoch': e,
                'state_dict': model.detector.state_dict(),
            }, is_best_d, args.detector_name)
            save_checkpoint({
                'epoch': e,
                'state_dict': model.flownet.state_dict(),
            }, is_best_f, args.flownet_name)
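Every snippet in this collection leans on an AverageMeter helper (losses, batch_time, data_time, losses_list, ...), but the original repositories do not show it. A minimal sketch consistent with how it is called in the pose-estimation examples here (update(value, n), then .val, .avg, .sum, .reset()) might look like this; the actual implementations may differ:

class AverageMeter(object):
    """Keeps the latest value and a running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0    # most recent value passed to update()
        self.sum = 0.0    # weighted sum of all values seen so far
        self.count = 0    # total weight (e.g. number of samples)
        self.avg = 0.0    # running average = sum / count

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count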
Exemplo n.º 14
def training(train_loader, epochs, n_subact=0, save=True, **kwargs):
    """Training pipeline for embedding.

    Args:
        train_loader: iterator over the dataset
        epochs: how many training epochs to perform
        n_subact: number of subactions in the current complex activity
        mnist: whether to train on the MNIST dataset (just a sanity check to
            see how well everything works)
    Returns:
        trained pytorch model
    """
    logger.debug('create model')
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)
    try:
        model = kwargs['model']
        loss = kwargs['loss']
        optimizer = kwargs['optimizer']
    except KeyError:
        model = Embedding(embed_dim=opt.embed_dim,
                          feature_dim=opt.feature_dim,
                          n_subact=n_subact).cuda()

        loss = RankLoss(margin=0.2).cuda()
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=opt.lr,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weight_decay)
    cudnn.benchmark = True

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    vis = Visual()
    best_acc = -1
    _lr = opt.lr

    logger.debug('epochs: %s', epochs)
    loss_previous = np.inf
    for epoch in range(epochs):
        model.cuda()
        model.train()

        logger.debug('Epoch # %d' % epoch)
        if opt.lr_adj:
            # if epoch in [int(epochs * 0.3), int(epochs * 0.7)]:
            # if epoch in [int(epochs * 0.5)]:
            if epoch % 30 == 0 and epoch > 0:
                _lr = adjust_lr(optimizer, _lr)
                logger.debug('lr: %f' % _lr)
        end = time.time()
        for i, (input, k, _) in enumerate(train_loader):
            # TODO: not sure that it's necessary
            data_time.update(time.time() - end)
            input = input.float().cuda(non_blocking=True)
            k = k.float().cuda()
            output = model(input)
            loss_values = loss(output, k)
            losses.update(loss_values.item(), input.size(0))

            optimizer.zero_grad()
            loss_values.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0 and i:
                logger.debug(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                        epoch,
                        i,
                        len(train_loader),
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
        logger.debug('loss: %f' % losses.avg)
        losses.reset()

    if save:
        save_dict = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        dir_check(join(opt.dataset_root, 'models'))
        dir_check(join(opt.dataset_root, 'models', kwargs['name']))
        torch.save(
            save_dict,
            join(opt.dataset_root, 'models', kwargs['name'],
                 '%s.pth.tar' % opt.log_str))
    return model
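The loop above calls adjust_lr(optimizer, _lr) every 30 epochs, but that helper is not shown. A minimal sketch under the assumption that it simply decays the rate by a fixed factor and writes it back into the optimizer (the factor 0.1 is an illustrative choice, not taken from the original code):

def adjust_lr(optimizer, lr, decay=0.1):
    """Multiply the learning rate by `decay` and push it into every param group."""
    new_lr = lr * decay
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr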
Exemplo n.º 15
def train_val(model, args):

    train_dir = args.train_dir
    val_dir = args.val_dir

    config = Config(args.config)
    cudnn.benchmark = True

    # train
    train_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
        'lspet', train_dir, 8,
        Mytransforms.Compose([
            Mytransforms.RandomResized(),
            Mytransforms.RandomRotate(40),
            Mytransforms.RandomCrop(368),
            Mytransforms.RandomHorizontalFlip(),
        ])),
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    # val
    if args.val_dir is not None and config.test_interval != 0:
        # val
        val_loader = torch.utils.data.DataLoader(lsp_lspet_data.LSP_Data(
            'lsp', val_dir, 8,
            Mytransforms.Compose([
                Mytransforms.TestResized(368),
            ])),
                                                 batch_size=config.batch_size,
                                                 shuffle=True,
                                                 num_workers=config.workers,
                                                 pin_memory=True)

    if args.gpu[0] < 0:
        criterion = nn.MSELoss()
    else:
        criterion = nn.MSELoss().cuda()

    params, multiple = get_parameters(model, config, True)
    # params, multiple = get_parameters(model, config, False)

    optimizer = torch.optim.SGD(params,
                                config.base_lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_list = [AverageMeter() for i in range(6)]
    end = time.time()
    iters = config.start_iters
    best_model = config.best_model

    heat_weight = 46 * 46 * 15 / 1.0

    losstracker1 = []

    losstracker2 = []
    losstracker3 = []
    losstracker4 = []
    losstracker5 = []
    losstracker6 = []
    while iters < config.max_iter:

        for i, (input, heatmap, centermap) in enumerate(train_loader):

            learning_rate = adjust_learning_rate(
                optimizer,
                iters,
                config.base_lr,
                policy=config.lr_policy,
                policy_parameter=config.policy_parameter,
                multiple=multiple)
            data_time.update(time.time() - end)

            if args.gpu[0] >= 0:
                heatmap = heatmap.cuda(non_blocking=True)
                centermap = centermap.cuda(non_blocking=True)

            input_var = torch.autograd.Variable(input)
            heatmap_var = torch.autograd.Variable(heatmap)
            centermap_var = torch.autograd.Variable(centermap)

            heat1, heat2, heat3, heat4, heat5, heat6 = model(
                input_var, centermap_var)

            loss1 = criterion(heat1, heatmap_var) * heat_weight
            loss2 = criterion(heat2, heatmap_var) * heat_weight
            loss3 = criterion(heat3, heatmap_var) * heat_weight
            loss4 = criterion(heat4, heatmap_var) * heat_weight
            loss5 = criterion(heat5, heatmap_var) * heat_weight
            loss6 = criterion(heat6, heatmap_var) * heat_weight

            loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
            #print(input.size(0).item())
            losses.update(loss.item(), input.size(0))
            for cnt, l in enumerate([loss1, loss2, loss3, loss4, loss5,
                                     loss6]):
                losses_list[cnt].update(l.item(), input.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            iters += 1
            if iters % config.display == 0:
                print(
                    'Train Iteration: {0}\t'
                    'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                    'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:.3f})\n'
                    'Learning rate = {2}\n'
                    'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.format(
                        iters,
                        config.display,
                        learning_rate,
                        batch_time=batch_time,
                        data_time=data_time,
                        loss=losses))
                for cnt in range(0, 6):
                    print(
                        'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'.
                        format(cnt + 1, loss1=losses_list[cnt]))

                print(
                    time.strftime(
                        '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                        time.localtime()))

                batch_time.reset()
                data_time.reset()
                losses.reset()
                for cnt in range(6):
                    losses_list[cnt].reset()

            save_checkpoint({
                'iter': iters,
                'state_dict': model.state_dict(),
            }, 0, args.model_name)

            # val
            if args.val_dir is not None and config.test_interval != 0 and iters % config.test_interval == 0:

                model.eval()
                for j, (input, heatmap, centermap) in enumerate(val_loader):
                    if args.gpu[0] >= 0:
                        heatmap = heatmap.cuda(non_blocking=True)
                        centermap = centermap.cuda(non_blocking=True)

                    input_var = torch.autograd.Variable(input)
                    heatmap_var = torch.autograd.Variable(heatmap)
                    centermap_var = torch.autograd.Variable(centermap)

                    heat1, heat2, heat3, heat4, heat5, heat6 = model(
                        input_var, centermap_var)

                    loss1 = criterion(heat1, heatmap_var) * heat_weight
                    loss2 = criterion(heat2, heatmap_var) * heat_weight
                    loss3 = criterion(heat3, heatmap_var) * heat_weight
                    loss4 = criterion(heat4, heatmap_var) * heat_weight
                    loss5 = criterion(heat5, heatmap_var) * heat_weight
                    loss6 = criterion(heat6, heatmap_var) * heat_weight

                    loss = loss1 + loss2 + loss3 + loss4 + loss5 + loss6
                    losses.update(loss.item(), input.size(0))
                    for cnt, l in enumerate(
                        [loss1, loss2, loss3, loss4, loss5, loss6]):
                        losses_list[cnt].update(l.item(), input.size(0))

                    batch_time.update(time.time() - end)
                    end = time.time()
                    is_best = losses.avg < best_model
                    best_model = min(best_model, losses.avg)
                    save_checkpoint(
                        {
                            'iter': iters,
                            'state_dict': model.state_dict(),
                        }, is_best, args.model_name)

                    if j % config.display == 0:
                        print(
                            'Test Iteration: {0}\t'
                            'Time {batch_time.sum:.3f}s / {1}iters, ({batch_time.avg:.3f})\t'
                            'Data load {data_time.sum:.3f}s / {1}iters, ({data_time.avg:.3f})\n'
                            'Loss = {loss.val:.8f} (ave = {loss.avg:.8f})\n'.
                            format(j,
                                   config.display,
                                   batch_time=batch_time,
                                   data_time=data_time,
                                   loss=losses))
                        for cnt in range(0, 6):
                            print(
                                'Loss{0} = {loss1.val:.8f} (ave = {loss1.avg:.8f})\t'
                                .format(cnt + 1, loss1=losses_list[cnt]))

                        print(
                            time.strftime(
                                '%Y-%m-%d %H:%M:%S -----------------------------------------------------------------------------------------------------------------\n',
                                time.localtime()))
                        batch_time.reset()
                        losses.reset()
                        for cnt in range(6):
                            losses_list[cnt].reset()

                        losstracker1.append(loss1.item())
                        losstracker2.append(loss2.item())
                        losstracker3.append(loss3.item())
                        losstracker4.append(loss4.item())
                        losstracker5.append(loss5.item())
                        losstracker6.append(loss6.item())
                model.train()

    np.save('loss1', np.asarray(losstracker1))
    np.save('loss2', np.asarray(losstracker2))
    np.save('loss3', np.asarray(losstracker3))
    np.save('loss4', np.asarray(losstracker4))
    np.save('loss5', np.asarray(losstracker5))
    np.save('loss6', np.asarray(losstracker6))
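Both pose-estimation training loops call save_checkpoint(state, is_best, name) without defining it. A minimal sketch that matches how it is used here (always save the latest state, keep a copy when it is the current best); the file-name scheme is an assumption, not taken from the original code:

import shutil
import torch

def save_checkpoint(state, is_best, name):
    """Save the latest checkpoint and keep a separate copy of the best one."""
    latest_path = '{}_latest.pth.tar'.format(name)
    torch.save(state, latest_path)
    if is_best:
        shutil.copyfile(latest_path, '{}_best.pth.tar'.format(name))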
Exemplo n.º 16
    def train(self):
        # Setting the variables before starting the training
        print('Loading checkpoint if checkpoint_dir is given...')
        self.load_checkpoint()

        avg_train_loss = AverageMeter()
        avg_train_acc = AverageMeter()
        text_avg_train_acc = AverageMeter()
        combined_avg_train_acc = AverageMeter()

        best_val_acc = -np.inf
        patience_counter = 0
        best_epoch = self.num_epochs
        for epoch in range(self.num_epochs):

            self.model.print_frozen()

            avg_train_loss.reset()
            avg_train_acc.reset()
            text_avg_train_acc.reset()

            # Mini batch loop
            for batch_idx, batch in enumerate(tqdm(self.train_loader)):
                step = epoch * len(self.train_loader) + batch_idx

                # Get the model output for the batch and update the loss and accuracy meters
                train_loss, train_acc, text_train_acc = self.train_step(batch)
                if self.args.scheduler == 'cycle':
                    self.scheduler.step()
                avg_train_loss.update([train_loss.item()])
                avg_train_acc.update([train_acc])
                text_avg_train_acc.update([text_train_acc])

                # Logging and validation check
                if step % self.print_every == 0:
                    print(
                        'Epoch {}, batch {}, step {}, '
                        'loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}, '
                        'running averages: loss = {:.4f}, acc_audio = {:.4f}, acc_text = {:.4f}'
                        .format(epoch, batch_idx, step,
                                train_loss.item(), train_acc, text_train_acc,
                                avg_train_loss.get(), avg_train_acc.get(),
                                text_avg_train_acc.get()))

                if step % self.val_every == 0:
                    val_loss, val_acc, text_val_acc, combined_val_acc = self.val(
                    )
                    print(
                        'Val acc (audio) = {:.4f}, Val acc (text) = {:.4f}, Val acc (combined) = {:.4f}, Val loss = {:.4f}'
                        .format(val_acc, text_val_acc, combined_val_acc,
                                val_loss))

                    # Update and save the best validation checkpoint if needed
                    if self.args.model_save_criteria == 'audio_text':
                        cur_avg_acc = (val_acc + text_val_acc) / 2
                    else:  #'combined'
                        cur_avg_acc = combined_val_acc

                    if cur_avg_acc > best_val_acc:
                        #print('Start saving best check point at step{}...'.format(step))
                        best_val_acc = cur_avg_acc
                        best_chkpt_path = os.path.join(self.model_dir,
                                                       'best_ckpt.pth')
                        torch.save(self.model.state_dict(), best_chkpt_path)
                        print('Done saving best check point!')
                    if self.args.scheduler == 'plateau':
                        self.scheduler.step(cur_avg_acc)

            print('------ End of epoch validation ------')
            val_loss, val_acc, text_val_acc, combined_val_acc = self.val()
            # Update and save the best validation checkpoint if needed
            if self.args.model_save_criteria == 'audio_text':
                cur_avg_acc = (val_acc + text_val_acc) / 2
            else:  #'combined'
                cur_avg_acc = combined_val_acc

            if cur_avg_acc > best_val_acc:
                #print('Start saving best check point at step{}...'.format(step))
                best_val_acc = cur_avg_acc
                best_chkpt_path = os.path.join(self.model_dir, 'best_ckpt.pth')
                torch.save(self.model.state_dict(), best_chkpt_path)
                patience_counter = 0
                best_epoch = epoch
                print('Done saving best check point! Patience counter reset!')
            else:
                patience_counter += 1
                if patience_counter > self.max_patience:
                    print(
                        'Reached max patience limit. Training stops! Best val acc achieved at epoch: {}.'
                        .format(best_epoch))
                    break
            self.model.unfreeze_one_layer()
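The trainer above expects the model to expose print_frozen() and unfreeze_one_layer() for gradual unfreezing; those methods live on the user's model and are not shown. A purely hypothetical sketch of the idea (the layer bookkeeping and ordering are assumptions for illustration):

import torch.nn as nn

class GraduallyUnfrozenModel(nn.Module):
    """Illustrative wrapper: all layers start frozen and are thawed one at a time."""

    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.frozen_upto = len(self.layers)  # index below which layers are still frozen
        for layer in self.layers:
            for p in layer.parameters():
                p.requires_grad = False

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def unfreeze_one_layer(self):
        # Thaw the topmost still-frozen layer (closest to the output first).
        if self.frozen_upto > 0:
            self.frozen_upto -= 1
            for p in self.layers[self.frozen_upto].parameters():
                p.requires_grad = True

    def print_frozen(self):
        print('layers still frozen: {} / {}'.format(self.frozen_upto, len(self.layers)))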
Exemplo n.º 17
class Classifier:
    def __init__(self, **kwargs):
        opt._parse(kwargs)
        self.opt = opt
        self.model = getattr(models, self.opt.model)()
        self.criterion = t.nn.CrossEntropyLoss().to(self.opt.device)
        # Common loss functions:
        # 1. Hinge loss: mainly used in support vector machines (SVMs);
        # 2. Cross-entropy loss (softmax loss): used in logistic regression and softmax classification;
        # 3. Square loss: mainly used in ordinary least squares (OLS);
        # 4. Exponential loss: mainly used in the AdaBoost ensemble algorithm;
        # 5. Other losses (e.g. 0-1 loss, absolute loss)
        self.optimizer = self.model.get_optimizer(self.opt.lr,
                                                  self.opt.weight_decay)
        self.compression_scheduler = distiller.CompressionScheduler(self.model)
        self.train_losses = AverageMeter()  # loss meter
        self.train_top1 = AverageMeter()  # top-1 accuracy meter
        self.train_top5 = AverageMeter()  # top-5 accuracy meter
        self.best_precision = 0  # best precision so far
        self.start_epoch = 0
        self.train_writer = None
        self.value_writer = None

    def load_data(self):
        test_data = DatasetFromFilename(self.opt.data_root, flag='test')

        train_data = DatasetFromFilename(self.opt.data_root,
                                         flag='train')  # training set
        val_data = DatasetFromFilename(self.opt.data_root, flag='valid')  # validation set
        self.test_dataloader = DataLoader(test_data,
                                          batch_size=self.opt.batch_size,
                                          shuffle=True,
                                          num_workers=self.opt.num_workers)
        self.train_dataloader = DataLoader(
            train_data,
            self.opt.batch_size,
            shuffle=True,
            num_workers=self.opt.num_workers)  # training set loader
        self.val_dataloader = DataLoader(
            val_data,
            self.opt.batch_size,
            shuffle=True,
            num_workers=self.opt.num_workers)  # validation set loader

    def create_write(self):
        if self.opt.vis:
            self.train_writer = SummaryWriter(
                log_dir='./runs/train_' +
                datetime.now().strftime('%y%m%d-%H-%M-%S'))
            self.value_writer = SummaryWriter(
                log_dir='./runs/val_' +
                datetime.now().strftime('%y%m%d-%H-%M-%S'))

    def train_save_model(self, epoch, val_loss, val_top1, val_top5):
        self.model.save({
            "epoch":
            epoch + 1,
            "model_name":
            self.opt.model,
            "state_dict":
            self.model.state_dict(),
            "best_precision":
            self.best_precision,
            "optimizer":
            self.optimizer,
            "valid_loss": [val_loss, val_top1, val_top5],
            'compression_scheduler':
            self.compression_scheduler.state_dict()
        })  # save the model

    def train_load_model(self):
        if self.opt.load_model_path:
            # # Load all tensors onto the CPU
            # t.load(opt.load_model_path, map_location=lambda storage, loc: storage)
            # # Load all tensors onto GPU 1
            # t.load(opt.load_model_path, map_location=lambda storage, loc: storage.cuda(1))
            # # Move tensors from GPU 1 to GPU 0
            # t.load(opt.load_model_path, map_location={'cuda:1': 'cuda:0'})
            checkpoint = t.load(self.opt.load_model_path)
            self.start_epoch = checkpoint["epoch"]
            # compression_scheduler.load_state_dict(checkpoint['compression_scheduler'], False)
            self.best_precision = checkpoint["best_precision"]
            self.model.load_state_dict(checkpoint["state_dict"])
            self.optimizer = checkpoint['optimizer']
        self.model.to(self.opt.device)  # move the model to the GPU

    def load_model(self):
        if self.opt.load_model_path:
            checkpoint = t.load(self.opt.load_model_path)
            self.model.load_state_dict(checkpoint["state_dict"])  # 加载模型
        self.model.to(self.opt.device)

    def save_quantize_model(self):
        if self.opt.quantize_eval:
            self.model.save(
                {
                    "model_name": self.opt.model,
                    "state_dict": self.model.state_dict(),
                    'quantizer_metadata': self.model.quantizer_metadata
                }, './checkpoint/ResNet152_quantize.pth')

    def quantize_model(self):
        if self.opt.quantize_eval:
            self.model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                self.model, self.opt)  # quantize the model
            quantizer.prepare_model()
            self.model.to(self.opt.device)

    def load_compress(self):

        if self.opt.compress:
            self.compression_scheduler = distiller.file_config(
                self.model, self.optimizer, self.opt.compress,
                self.compression_scheduler)  # load the model pruning schedule
            self.model.to(self.opt.device)

    def visualization_train(self, input, ii, epoch):
        if ii % self.opt.print_freq:
            if self.train_writer:
                grid = make_grid(
                    (input.data.cpu() * 0.225 + 0.45).clamp(min=0, max=1))
                self.train_writer.add_image('train_images', grid,
                                            ii * (epoch + 1))  # training images
                self.train_writer.add_scalar('loss', self.train_losses.avg,
                                             ii * (epoch + 1))  # training loss
                self.train_writer.add_text(
                    'top1', 'train accuracy top1 %.2f%%' % self.train_top1.avg,
                    ii * (epoch + 1))  # top-1 accuracy text
                self.train_writer.add_scalars(
                    'accuracy', {
                        'top1': self.train_top1.avg,
                        'top5': self.train_top5.avg,
                        'loss': self.train_losses.avg
                    }, ii * (epoch + 1))

    def test(self):
        self.load_model()
        self.load_data()
        self.model.eval()  # put the module in eval mode; this affects Dropout and BatchNorm
        correct = 0
        total = 0
        msglogger.info('Test dataset size: %d', len(self.test_dataloader))
        # quantization
        self.quantize_model()
        self.model.eval()  # put the module in eval mode; this affects Dropout and BatchNorm
        err_img = [('img_path', 'result', 'label')]
        for ii, (data, labels,
                 img_path) in tqdm(enumerate(self.test_dataloader)):
            input = data.to(self.opt.device)
            labels = labels.to(self.opt.device)
            score = self.model(input)
            # probability = t.nn.functional.softmax(score, dim=1)[:, 1].detach().tolist()  # [:, i] is the score of class i
            # softmax maps an arbitrary K-dimensional real vector to a K-dimensional vector whose entries
            # lie in (0, 1) and sum to 1, i.e. a probability distribution; for multi-class prediction
            # we can simply take the class with the largest value.
            results = score.max(dim=1)[1].detach(
            )  # max returns the largest element in each row and its column index, i.e. the most likely class
            # batch_results = [(labels_.item(), self.opt.cate_classes[label_]) for labels_, label_ in zip(labels, label)]
            total += input.size(0)
            correct += (results == labels).sum().item()
            error_list = (results != labels).tolist()
            err_img.extend([(img_path[i], self.opt.cate_classes[results[i]],
                             self.opt.cate_classes[labels[i]])
                            for i, j in enumerate(error_list)
                            if j == 1])  # append (image path, predicted label, true label) for misclassified images

        msglogger.info(
            'Test Accuracy of the model on the {} test images: {} %'.format(
                total, 100 * correct / total))
        # write the misclassified images to a csv file
        write_err_img(err_img)
        # save the quantized model
        self.save_quantize_model()

    def recognition(self):
        self.load_model()
        self.model.eval()
        img = image_loader(self.opt.url)
        image = img.view(1, 3, self.opt.image_size,
                         self.opt.image_size).to(self.opt.device)  # reshape the image and move it to the device
        outputs = self.model(image)
        result = {}
        for i in range(self.opt.num_classes):  # compute the probability of each class
            result[self.opt.cate_classes[i]] = t.nn.functional.softmax(
                outputs, dim=1)[:, i].detach().tolist()[0]
        result = sorted(result.items(), key=lambda x: x[1], reverse=True)
        return result

    def sensitivity(self):
        self.load_data()
        self.load_model()
        sensitivities = np.arange(self.opt.sensitivity_range[0],
                                  self.opt.sensitivity_range[1],
                                  self.opt.sensitivity_range[2])
        return sensitivity_analysis(self.model, self.criterion,
                                    self.test_dataloader, self.opt,
                                    sensitivities, msglogger)

    def train(self):
        previous_loss = 1e10  # loss from the previous pass
        lr = self.opt.lr
        perf_scores_history = []
        pylogger = PythonLogger(msglogger)
        self.train_load_model()
        self.load_compress()
        self.create_write()
        lr_scheduler = get_scheduler(self.optimizer, opt)
        for epoch in range(self.start_epoch, self.opt.max_epoch):
            self.model.train()
            self.load_data()
            if self.opt.pruning:
                self.compression_scheduler.on_epoch_begin(epoch)  # start-of-epoch pruning
            self.train_losses.reset()  # reset the meter
            self.train_top1.reset()  # reset the meter
            # print('training dataset size', len(train_dataloader))
            total_samples = len(self.train_dataloader.sampler)
            steps_per_epoch = math.ceil(total_samples / self.opt.batch_size)
            train_progressor = ProgressBar(mode="Train  ",
                                           epoch=epoch,
                                           total_epoch=self.opt.max_epoch,
                                           model_name=self.opt.model,
                                           total=len(self.train_dataloader))
            lr = lr_scheduler.get_lr()[0]
            for ii, (data, labels,
                     img_path) in enumerate(self.train_dataloader):
                if self.opt.pruning:
                    self.compression_scheduler.on_minibatch_begin(
                        epoch, ii, steps_per_epoch,
                        self.optimizer)  # start-of-batch pruning
                train_progressor.current = ii + 1  # current progress on the training set
                # train model
                input = data.to(self.opt.device)
                target = labels.to(self.opt.device)
                score = self.model(input)  # forward pass
                loss = self.criterion(score, target)  # compute the loss
                if self.opt.pruning:
                    # Before running the backward phase, we allow the scheduler to modify the loss
                    # (e.g. add regularization loss)
                    agg_loss = self.compression_scheduler.before_backward_pass(
                        epoch,
                        ii,
                        steps_per_epoch,
                        loss,
                        optimizer=self.optimizer,
                        return_loss_components=True)  # loss adjusted by the pruning scheduler
                    loss = agg_loss.overall_loss
                self.train_losses.update(loss.item(), input.size(0))
                # loss = criterion(score[0], target)  # compute the loss for the Inception3 network
                self.optimizer.zero_grad()  # zero the parameter gradients
                loss.backward()  # backpropagation
                self.optimizer.step()  # update the parameters

                if opt.pruning:
                    self.compression_scheduler.on_minibatch_end(
                        epoch, ii, steps_per_epoch,
                        self.optimizer)  # end-of-batch pruning

                precision1_train, precision5_train = accuracy(
                    score, target, topk=(1, 5))  # top-1 and top-5 accuracy

                # precision1_train, precision2_train = accuracy(score[0], target, topk=(1, 2))  # Inception3 network
                self.train_losses.update(loss.item(), input.size(0))
                self.train_top1.update(precision1_train[0].item(),
                                       input.size(0))
                self.train_top5.update(precision5_train[0].item(),
                                       input.size(0))
                train_progressor.current_loss = self.train_losses.avg
                train_progressor.current_top1 = self.train_top1.avg
                train_progressor.current_top5 = self.train_top5.avg
                train_progressor()  # print progress
                if (ii + 1) % self.opt.print_freq == 0:
                    self.visualization_train(input, ii, epoch)
            if self.opt.pruning:
                distiller.log_weights_sparsity(self.model,
                                               epoch,
                                               loggers=[pylogger])  # log weight sparsity after pruning
                self.compression_scheduler.on_epoch_end(
                    epoch, self.optimizer)  # end-of-epoch pruning
            val_loss, val_top1, val_top5 = val(self.model, self.criterion,
                                               self.val_dataloader, epoch,
                                               self.value_writer)  # validate the model
            sparsity = distiller.model_sparsity(self.model)
            perf_scores_history.append(
                distiller.MutableNamedTuple(
                    {
                        'sparsity': sparsity,
                        'top1': val_top1,
                        'top5': val_top5,
                        'epoch': epoch + 1,
                        'lr': lr,
                        'loss': val_loss
                    }, ))
            # keep the performance-score history sorted from best to worst,
            # using sparsity as the primary key, then top1, top5 and epoch
            perf_scores_history.sort(key=operator.attrgetter(
                'sparsity', 'top1', 'top5', 'epoch'),
                                     reverse=True)
            for score in perf_scores_history[:1]:
                msglogger.info(
                    '==> Best [Top1: %.3f   Top5: %.3f   Sparsity: %.2f on epoch: %d   Lr: %f   Loss: %f]',
                    score.top1, score.top5, score.sparsity, score.epoch, lr,
                    score.loss)

            is_best = epoch + 1 == perf_scores_history[
                0].epoch  # the current epoch is the best epoch
            self.best_precision = max(perf_scores_history[0].top1,
                                      self.best_precision)  # best top-1 accuracy
            if is_best:
                self.train_save_model(epoch, val_loss, val_top1, val_top5)
            # update learning rate
            lr_scheduler.step(epoch)
            lr = lr_scheduler.get_lr()[0]
            # # if the training loss is larger than the previous one, lower the learning rate
            # if self.train_losses.val > previous_loss:
            #     lr = lr * self.opt.lr_decay
            #     # when the loss is greater than the previous loss, decay the learning rate
            #     for param_group in self.optimizer.param_groups:
            #         param_group['lr'] = lr
            #
            # previous_loss = self.train_losses.val
            t.cuda.empty_cache()
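Several of these snippets call an accuracy(output, target, topk=(1, 5)) helper and index its result (precision1_train[0], top1.val[0]), i.e. it returns one value per requested k. A sketch following the usual torchvision-style recipe, on the assumption that the original helper does the same:

import torch

def accuracy(output, target, topk=(1,)):
    """Compute precision@k for each k in `topk`; returns a list of 1-element tensors."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Top-k predicted class indices per sample: (batch, maxk) -> (maxk, batch)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res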
Exemplo n.º 18
def train(net,
          criterion,
          optimizer,
          train_loader,
          val_loader,
          config,
          scheduler=None):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loss_avg = AverageMeter()
    val_loss_avg = AverageMeter()

    train_step = len(train_loader)
    val_step = len(val_loader)

    writer = SummaryWriter(config.log_path)
    criterion.to(device)
    net.train()
    net.to(device)
    if scheduler:
        scheduler.step(config.epoch)

    for epoch in range(config.epoch, config.num_epoch):
        train_loss_avg.reset()
        val_loss_avg.reset()
        iter = tqdm(enumerate(train_loader))
        iter.set_description(f'Train Step in {epoch} total step: {train_step}')

        for step, (images, targets) in iter:
            images = images.to(device)
            targets = targets.to(device)

            preds = net(images)

            loss = criterion(preds, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss_avg.update(loss.item())

        iter = tqdm(enumerate(val_loader))
        iter.set_description(
            f'Validation Step in {epoch} total step: {val_step}')
        for step, (images, targets) in iter:
            images = images.to(device)
            targets = targets.to(device)

            preds = net(images)
            loss = criterion(preds, targets)

            val_loss_avg.update(loss.item())
        writer.add_scalars('DarkNet19/loss', {
            'train': train_loss_avg.avg,
            'validation': val_loss_avg.avg
        }, epoch)
        writer.add_scalar('DarkNet19/LearningRate',
                          optimizer.param_groups[0]['lr'], epoch)

        torch.save(net.state_dict(),
                   f'{config.checkpoint_dir}/DarkNet19-{epoch}.pth')
        if scheduler:
            scheduler.step()
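The DarkNet19 loop above reads only four fields from its config object (epoch, num_epoch, log_path, checkpoint_dir). A minimal stand-in with those fields, purely as an assumption about how the real config is shaped (the default values are illustrative):

from dataclasses import dataclass

@dataclass
class TrainConfig:
    epoch: int = 0                          # epoch to resume from
    num_epoch: int = 90                     # total number of epochs to train
    log_path: str = './runs/darknet19'      # TensorBoard log directory
    checkpoint_dir: str = './checkpoints'   # where per-epoch .pth files are written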
Exemplo n.º 19
def train(**kwargs):
    opt._parse(kwargs)
    train_writer = None
    value_writer = None
    if opt.vis:
        train_writer = SummaryWriter(
            log_dir='./runs/train_' +
            datetime.now().strftime('%y%m%d-%H-%M-%S'))
        value_writer = SummaryWriter(
            log_dir='./runs/val_' + datetime.now().strftime('%y%m%d-%H-%M-%S'))
    previous_loss = 1e10  # loss from the previous pass
    best_precision = 0  # best precision so far
    start_epoch = 0
    lr = opt.lr
    perf_scores_history = []  # performance-score history
    # step1: criterion and optimizer
    # Common loss functions:
    # 1. Hinge loss: mainly used in support vector machines (SVMs);
    # 2. Cross-entropy loss (softmax loss): used in logistic regression and softmax classification;
    # 3. Square loss: mainly used in ordinary least squares (OLS);
    # 4. Exponential loss: mainly used in the AdaBoost ensemble algorithm;
    # 5. Other losses (e.g. 0-1 loss, absolute loss)
    criterion = t.nn.CrossEntropyLoss().to(opt.device)  # loss function
    # step2: meters
    train_losses = AverageMeter()  # loss meter
    train_top1 = AverageMeter()  # top-1 accuracy meter
    train_top5 = AverageMeter()  # top-5 accuracy meter
    pylogger = PythonLogger(msglogger)
    # step3: configure model
    model = getattr(models, opt.model)()  # build the network
    compression_scheduler = distiller.CompressionScheduler(model)
    optimizer = model.get_optimizer(lr, opt.weight_decay)  # optimizer
    if opt.load_model_path:
        # # Load all tensors onto the CPU
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage)
        # t.load(opt.load_model_path, map_location='cpu')
        # # Load all tensors onto GPU 1
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage.cuda(1))
        # # Move tensors from GPU 1 to GPU 0
        # t.load(opt.load_model_path, map_location={'cuda:1': 'cuda:0'})
        checkpoint = t.load(opt.load_model_path)
        start_epoch = checkpoint["epoch"]
        # compression_scheduler.load_state_dict(checkpoint['compression_scheduler'], False)
        best_precision = checkpoint["best_precision"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer = checkpoint['optimizer']
    model.to(opt.device)  # move the model to the GPU

    if opt.compress:
        compression_scheduler = distiller.file_config(
            model, optimizer, opt.compress, compression_scheduler)  # load the model pruning schedule
        model.to(opt.device)
    # learning-rate scheduler
    lr_scheduler = get_scheduler(optimizer, opt)
    # step4: data_image
    train_data = DatasetFromFilename(opt.data_root, flag='train')  # training set
    val_data = DatasetFromFilename(opt.data_root, flag='test')  # validation set
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)  # training set loader
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=True,
                                num_workers=opt.num_workers)  # validation set loader
    # train
    for epoch in range(start_epoch, opt.max_epoch):
        model.train()
        if opt.pruning:
            compression_scheduler.on_epoch_begin(epoch)  # start-of-epoch pruning
        train_losses.reset()  # reset the meter
        train_top1.reset()  # reset the meter
        # print('training dataset size', len(train_dataloader))
        total_samples = len(train_dataloader.sampler)
        steps_per_epoch = math.ceil(total_samples / opt.batch_size)
        train_progressor = ProgressBar(mode="Train  ",
                                       epoch=epoch,
                                       total_epoch=opt.max_epoch,
                                       model_name=opt.model,
                                       lr=lr,
                                       total=len(train_dataloader))
        lr = lr_scheduler.get_lr()[0]
        for ii, (data, labels, img_path, tag) in enumerate(train_dataloader):
            if not check_date(img_path, tag, msglogger): return
            if opt.pruning:
                compression_scheduler.on_minibatch_begin(
                    epoch, ii, steps_per_epoch, optimizer)  # start-of-batch pruning
            train_progressor.current = ii + 1  # current progress on the training set
            # train model
            input = data.to(opt.device)
            target = labels.to(opt.device)
            if train_writer:
                grid = make_grid(
                    (input.data.cpu() * 0.225 + 0.45).clamp(min=0, max=1))
                train_writer.add_image('train_images', grid,
                                       ii * (epoch + 1))  # training images
            score = model(input)  # forward pass
            # compute the loss
            loss = criterion(score, target)
            if opt.pruning:
                # Before running the backward phase, we allow the scheduler to modify the loss
                # (e.g. add regularization loss)
                agg_loss = compression_scheduler.before_backward_pass(
                    epoch,
                    ii,
                    steps_per_epoch,
                    loss,
                    optimizer=optimizer,
                    return_loss_components=True)  # loss adjusted by the pruning scheduler
                loss = agg_loss.overall_loss
            train_losses.update(loss.item(), input.size(0))
            # loss = criterion(score[0], target)  # compute the loss for the Inception3 network
            optimizer.zero_grad()  # zero the parameter gradients
            loss.backward()  # backpropagation
            optimizer.step()  # update the parameters

            if opt.pruning:
                compression_scheduler.on_minibatch_end(epoch, ii,
                                                       steps_per_epoch,
                                                       optimizer)  # end-of-batch pruning

            precision1_train, precision5_train = accuracy(
                score, target, topk=(1, 5))  # top-1 and top-5 accuracy

            # writer.add_graph(model, input)
            # precision1_train, precision2_train = accuracy(score[0], target, topk=(1, 2))  # Inception3 network
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0].item(), input.size(0))
            train_top5.update(precision5_train[0].item(), input.size(0))
            train_progressor.current_loss = train_losses.avg
            train_progressor.current_top1 = train_top1.avg
            train_progressor.current_top5 = train_top5.avg
            train_progressor()  # print progress
            if ii % opt.print_freq == 0:
                if train_writer:
                    train_writer.add_scalar('loss', train_losses.avg,
                                            ii * (epoch + 1))  # training loss
                    train_writer.add_text(
                        'top1', 'train accuracy top1 %s' % train_top1.avg,
                        ii * (epoch + 1))  # top-1 accuracy text
                    train_writer.add_scalars(
                        'accuracy', {
                            'top1': train_top1.avg,
                            'top5': train_top5.avg,
                            'loss': train_losses.avg
                        }, ii * (epoch + 1))
        # train_progressor.done()  # save the training results as a txt file
        # validate and visualize
        if opt.pruning:
            distiller.log_weights_sparsity(model, epoch,
                                           loggers=[pylogger])  # log weight sparsity after pruning
            compression_scheduler.on_epoch_end(epoch, optimizer)  # end-of-epoch pruning
        val_loss, val_top1, val_top5 = val(model, criterion, val_dataloader,
                                           epoch, value_writer, lr)  # validate the model
        sparsity = distiller.model_sparsity(model)
        perf_scores_history.append(
            distiller.MutableNamedTuple(
                {
                    'sparsity': sparsity,
                    'top1': val_top1,
                    'top5': val_top5,
                    'epoch': epoch + 1,
                    'lr': lr,
                    'loss': val_loss
                }, ))
        # keep the performance-score history sorted from best to worst,
        # using sparsity as the primary key, then top1, top5 and epoch
        perf_scores_history.sort(key=operator.attrgetter(
            'sparsity', 'top1', 'top5', 'epoch'),
                                 reverse=True)
        for score in perf_scores_history[:1]:
            msglogger.info(
                '==> Best [Top1: %.3f   Top5: %.3f   Sparsity: %.2f on epoch: %d   Lr: %f   Loss: %f]',
                score.top1, score.top5, score.sparsity, score.epoch, lr,
                score.loss)

        best_precision = max(perf_scores_history[0].top1,
                             best_precision)  # best top-1 accuracy
        is_best = epoch + 1 == perf_scores_history[
            0].epoch  # the current epoch is the best epoch
        if is_best:
            model.save({
                "epoch":
                epoch + 1,
                "model_name":
                opt.model,
                "state_dict":
                model.state_dict(),
                "best_precision":
                best_precision,
                "optimizer":
                optimizer,
                "valid_loss": [val_loss, val_top1, val_top5],
                'compression_scheduler':
                compression_scheduler.state_dict(),
            })  # save the model
        # update learning rate
        lr_scheduler.step(epoch)  # update the learning rate
        # if the training loss is larger than the previous one, lower the learning rate
        # if train_losses.val > previous_loss:
        #     lr = lr * opt.lr_decay
        #     # when the loss is greater than the previous loss, decay the learning rate
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        #
        # previous_loss = train_losses.val
        t.cuda.empty_cache()  # free cached GPU memory that is no longer needed
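get_scheduler(optimizer, opt) above is only used through lr_scheduler.get_lr()[0] and lr_scheduler.step(epoch), so any torch.optim.lr_scheduler object fits. A minimal sketch built on StepLR; the opt field names used for the step size and decay factor are assumptions:

import torch

def get_scheduler(optimizer, opt):
    """Return a learning-rate scheduler; StepLR is an assumed choice, not the original one."""
    return torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=getattr(opt, 'lr_decay_step', 30),  # assumed field name
        gamma=getattr(opt, 'lr_decay', 0.1))          # assumed field name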
Exemplo n.º 20
    def valid(self, t_max, epoch, model):
        top1 = AverageMeter()
        top5 = AverageMeter()
        # switch to evaluate mode
        model.eval()
        # eval on each class separately
        acc_av = 0
        acc_av5 = 0
        with torch.no_grad():
            for t_past in range(t_max + 1):
                idx_ = [i + (t_past * 100) for i in self.idx]
                top1.reset()
                top5.reset()
                print(t_past)
                dataset_test = ImageFolder(
                    root=self.dataroot_test,
                    transform=transforms.Compose([
                        transforms.Resize(self.imageSize),
                        transforms.CenterCrop(self.imageSize),
                        transforms.ToTensor(),
                        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
                    ]),
                    classes_idx=(idx_),
                )
                val_loader_task = torch.utils.data.DataLoader(
                    dataset_test,
                    batch_size=self.batchSize,
                    shuffle=True,
                    num_workers=int(1))
                for i, (input, target) in enumerate(val_loader_task):
                    #if args.gpu is not None:
                    input = input.cuda(self.cuda0)
                    target = target.cuda(self.cuda0) + (
                        t_past * len(self.unique_classes[t_past]))
                    #print(target)
                    self.c_label.data.resize_(target.shape[0]).copy_(target)

                    # compute output
                    _, output = model(input)
                    output = torch.nn.functional.softmax(output, dim=1)
                    topk = (1, 5)  # could also be min(t_max + len(self.unique_classes[t_past]), 5)
                    acc1, acc5 = accuracy(output, target, topk=topk)
                    top1.update(acc1)  #, input.size(0))
                    top5.update(acc5)  #, input.size(0))

                acc_av += top1.avg
                print(
                    'Test: {}, Acc@1 {top1.val[0]:.3f} ({top1.avg[0]:.3f}), Acc@5 {top5.val[0]:.3f} ({top5.avg[0]:.3f})'
                    .format(t_past, top1=top1, top5=top5))
                self.acc_writers[t_past].scalar_summary(
                    "Accuracy top 1", top1.avg[0], self.global_step)
                self.acc_writers[t_past].scalar_summary(
                    "Accuracy top 5", top5.avg[0], self.global_step)

                self.acc_writers[t_past].scalar_summary(
                    "Accuracy top 1_val", top1.val[0], self.global_step)
                self.acc_writers[t_past].scalar_summary(
                    "Accuracy top 5_val", top5.val[0], self.global_step)
                acc_av5 += top5.avg

            self.writer.scalar_summary("Average_Acc. top 1",
                                       acc_av / (t_max + 1), self.global_step)
            self.writer.scalar_summary("Average_Acc. top 5",
                                       acc_av5 / (t_max + 1), self.global_step)
        model.train()
        return top1.avg