Example #1
def evaluate(dataloader, model, dev, topk=(1, )):
    """

    :param dataloader:
    :param model:
    :param dev: devices, gpu or cpu
    :param topk: [tuple]          output the top topk accuracy
    :return:     [list[float]]    topk accuracy
    """
    model.eval()
    test_accuracy = AverageMeter()
    test_accuracy.reset()

    with torch.no_grad():
        for _, sample in enumerate(tqdm(dataloader, ncols=100, ascii=' >')):
            x = sample['data'].to(dev)
            y = sample['label'].to(dev)
            output = model(x)
            logits = output['logits']
            acc = accuracy(logits, y, topk)
            test_accuracy.update(acc[0], x.size(0))
    return test_accuracy.avg
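
The snippets on this page all rely on an AverageMeter and an accuracy helper that are defined elsewhere in each project. A minimal sketch of what those helpers are assumed to look like, inferred from how they are called above (not copied from any of the original repositories):

import torch


class AverageMeter:
    """Tracks a running sum/count so that .avg is the mean of all updates."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


def accuracy(logits, target, topk=(1,)):
    """Return a list with the top-k accuracy for each k (fraction correct; some projects scale by 100)."""
    maxk = max(topk)
    _, pred = logits.topk(maxk, dim=1, largest=True, sorted=True)   # (N, maxk)
    correct = pred.eq(target.view(-1, 1).expand_as(pred))           # (N, maxk)
    return [correct[:, :k].any(dim=1).float().mean().item() for k in topk]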
Example #2
def do_train(cfg, model, train_loader, optimizer, scheduler, loss_fn):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    scaler = GradScaler()
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()

        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):

            optimizer.zero_grad()
            if cfg.INPUT.AUGMIX:
                bs = img[0].size(0)
                images_cat = torch.cat(img, dim=0).to(
                    device)  # [3 * batch, 3, 32, 32]
                target = vid.to(device)
                with autocast():
                    logits, feat = model(images_cat, target)
                    logits_orig, logits_augmix1, logits_augmix2 = (
                        logits[:bs], logits[bs:2 * bs], logits[2 * bs:])
                    loss = loss_fn(logits_orig, feat, target)
                    p_orig = F.softmax(logits_orig, dim=-1)
                    p_augmix1 = F.softmax(logits_augmix1, dim=-1)
                    p_augmix2 = F.softmax(logits_augmix2, dim=-1)

                    # Clamp mixture distribution to avoid exploding KL divergence
                    p_mixture = torch.clamp(
                        (p_orig + p_augmix1 + p_augmix2) / 3., 1e-7, 1).log()
                    loss += 12 * (
                        F.kl_div(p_mixture, p_orig, reduction='batchmean') +
                        F.kl_div(p_mixture, p_augmix1, reduction='batchmean') +
                        F.kl_div(p_mixture, p_augmix2,
                                 reduction='batchmean')) / 3.
            else:
                img = img.to(device)
                target = vid.to(device)
                with autocast():
                    if cfg.MODEL.CHANNEL_HEAD:
                        score, feat, channel_head_feature = model(img, target)
                        #print(feat.shape, channel_head_feature.shape)
                        loss = loss_fn(score, feat, channel_head_feature,
                                       target)

                    else:
                        score, feat = model(img, target)
                        loss = loss_fn(score, feat, target)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if cfg.INPUT.AUGMIX:
                # `score` is not defined and `img` is a list of views in the AugMix
                # branch, so measure accuracy on the original images' logits instead.
                acc = (logits_orig.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), bs)
            else:
                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
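
For reference, the AUGMIX branch above is a Jensen-Shannon consistency loss over the clean images and their two AugMix views. A standalone sketch of just that computation, with the weight 12 and the 1e-7 clamp taken from the code above (the function name is illustrative):

import torch
import torch.nn.functional as F


def js_consistency_loss(logits_orig, logits_aug1, logits_aug2, weight=12.0):
    """Jensen-Shannon consistency between the three softmax distributions."""
    p_orig = F.softmax(logits_orig, dim=-1)
    p_aug1 = F.softmax(logits_aug1, dim=-1)
    p_aug2 = F.softmax(logits_aug2, dim=-1)

    # Clamp the mixture before log() to avoid exploding KL terms.
    p_mixture = torch.clamp((p_orig + p_aug1 + p_aug2) / 3.0, 1e-7, 1).log()

    # F.kl_div(log_input, target) computes KL(target || exp(log_input)).
    js = (F.kl_div(p_mixture, p_orig, reduction='batchmean') +
          F.kl_div(p_mixture, p_aug1, reduction='batchmean') +
          F.kl_div(p_mixture, p_aug2, reduction='batchmean')) / 3.0
    return weight * js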
Example #3
File: lcl.py Project: webLCL/LCL
def main(cfg, device):
    init_seeds()
    cfg.use_fp16 = False if device.type == 'cpu' else cfg.use_fp16

    # logging ----------------------------------------------------------------------------------------------------------------------------------------
    logger_root = f'Results/{cfg.dataset}'
    if not os.path.isdir(logger_root):
        os.makedirs(logger_root, exist_ok=True)
    logtime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    result_dir = os.path.join(logger_root, f'{logtime}-{cfg.log}')
    # result_dir = os.path.join(logger_root, f'ablation_study-{cfg.log}')  #TODO
    logger = Logger(logging_dir=result_dir, DEBUG=False)
    logger.set_logfile(logfile_name='log.txt')
    save_params(cfg, f'{result_dir}/params.json', json_format=True)
    logger.debug(f'Result Path: {result_dir}')

    # model, optimizer, scheduler --------------------------------------------------------------------------------------------------------------------
    opt_lvl = 'O1' if cfg.use_fp16 else 'O0'
    n_classes = cfg.n_classes
    net1 = ResNet(arch=cfg.net1, num_classes=n_classes, pretrained=True)
    optimizer1 = build_sgd_optimizer(net1.parameters(), cfg.lr, cfg.weight_decay)
    net1, optimizer1 = amp.initialize(net1.to(device), optimizer1, opt_level=opt_lvl, keep_batchnorm_fp32=None, loss_scale=None, verbosity=0)
    net2 = ResNet(arch=cfg.net2, num_classes=n_classes, pretrained=True)
    optimizer2 = build_sgd_optimizer(net2.parameters(), cfg.lr, cfg.weight_decay)
    net2, optimizer2 = amp.initialize(net2.to(device), optimizer2, opt_level=opt_lvl, keep_batchnorm_fp32=None, loss_scale=None, verbosity=0)
    lr_plan = make_lr_plan(cfg.lr, cfg.stage1, cfg.epochs)

    with open(f'{result_dir}/network.txt', 'w') as f:
        f.writelines(net1.__repr__())
        f.write('\n\n---------------------------\n\n')
        f.writelines(net2.__repr__())

    # drop rate scheduler ----------------------------------------------------------------------------------------------------------------------------
    T_k = cfg.stage1
    final_drop_rate = 0.25
    final_ldl_rate = cfg.ldl_rate
    drop_rate_scheduler = np.ones(cfg.epochs) * final_drop_rate
    drop_rate_scheduler[:T_k] = np.linspace(0, final_drop_rate, T_k)
    drop_rate_scheduler[T_k:cfg.epochs] = np.linspace(final_drop_rate, final_ldl_rate, cfg.epochs - T_k)

    # dataset, dataloader ----------------------------------------------------------------------------------------------------------------------------
    transform = build_transform(rescale_size=cfg.rescale_size, crop_size=cfg.crop_size)
    dataset = build_webfg_dataset(os.path.join(cfg.database, cfg.dataset), CLDataTransform(transform['train']), transform['test'])
    logger.debug(f"Number of Training Samples: {dataset['n_train_samples']}")
    logger.debug(f"Number of Testing  Samples: {dataset['n_test_samples']}")
    train_loader = DataLoader(dataset['train'], batch_size=cfg.batch_size, shuffle=True, num_workers=8, pin_memory=True)
    test_loader = DataLoader(dataset['test'], batch_size=16, shuffle=False, num_workers=8, pin_memory=True)

    # meters -----------------------------------------------------------------------------------------------------------------------------------------
    train_loss1, train_loss2 = AverageMeter(), AverageMeter()
    train_accuracy1, train_accuracy2 = AverageMeter(), AverageMeter()
    iter_time = AverageMeter()

    # training ---------------------------------------------------------------------------------------------------------------------------------------
    start_epoch = 0
    best_accuracy1, best_accuracy2 = 0.0, 0.0
    best_epoch1, best_epoch2 = None, None

    if cfg.dataset == 'cifar100' and cfg.noise_type != 'clean':
        t = torch.tensor(dataset['train'].noisy_labels)
    else:
        t = torch.tensor(dataset['train'].targets)
    labels2learn1 = torch.full(size=(dataset['n_train_samples'], n_classes), fill_value=0.0)
    labels2learn1.scatter_(dim=1, index=torch.unsqueeze(t, dim=1), value=1.0 * 10)
    labels2learn2 = labels2learn1

    flag = [0, 0, 0]
    for epoch in range(start_epoch, cfg.epochs):
        start_time = time.time()
        train_loss1.reset()
        train_accuracy1.reset()
        train_loss2.reset()
        train_accuracy2.reset()

        net1.train()
        net2.train()
        adjust_lr(optimizer1, lr_plan[epoch])
        adjust_lr(optimizer2, lr_plan[epoch])
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        # train this epoch
        for it, sample in enumerate(train_loader):
            s = time.time()
            # optimizer1.zero_grad()
            # optimizer2.zero_grad()
            indices = sample['index']
            x1, x2 = sample['data']
            x1, x2 = x1.to(device), x2.to(device)
            y0 = sample['label'].to(device)
            y = get_smoothed_label_distribution(y0, nc=n_classes, epsilon=cfg.epsilon)

            output1 = net1(x1)
            output2 = net2(x2)
            logits1 = output1['logits']
            logits2 = output2['logits']

            if epoch < cfg.stage1:  # warmup
                if flag[0] == 0:
                    step_flagging('stage 1')
                    flag[0] += 1
                loss1 = cross_entropy(logits1, y)
                loss2 = cross_entropy(logits2, y)
            else:  # learn label distributions
                if flag[1] == 0:
                    step_flagging('stage 2')
                    flag[1] += 1
                with torch.no_grad():
                    cce_losses1 = cross_entropy(logits1, y, reduction='none')
                    cce_losses2 = cross_entropy(logits2, y, reduction='none')
                    losses1 = cce_losses1
                    losses2 = cce_losses2
                    # ent_losses1 = entropy_loss(logits1, reduction='none')
                    # ent_losses2 = entropy_loss(logits2, reduction='none')
                    # losses1 = cce_losses1 + ent_losses1  # (N)
                    # losses2 = cce_losses2 + ent_losses2  # (N)
                    sample_selection = sample_selector(losses1, losses2, drop_rate_scheduler[epoch])

                # for selected "clean" samples, train in a co-teaching manner
                logits_clean1 = logits1[sample_selection['clean2']]
                logits_clean2 = logits2[sample_selection['clean1']]
                y_clean1 = y[sample_selection['clean2']]
                y_clean2 = y[sample_selection['clean1']]
                losses_clean1 = cross_entropy(logits_clean1, y_clean1, reduction='none') + entropy_loss(logits_clean1, reduction='none')  # (Nc1)
                losses_clean2 = cross_entropy(logits_clean2, y_clean2, reduction='none') + entropy_loss(logits_clean2, reduction='none')  # (Nc2)
                loss_c1_1 = losses_clean1.mean()
                loss_c2_1 = losses_clean2.mean()

                # for selected "unclean" samples, train in a label distribution learning manner (exchange again)
                y_t1 = labels2learn1[indices, :].clone().to(device)
                y_t2 = labels2learn2[indices, :].clone().to(device)
                y_t1.requires_grad = True
                y_t2.requires_grad = True
                y_d1 = F.softmax(y_t1, dim=1) + 1e-8
                y_d2 = F.softmax(y_t2, dim=1) + 1e-8
                logits_unclean1 = logits1[sample_selection['unclean2']]
                logits_unclean2 = logits2[sample_selection['unclean1']]
                y_d_unclean1 = y_d1[sample_selection['unclean2']]
                y_d_unclean2 = y_d2[sample_selection['unclean1']]

                w1 = np.random.beta(cfg.phi, cfg.phi, logits_unclean1.size(0))
                w2 = np.random.beta(cfg.phi, cfg.phi, logits_unclean2.size(0))
                w1 = x1.new(w1).view(logits_unclean1.size(0), 1, 1, 1)
                w2 = x2.new(w2).view(logits_unclean2.size(0), 1, 1, 1)
                idx1 = np.random.choice(sample_selection['clean2'].cpu().numpy(), logits_unclean1.size(0), replace=False if sample_selection['clean2'].size(0) >= logits_unclean1.size(0) else True)
                idx1 = torch.tensor(idx1).to(device)
                idx2 = np.random.choice(sample_selection['clean1'].cpu().numpy(), logits_unclean2.size(0), replace=False if sample_selection['clean1'].size(0) >= logits_unclean2.size(0) else True)
                idx2 = torch.tensor(idx2).to(device)
                mixed_x1 = w1 * x1[sample_selection['unclean2']] + (1-w1) * x1[idx1]
                mixed_x2 = w2 * x2[sample_selection['unclean1']] + (1-w2) * x2[idx2]
                mixed_y1 = w1 * y_d_unclean1 + (1-w1) * y_d1[idx1]
                mixed_y2 = w2 * y_d_unclean2 + (1-w2) * y_d2[idx2]

                mixed_output1 = net1(mixed_x1)
                mixed_output2 = net2(mixed_x2)
                mixed_logits1 = mixed_output1['logits']
                mixed_logits2 = mixed_output2['logits']
                loss_c1_2 = kl_div(F.softmax(mixed_logits1, dim=1) + 1e-8, mixed_y1).mean()
                loss_c2_2 = kl_div(F.softmax(mixed_logits2, dim=1) + 1e-8, mixed_y2).mean()

                loss_c1 = loss_c1_1 + loss_c1_2 * cfg.beta
                loss_c2 = loss_c2_1 + loss_c2_2 * cfg.beta

                # consistency loss
                loss_o1 = cross_entropy(F.softmax(y_t1[sample_selection['clean2']], dim=1), y[sample_selection['clean2']])
                loss_o2 = cross_entropy(F.softmax(y_t2[sample_selection['clean1']], dim=1), y[sample_selection['clean1']])

                # final loss
                loss1 = (1 - cfg.alpha) * loss_c1 + cfg.alpha * loss_o1
                loss2 = (1 - cfg.alpha) * loss_c2 + cfg.alpha * loss_o2

            train_acc1 = accuracy(logits1, y0, topk=(1,))
            train_acc2 = accuracy(logits2, y0, topk=(1,))
            train_loss1.update(loss1.item(), x1.size(0))
            train_loss2.update(loss2.item(), x2.size(0))
            train_accuracy1.update(train_acc1[0], x1.size(0))
            train_accuracy2.update(train_acc2[0], x2.size(0))

            if cfg.use_fp16:
                with amp.scale_loss(loss1, optimizer1) as scaled_loss1:
                    scaled_loss1.backward()
                with amp.scale_loss(loss2, optimizer2) as scaled_loss2:
                    scaled_loss2.backward()
            else:
                loss1.backward()
                loss2.backward()

            optimizer1.step()
            optimizer2.step()
            optimizer1.zero_grad()
            optimizer2.zero_grad()

            if epoch >= cfg.stage1:
                y_t1.data.sub_(cfg.lmd * y_t1.grad.data)
                y_t2.data.sub_(cfg.lmd * y_t2.grad.data)
                labels2learn1[indices, :] = y_t1.detach().clone().cpu().data
                labels2learn2[indices, :] = y_t2.detach().clone().cpu().data
                del y_t1, y_t2

            iter_time.update(time.time() - s, 1)
            if (cfg.log_freq is not None and (it + 1) % cfg.log_freq == 0) or (it + 1 == len(train_loader)):
                total_mem = torch.cuda.get_device_properties(0).total_memory / 2**30
                mem = torch.cuda.memory_reserved() / 2**30
                console_content = f"Epoch:[{epoch + 1:>3d}/{cfg.epochs:>3d}]  " \
                                  f"Iter:[{it + 1:>4d}/{len(train_loader):>4d}]  " \
                                  f"Train Accuracy 1:[{train_accuracy1.avg:6.2f}]  " \
                                  f"Train Accuracy 2:[{train_accuracy2.avg:6.2f}]  " \
                                  f"Loss 1:[{train_loss1.avg:4.4f}]  " \
                                  f"Loss 2:[{train_loss2.avg:4.4f}]  " \
                                  f"GPU-MEM:[{mem:6.3f}/{total_mem:6.3f} Gb]  " \
                                  f"{iter_time.avg:6.2f} sec/iter"
                logger.debug(console_content)

        # evaluate this epoch
        test_accuracy1 = evaluate(test_loader, net1, device)
        test_accuracy2 = evaluate(test_loader, net2, device)
        if test_accuracy1 > best_accuracy1:
            best_accuracy1 = test_accuracy1
            best_epoch1 = epoch + 1
            torch.save(net1.state_dict(), f'{result_dir}/net1_best_epoch.pth')
        if test_accuracy2 > best_accuracy2:
            best_accuracy2 = test_accuracy2
            best_epoch2 = epoch + 1
            torch.save(net2.state_dict(), f'{result_dir}/net2_best_epoch.pth')

        # logging this epoch
        runtime = time.time() - start_time
        logger.info(f'epoch: {epoch + 1:>3d} | '
                    f'train loss(1/2): ({train_loss1.avg:>6.4f}/{train_loss2.avg:>6.4f}) | '
                    f'train accuracy(1/2): ({train_accuracy1.avg:>6.3f}/{train_accuracy2.avg:>6.3f}) | '
                    f'test accuracy(1/2): ({test_accuracy1:>6.3f}/{test_accuracy2:>6.3f}) | '
                    f'epoch runtime: {runtime:6.2f} sec | '
                    f'best accuracy(1/2): ({best_accuracy1:6.3f}/{best_accuracy2:6.3f}) @ epoch: ({best_epoch1:03d}/{best_epoch2:03d})')
        plot_results_cotraining(result_file=f'{result_dir}/log.txt')

    torch.save(labels2learn1, f'{result_dir}/labels_learned.pt')

    # rename results dir -----------------------------------------------------------------------------------------------------------------------------
    best_accuracy = max(best_accuracy1, best_accuracy2)
    os.rename(result_dir, f'{result_dir}-bestAcc_{best_accuracy:.4f}')
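
The sample_selector used in stage 2 is not shown on this page. Judging from how its return value is indexed (each network trains on the samples the other network considers clean), it behaves like a small-loss co-teaching selector. The following is only a sketch of that assumed behaviour, not code from the LCL repository:

import torch


def sample_selector(losses1, losses2, drop_rate):
    """Split each network's batch into small-loss ('clean') and large-loss ('unclean') samples."""
    n = losses1.size(0)
    n_keep = max(1, int((1.0 - drop_rate) * n))   # fraction of samples kept as clean

    order1 = torch.argsort(losses1)   # ascending: smallest losses first
    order2 = torch.argsort(losses2)

    return {
        'clean1': order1[:n_keep], 'unclean1': order1[n_keep:],
        'clean2': order2[:n_keep], 'unclean2': order2[n_keep:],
    }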
Example #4
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    print("torch.cuda.device_count()", torch.cuda.device_count())
    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            print("多卡训练")
            # model = DDP(model, delay_allreduce=True)  # 必须在initialze之后
            # model = nn.DataParallel(model)
            # model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  # 字母小写o,不是零。
            torch.distributed.init_process_group(
                'gloo',
                init_method='file:///tmp/somefile',
                rank=0,
                world_size=1)

            # model = convert_syncbn_model(model)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            # model = DistributedDataParallel(model, delay_allreduce=True)
            # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
            model = nn.DataParallel(model)
            # model = convert_syncbn_model(model)
        else:
            print("单卡训练")
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        model.to(device=0)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)
    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    # model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    for epoch in range(1, epochs + 1):
        if epoch == 5:
            print("balance 数据训练")
            # cfg.DATASETS.ROOT_DIR = '/home/lab3/bi/0716/Veri/ai_city/tools/mix_train_balance_flip.pkl'
            cfg.DATASETS.ROOT_DIR = 'datasets/mix_train_balance.pkl'
            train_loader, val_loader, num_query, num_classes = make_dataloader(
                cfg)

        # model.base._freeze_stages()
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        # evaluator.reset()
        scheduler.step()
        model.train()
        # print(scheduler.get_lr()[0])
        for n_iter, (img, vid) in enumerate(tqdm(train_loader)):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            #grid mask
            # img = grid(img)

            # score, feat,score_f1,score_f2,score_f3,f4,f4_score = model(img, target)
            # score, feat,score_f1,score_f2,feat1,score_layer2 = model(img, target)
            score, feat, score_f1, score_f2, feat1 = model(img, target)

            # print(feat.shape)
            loss = loss_fn(score, feat, target, score_f1, score_f2, feat1)
            # loss = loss_fn(score, feat, target,score_f1,score_f2,feat1,score_layer2)

            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                    # scaled_loss.backward(retain_graph=True)
            else:
                loss.backward()

            # loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            # print(loss_meter.val)
            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_epoch{}.pth'.format(epoch)))
        if epoch == 10:

            reduce_model_dict = model.half().state_dict()
            del_keys = []
            for key in reduce_model_dict.keys():
                if 'class' in key or 'sub1' in key or 'sub2' in key or 'base.fc' in key:
                    del_keys.append(key)
            for key in del_keys:
                del reduce_model_dict[key]

            torch.save(
                reduce_model_dict,
                os.path.join(
                    cfg.OUTPUT_DIR, cfg.MODEL.NAME +
                    str(cfg.INPUT.SIZE_TRAIN[0]) + 'half.pth'))
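
The amp.initialize and amp.scale_loss calls in this example come from NVIDIA's apex library (mixed-precision opt_level 'O1'), which is assumed to be installed. A minimal sketch of that pattern in isolation; the tiny model and data below are placeholders:

import torch
import torch.nn as nn
from apex import amp   # assumes NVIDIA apex is installed

model = nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 'O1' = conservative mixed precision; call this before any DataParallel/DDP wrapping.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

x = torch.randn(8, 128).cuda()
y = torch.randint(0, 10, (8,)).cuda()

optimizer.zero_grad()
loss = nn.functional.cross_entropy(model(x), y)
# Scale the loss so FP16 gradients do not underflow, then unscale before the step.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()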
def do_train(Cfg, model_G, model_Dip, model_Dii, model_D_reid, train_loader,
             val_loader, optimizerG, optimizerDip, optimizerDii, GAN_loss,
             L1_loss, ReID_loss, schedulerG, schedulerDip, schedulerDii):
    log_period = Cfg.SOLVER.LOG_PERIOD
    checkpoint_period = Cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = Cfg.SOLVER.EVAL_PERIOD
    output_dir = Cfg.DATALOADER.LOG_DIR
    # need modified the following in cfg
    epsilon = 0.00001
    margin = 0.4
    ####################################
    device = "cuda"
    epochs = Cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger('pose-transfer-gan.train')
    logger.info('Start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model_G = nn.DataParallel(model_G)
            model_Dii = nn.DataParallel(model_Dii)
            model_Dip = nn.DataParallel(model_Dip)
        model_G.to(device)
        model_Dip.to(device)
        model_Dii.to(device)
        model_D_reid.to(device)
    lossG_meter = AverageMeter()
    lossDip_meter = AverageMeter()
    lossDii_meter = AverageMeter()
    distDreid_meter = AverageMeter()
    fake_ii_pool = ImagePool(50)
    fake_ip_pool = ImagePool(50)

    #evaluator = R1_mAP(num_query, max_rank=50, feat_norm=Cfg.TEST.FEAT_NORM)
    #train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        lossG_meter.reset()
        lossDip_meter.reset()
        lossDii_meter.reset()
        distDreid_meter.reset()
        schedulerG.step()
        schedulerDip.step()
        schedulerDii.step()

        model_G.train()
        model_Dip.train()
        model_Dii.train()
        model_D_reid.eval()
        for iter, batch in enumerate(train_loader):
            img1 = batch['img1'].to(device)
            pose1 = batch['pose1'].to(device)
            img2 = batch['img2'].to(device)
            pose2 = batch['pose2'].to(device)
            input_G = (img1, pose2)

            #forward
            fake_img2 = model_G(input_G)
            optimizerG.zero_grad()

            #train G
            input_Dip = torch.cat((fake_img2, pose2), 1)
            pred_fake_ip = model_Dip(input_Dip)
            loss_G_ip = GAN_loss(pred_fake_ip, True)
            input_Dii = torch.cat((fake_img2, img1), 1)
            pred_fake_ii = model_Dii(input_Dii)
            loss_G_ii = GAN_loss(pred_fake_ii, True)

            loss_L1, _, _ = L1_loss(fake_img2, img2)

            feats_real = model_D_reid(img2)
            feats_fake = model_D_reid(fake_img2)

            dist_cos = torch.acos(
                torch.clamp(torch.sum(feats_real * feats_fake, 1),
                            -1 + epsilon, 1 - epsilon))

            same_id_tensor = torch.FloatTensor(
                dist_cos.size()).fill_(1).to('cuda')
            dist_cos_margin = torch.max(dist_cos - margin,
                                        torch.zeros_like(dist_cos))
            loss_reid = ReID_loss(dist_cos_margin, same_id_tensor)
            factor = loss_reid_factor(epoch)
            loss_G = 0.5 * loss_G_ii * Cfg.LOSS.GAN_WEIGHT + 0.5 * loss_G_ip * Cfg.LOSS.GAN_WEIGHT + loss_L1 + loss_reid * Cfg.LOSS.REID_WEIGHT * factor
            loss_G.backward()
            optimizerG.step()

            #train Dip
            for i in range(Cfg.SOLVER.DG_RATIO):
                optimizerDip.zero_grad()
                real_input_ip = torch.cat((img2, pose2), 1)
                fake_input_ip = fake_ip_pool.query(
                    torch.cat((fake_img2, pose2), 1).data)
                pred_real_ip = model_Dip(real_input_ip)
                loss_Dip_real = GAN_loss(pred_real_ip, True)
                pred_fake_ip = model_Dip(fake_input_ip)
                loss_Dip_fake = GAN_loss(pred_fake_ip, False)
                loss_Dip = 0.5 * Cfg.LOSS.GAN_WEIGHT * (loss_Dip_real +
                                                        loss_Dip_fake)
                loss_Dip.backward()
                optimizerDip.step()
            #train Dii
            for i in range(Cfg.SOLVER.DG_RATIO):
                optimizerDii.zero_grad()
                real_input_ii = torch.cat((img2, img1), 1)
                fake_input_ii = fake_ii_pool.query(
                    torch.cat((fake_img2, img1), 1).data)
                pred_real_ii = model_Dii(real_input_ii)
                loss_Dii_real = GAN_loss(pred_real_ii, True)
                pred_fake_ii = model_Dii(fake_input_ii)
                loss_Dii_fake = GAN_loss(pred_fake_ii, False)
                loss_Dii = 0.5 * Cfg.LOSS.GAN_WEIGHT * (loss_Dii_real +
                                                        loss_Dii_fake)
                loss_Dii.backward()
                optimizerDii.step()

            lossG_meter.update(loss_G.item(), 1)
            lossDip_meter.update(loss_Dip.item(), 1)
            lossDii_meter.update(loss_Dii.item(), 1)
            distDreid_meter.update(dist_cos.mean().item(), 1)
            if (iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] G Loss: {:.3f}, Dip Loss: {:.3f}, Dii Loss: {:.3f}, Base G_Lr: {:.2e}, Base Dip_Lr: {:.2e}, Base Dii_Lr: {:.2e}"
                    .format(epoch, (iter + 1), len(train_loader),
                            lossG_meter.avg, lossDip_meter.avg,
                            lossDii_meter.avg,
                            schedulerG.get_lr()[0],
                            schedulerDip.get_lr()[0],
                            schedulerDii.get_lr()[0]))  #scheduler.get_lr()[0]
                logger.info("ReID Cos Distance: {:.3f}".format(
                    distDreid_meter.avg))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model_G.state_dict(),
                       output_dir + 'model_G_{}.pth'.format(epoch))
            torch.save(model_Dip.state_dict(),
                       output_dir + 'model_Dip_{}.pth'.format(epoch))
            torch.save(model_Dii.state_dict(),
                       output_dir + 'model_Dii_{}.pth'.format(epoch))
        #
        if epoch % eval_period == 0:
            np.save(output_dir + 'train_Bx6x128x64_epoch{}.npy'.format(epoch),
                    fake_ii_pool.images[0].cpu().numpy())
            logger.info('Entering Evaluation...')
            tmp_results = []
            model_G.eval()
            for iter, batch in enumerate(val_loader):
                with torch.no_grad():
                    img1 = batch['img1'].to(device)
                    pose1 = batch['pose1'].to(device)
                    img2 = batch['img2'].to(device)
                    pose2 = batch['pose2'].to(device)
                    input_G = (img1, pose2)
                    fake_img2 = model_G(input_G)
                    tmp_result = torch.cat((img1, img2, fake_img2),
                                           1).cpu().numpy()
                    tmp_results.append(tmp_result)

            np.save(output_dir + 'test_Bx6x128x64_epoch{}.npy'.format(epoch),
                    tmp_results[0])
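
ImagePool above is the usual image history buffer from pix2pix/CycleGAN-style training: the discriminators see a mix of the current fake images and fakes generated earlier, which stabilises adversarial training. A minimal sketch of the assumed behaviour (the project's exact implementation may differ):

import random
import torch


class ImagePool:
    """Keeps up to pool_size previously generated images; query() mixes old and new fakes."""

    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.images = []

    def query(self, images):
        if self.pool_size == 0:
            return images
        out = []
        for img in images:
            img = img.unsqueeze(0)
            if len(self.images) < self.pool_size:
                self.images.append(img)     # buffer not full yet: store and return the new fake
                out.append(img)
            elif random.random() > 0.5:
                idx = random.randrange(self.pool_size)
                out.append(self.images[idx].clone())   # return an old fake ...
                self.images[idx] = img                 # ... and store the new one in its place
            else:
                out.append(img)
        return torch.cat(out, dim=0)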
Example #6
def do_train(Cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = Cfg.LOG_PERIOD
    checkpoint_period = Cfg.CHECKPOINT_PERIOD
    eval_period = Cfg.EVAL_PERIOD
    output_dir = Cfg.LOG_DIR

    device = "cuda"
    epochs = Cfg.MAX_EPOCHS

    logger = logging.getLogger('{}.train'.format(Cfg.PROJECT_NAME))
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    evaluator = R1_mAP(num_query, max_rank=50, feat_norm=Cfg.FEAT_NORM)
    #train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()

        model.train()
        for iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            score, feat = model(img, target)
            loss = loss_fn(score, feat, target)

            loss.backward()
            optimizer.step()
            if 'center' in Cfg.LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / Cfg.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))
        scheduler.step()
        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(),
                       output_dir + Cfg.MODEL_NAME + '_{}.pth'.format(epoch))

        if epoch % eval_period == 0:
            model.eval()
            for iter, (img, vid, camid) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))

            cmc, mAP, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                    r, cmc[r - 1]))
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, last_epoch):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
        else:
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level='O1')

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    for epoch in range(last_epoch, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()

        model.train()
        try:
            for n_iter, (img, vid) in enumerate(train_loader):
                optimizer.zero_grad()
                optimizer_center.zero_grad()
                img = img.to(device)
                target = vid.to(device)

                score, feat = model(img, target)
                loss = loss_fn(score, feat, target)

                if cfg.SOLVER.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info(
                        "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                        .format(epoch, (n_iter + 1), len(train_loader),
                                loss_meter.avg, acc_meter.avg,
                                scheduler.get_lr()[0]))
            scheduler.step()
            end_time = time.time()
            time_per_batch = (end_time - start_time) / (n_iter + 1)
            logger.info(
                "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                .format(epoch, time_per_batch,
                        train_loader.batch_size / time_per_batch))

            if epoch % checkpoint_period == 0:
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    },
                    os.path.join(cfg.OUTPUT_DIR,
                                 cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
        except:
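            # Save an emergency checkpoint if this epoch is interrupted or fails.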
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                },
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
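
This variant stores the epoch number together with the model and optimizer state dicts and accepts a last_epoch argument, so an interrupted run can be resumed. A small round-trip sketch of that checkpoint layout; the file name and the tiny stand-in model are illustrative only:

import torch
import torch.nn as nn

# Stand-in model/optimizer, only to demonstrate the checkpoint format used above.
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Save in the same {'epoch', 'model_state_dict', 'optimizer_state_dict'} layout.
torch.save({'epoch': 40,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, 'checkpoint_40.pth')

# Resume: restore both state dicts and continue from the following epoch,
# which is what the last_epoch argument of do_train expects.
ckpt = torch.load('checkpoint_40.pth', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
last_epoch = ckpt['epoch'] + 1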
Example #8
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        # dist.init_process_group(backend='nccl',init_method='env://')

        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))

            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

            model = nn.DataParallel(model)
            # model = torch.nn.parallel.DistributedDataParallel(model,find_unused_parameters=True)
        else:
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level='O1')

    loss_meter = AverageMeter()
    all_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    pcb_losses = AverageMeter()
    pcb_merge_losses = AverageMeter()

    pcb_optimizer = get_pcb_optimizer(model)
    pcb_scheduler = WarmupMultiStepLR(pcb_optimizer, cfg.SOLVER.STEPS,
                                      cfg.SOLVER.GAMMA,
                                      cfg.SOLVER.WARMUP_FACTOR,
                                      cfg.SOLVER.WARMUP_EPOCHS,
                                      cfg.SOLVER.WARMUP_METHOD)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        all_loss_meter.reset()
        acc_meter.reset()
        pcb_losses.reset()
        pcb_merge_losses.reset()

        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            if cfg.MODEL.IF_USE_PCB:
                score, feat, pcb_out = model(img, target)

                loss = loss_fn(score, feat, target)
                loss0, loss1, loss2, loss3, loss4, loss5, loss_merge = pcb_loss_forward(
                    pcb_feat=pcb_out, targets=target)
                pcb_loss = (loss0 + loss1 + loss2 + loss3 + loss4 + loss5) / 6
                all_loss = loss + 0.5 * pcb_loss + 0.5 * loss_merge

                if cfg.SOLVER.FP16:
                    with amp.scale_loss(all_loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    all_loss.backward()

                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                all_loss_meter.update(all_loss.item(), img.shape[0])
                pcb_losses.update(pcb_loss.item(), img.shape[0])
                pcb_merge_losses.update(loss_merge.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info(
                        "Epoch[{}] Iteration[{}/{}] All_Loss: {:.3f},Global_Loss: {:.3f},PCB_Loss: {:.3f},Merge_Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                        .format(epoch, (n_iter + 1), len(train_loader),
                                all_loss_meter.avg, loss_meter.avg,
                                pcb_losses.avg, pcb_merge_losses.avg,
                                acc_meter.avg,
                                scheduler.get_lr()[0]))
            else:
                score, feat = model(img, target)

                loss = loss_fn(score, feat, target)
                if cfg.SOLVER.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                    for param in center_criterion.parameters():
                        param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                    optimizer_center.step()

                acc = (score.max(1)[1] == target).float().mean()
                loss_meter.update(loss.item(), img.shape[0])
                # all_loss_meter.update(all_loss.item(), img.shape[0])
                # pcb_losses.update(pcb_loss.item(), img.shape[0])
                acc_meter.update(acc, 1)

                if (n_iter + 1) % log_period == 0:
                    logger.info(
                        "Epoch[{}] Iteration[{}/{}] Global_Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                        .format(epoch, (n_iter + 1), len(train_loader),
                                loss_meter.avg, acc_meter.avg,
                                scheduler.get_lr()[0]))

            # if cfg.SOLVER.FP16:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()
            # else:
            #     loss.backward(retain_graph=True)

            # loss0, loss1, loss2, loss3, loss4, loss5 = pcb_loss_forward(pcb_feat=pcb_out, targets=target)
            # pcb_loss = (loss0 + loss1 + loss2 + loss3 + loss4 + loss5) / 6

            # all_loss = 0.1 * loss + 0.9 * pcb_loss

            # if cfg.SOLVER.FP16:
            #     with amp.scale_loss(all_loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()
            # else:
            #     all_loss.backward()

            # wenli: training with multiple tasks this way may overfit; this method is deprecated
            # pcb_optimizer.zero_grad()
            # torch.autograd.backward([loss0, loss1, loss2, loss3, loss4, loss5],
            #                         [torch.ones(1)[0].cuda(), torch.ones(1)[0].cuda(), torch.ones(1)[0].cuda(),
            #                          torch.ones(1)[0].cuda(), torch.ones(1)[0].cuda(), torch.ones(1)[0].cuda(),
            #                          torch.ones(1)[0].cuda()])
            # pcb_optimizer.step()

            # optimizer.step()
            # if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
            #     for param in center_criterion.parameters():
            #         param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
            #     optimizer_center.step()

            # acc = (score.max(1)[1] == target).float().mean()
            # loss_meter.update(loss.item(), img.shape[0])
            # all_loss_meter.update(all_loss.item(), img.shape[0])
            # pcb_losses.update(pcb_loss.item(), img.shape[0])
            # acc_meter.update(acc, 1)

            # if (n_iter + 1) % log_period == 0:
            #     logger.info("Epoch[{}] Iteration[{}/{}] All_Loss: {:.3f},Global_Loss: {:.3f},PCB_Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
            #                 .format(epoch, (n_iter + 1), len(train_loader),
            #                         all_loss_meter.avg,loss_meter.avg, pcb_losses.avg,acc_meter.avg, scheduler.get_lr()[0]))

        #pcb_scheduler.step()
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
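
Several of these examples rescale the center-loss gradients with param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT) before stepping optimizer_center. The total loss contains CENTER_LOSS_WEIGHT * center_loss, so the centers' gradients carry that factor; dividing it back out makes the center update independent of the loss weight. A toy sketch of the idea (names and sizes are illustrative):

import torch

weight = 0.0005                                   # plays the role of CENTER_LOSS_WEIGHT
centers = torch.nn.Parameter(torch.randn(10, 4))  # one center per class
optimizer_center = torch.optim.SGD([centers], lr=0.5)

features = torch.randn(8, 4)
labels = torch.randint(0, 10, (8,))
center_loss = ((features - centers[labels]) ** 2).sum(dim=1).mean()

# The weighted loss scales the centers' gradients by `weight` ...
(weight * center_loss).backward()
# ... so undo that factor before the dedicated center optimizer steps.
for param in [centers]:
    param.grad.data *= (1.0 / weight)
optimizer_center.step()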
def do_train(Cfg, model, train_loader, test_loader, optimizer, scheduler, loss_fn):
    log_period = Cfg.LOG_PERIOD
    checkpoint_period = Cfg.CHECKPOINT_PERIOD
    output_dir = Cfg.LOG_DIR

    device = "cuda"
    epochs = Cfg.MAX_EPOCHS

    logger = logging.getLogger('{}'.format(Cfg.PROJECT_NAME))
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    precision_meter = AverageMeter()
    recall_meter = AverageMeter()

    #train
    for epoch in range(1, epochs+1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        precision_meter.reset()
        recall_meter.reset()
        scheduler.step()

        model.train()
        for iter, ((feat, adj, cid, h1id), gtmat) in enumerate(train_loader):
            optimizer.zero_grad()
            feat, adj, cid, h1id, gtmat = map(lambda x: x.cuda(),
                                              (feat, adj, cid, h1id, gtmat))
            pred = model(feat, adj, h1id)
            labels = make_labels(gtmat).long()
            loss = loss_fn(pred, labels)
            p, r, acc = accuracy(pred, labels)

            loss.backward()
            optimizer.step()

            loss_meter.update(loss.item(), feat.size(0))
            acc_meter.update(acc.item(), feat.size(0))
            precision_meter.update(p, feat.size(0))
            recall_meter.update(r, feat.size(0))

            if (iter+1) % log_period == 0:
                logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, P:{:.3f}, R:{:.3f}, Base Lr: {:.2e}"
                            .format(epoch, (iter+1), len(train_loader),
                                    loss_meter.avg, acc_meter.avg, precision_meter.avg, recall_meter.avg, scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (iter + 1)
        logger.info("Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                    .format(epoch, time_per_batch, train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(model.state_dict(), output_dir+Cfg.MODEL_NAME+'_{}.pth'.format(epoch))
            model.eval()
            acc_meter.reset()
            precision_meter.reset()
            recall_meter.reset()
            for iter, ((feat, adj, cid, h1id, unique_nodes_list), gtmat) in enumerate(test_loader):
                feat, adj, cid, h1id, gtmat = map(lambda x: x.cuda(),
                                                  (feat, adj, cid, h1id, gtmat))
                pred = model(feat, adj, h1id)
                labels = make_labels(gtmat).long()
                p, r, acc = accuracy(pred, labels)
                acc_meter.update(acc.item(), feat.size(0))
                precision_meter.update(p, feat.size(0))
                recall_meter.update(r, feat.size(0))

            logger.info("Test Result: Acc: {:.3f}, P:{:.3f}, R:{:.3f}"
                        .format(acc_meter.avg, precision_meter.avg, recall_meter.avg))
Example #10
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, local_rank):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("transreid.train")
    logger.info('start training')
    _LOCAL_PROCESS_GROUP = None
    if device:
        model.to(local_rank)
        if torch.cuda.device_count() > 1 and cfg.MODEL.DIST_TRAIN:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], find_unused_parameters=True)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    evaluator = R1_mAP_eval(num_query,
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)
    scaler = amp.GradScaler()
    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step(epoch)
        model.train()
        for n_iter, (img, vid, target_cam,
                     target_view) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            target_cam = target_cam.to(device)
            target_view = target_view.to(device)
            with amp.autocast(enabled=True):
                score, feat = model(img,
                                    target,
                                    cam_label=target_cam,
                                    view_label=target_view)
                loss = loss_fn(score, feat, target, target_cam)

            scaler.scale(loss).backward()

            scaler.step(optimizer)
            scaler.update()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                scaler.step(optimizer_center)
                scaler.update()
            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target).float().mean()
            else:
                acc = (score.max(1)[1] == target).float().mean()

            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            torch.cuda.synchronize()
            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler._get_lr(epoch)[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        if cfg.MODEL.DIST_TRAIN:
            pass
        else:
            logger.info(
                "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                .format(epoch, time_per_batch,
                        train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    torch.save(
                        model.state_dict(),
                        os.path.join(cfg.OUTPUT_DIR,
                                     cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
            else:
                torch.save(
                    model.state_dict(),
                    os.path.join(cfg.OUTPUT_DIR,
                                 cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    model.eval()
                    for n_iter, (img, vid, camid, camids, target_view,
                                 _) in enumerate(val_loader):
                        with torch.no_grad():
                            img = img.to(device)
                            camids = camids.to(device)
                            target_view = target_view.to(device)
                            feat = model(img,
                                         cam_label=camids,
                                         view_label=target_view)
                            evaluator.update((feat, vid, camid))
                    cmc, mAP, _, _, _, _, _ = evaluator.compute()
                    logger.info("Validation Results - Epoch: {}".format(epoch))
                    logger.info("mAP: {:.1%}".format(mAP))
                    for r in [1, 5, 10]:
                        logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                            r, cmc[r - 1]))
                    torch.cuda.empty_cache()
            else:
                model.eval()
                for n_iter, (img, vid, camid, camids, target_view,
                             _) in enumerate(val_loader):
                    with torch.no_grad():
                        img = img.to(device)
                        camids = camids.to(device)
                        target_view = target_view.to(device)
                        feat = model(img,
                                     cam_label=camids,
                                     view_label=target_view)
                        evaluator.update((feat, vid, camid))
                cmc, mAP, _, _, _, _, _ = evaluator.compute()
                logger.info("Validation Results - Epoch: {}".format(epoch))
                logger.info("mAP: {:.1%}".format(mAP))
                for r in [1, 5, 10]:
                    logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                        r, cmc[r - 1]))
                torch.cuda.empty_cache()
Example #11
def do_train(
    cfg,
    model,
    center_criterion,
    train_loader,
    val_loader,
    optimizer,
    optimizer_center,
    scheduler,
    loss_fn,
    num_query,
    local_rank,
):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')
    _LOCAL_PROCESS_GROUP = None
    if device:
        model.to(local_rank)
        if torch.cuda.device_count() > 1 and cfg.MODEL.DIST_TRAIN:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], find_unused_parameters=True)

    scaler = amp.GradScaler()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        scheduler.step(epoch)
        model.train()
        for n_iter, (img, vid, target_cam) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target_cam = target_cam.to(device)

            if cfg.SOLVER.FP16_ENABLED:
                #### FP16 training
                with amp.autocast(enabled=True):
                    score, feat = model(img, target_cam, cam_label=None)
                    loss = loss_fn(score, feat, target_cam)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                score, feat = model(img, target_cam, cam_label=None)
                loss = loss_fn(score, feat, target_cam, target_cam)
                loss.backward()
                optimizer.step()

            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target_cam).float().mean()
            else:
                acc = (score.max(1)[1] == target_cam).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            torch.cuda.synchronize()
            if (n_iter + 1) % log_period == 0:
                if cfg.SOLVER.WARMUP_METHOD == 'cosine':
                    base_lr = scheduler._get_lr(epoch)[0]
                else:
                    base_lr = scheduler.get_lr()[0]
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg, base_lr))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)

        if cfg.MODEL.DIST_TRAIN:
            pass
        else:
            logger.info(
                "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
                .format(epoch, time_per_batch,
                        train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            if cfg.MODEL.DIST_TRAIN:
                if dist.get_rank() == 0:
                    torch.save(
                        model.module.state_dict(),
                        os.path.join(cfg.OUTPUT_DIR,
                                     cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
            else:
                torch.save(
                    model.state_dict(),
                    os.path.join(cfg.OUTPUT_DIR,
                                 cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
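
Every training loop above and below depends on an AverageMeter with reset(), update(val, n) and an avg attribute. Its definition is not part of these examples, so the following is a minimal sketch consistent with that usage; the repository's own class may track more state.

class AverageMeter:
    """Minimal sketch matching the reset()/update(val, n)/avg usage in these loops."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)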
Example #12
def main():

    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune kernels, which trades away the
    # determinism requested by the flag above for speed
    cudnn.benchmark = True
    #parser = argparse.ArgumentParser(description="ReID Baseline Training")
    #parser.add_argument(
    #"--config_file", default="", help="path to config file", type=str)

    #parser.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER)

    #args = parser.parse_args()
    config_file = 'configs/baseline_veri_r101_a.yml'
    if config_file != "":
        cfg.merge_from_file(config_file)
    #cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger = setup_logger("reid_baseline", output_dir, if_train=True)
    logger.info("Saving model in the path :{}".format(cfg.OUTPUT_DIR))
    logger.info(config_file)

    if config_file != "":
        logger.info("Loaded configuration file {}".format(config_file))
        with open(config_file, 'r') as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

    path = 'D:/Python_SMU/Veri/verigms/gms/'
    pkl = {}
    entries = os.listdir(path)
    for name in entries:
        with open(path + name, 'rb') as f:
            if name == 'featureMatrix.pkl':
                s = name[0:13]
            else:
                s = name[0:3]
            pkl[s] = pickle.load(f)

    with open('cids.pkl', 'rb') as handle:
        b = pickle.load(handle)

    with open('index.pkl', 'rb') as handle:
        c = pickle.load(handle)

    train_transforms, val_transforms, dataset, train_set, val_set = make_dataset(
        cfg, pkl_file='index.pkl')

    num_workers = cfg.DATALOADER.NUM_WORKERS
    num_classes = dataset.num_train_pids
    #pkl_f = 'index.pkl'
    # map the 3-digit folder id in each image filename to its person id
    pidx = {}
    for img_path, pid, _, _ in dataset.train:
        fname = img_path.split('\\')[-1]
        folder = fname[1:4]
        pidx[folder] = pid

    if 'triplet' in cfg.DATALOADER.SAMPLER:
        train_loader = DataLoader(train_set,
                                  batch_size=cfg.SOLVER.IMS_PER_BATCH,
                                  sampler=RandomIdentitySampler(
                                      dataset.train, cfg.SOLVER.IMS_PER_BATCH,
                                      cfg.DATALOADER.NUM_INSTANCE),
                                  num_workers=num_workers,
                                  pin_memory=True,
                                  collate_fn=train_collate_fn)
    elif cfg.DATALOADER.SAMPLER == 'softmax':
        print('using softmax sampler')
        train_loader = DataLoader(train_set,
                                  batch_size=cfg.SOLVER.IMS_PER_BATCH,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=True,
                                  collate_fn=train_collate_fn)
    else:
        print('unsupported sampler! expected softmax or triplet but got {}'.
              format(cfg.DATALOADER.SAMPLER))

    print("train loader loaded successfully")

    val_loader = DataLoader(val_set,
                            batch_size=cfg.TEST.IMS_PER_BATCH,
                            shuffle=False,
                            num_workers=num_workers,
                            pin_memory=True,
                            collate_fn=train_collate_fn)
    print("val loader loaded successfully")

    if cfg.MODEL.PRETRAIN_CHOICE == 'finetune':
        model = make_model(cfg, num_class=576)
        model.load_param_finetune(cfg.MODEL.PRETRAIN_PATH)
        print('Loading pretrained model for finetuning......')
    else:
        model = make_model(cfg, num_class=num_classes)

    loss_func, center_criterion = make_loss(cfg, num_classes=num_classes)

    optimizer, optimizer_center = make_optimizer(cfg, model, center_criterion)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                  cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
                                  cfg.SOLVER.WARMUP_EPOCHS,
                                  cfg.SOLVER.WARMUP_METHOD)

    print("model,optimizer, loss, scheduler loaded successfully")

    height, width = cfg.INPUT.SIZE_TRAIN

    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    evaluator = R1_mAP_eval(len(dataset.query),
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)
    if torch.cuda.device_count() > 1:
        model.module.base._freeze_stages()
    else:
        model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    data_index = search(pkl)
    print("Ready for training")

    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, label, index, pid, cid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            #img = img.to(device)
            #target = vid.to(device)
            trainX, trainY = torch.zeros(
                (train_loader.batch_size * 3, 3, height, width),
                dtype=torch.float32), torch.zeros(
                    (train_loader.batch_size * 3), dtype=torch.int64)

            for i in range(train_loader.batch_size):
                labelx = label[i]
                indexx = index[i]
                cidx = pid[i]
                if indexx > len(pkl[labelx]) - 1:
                    indexx = len(pkl[labelx]) - 1

                a = pkl[labelx][indexx]
                minpos = np.argmin(ma.masked_where(a == 0, a))
                pos_dic = train_set[data_index[cidx][1] + minpos]
                #print(pos_dic[1])
                neg_label = int(labelx)

                while True:
                    neg_label = random.choice(range(1, 770))
                    if neg_label != int(labelx) and os.path.isdir(
                            os.path.join('D:/datasets/veri-split/train',
                                         strint(neg_label))):
                        break

                negative_label = strint(neg_label)
                neg_cid = pidx[negative_label]
                neg_index = random.choice(range(0, len(pkl[negative_label])))

                neg_dic = train_set[data_index[neg_cid][1] + neg_index]
                trainX[i] = img[i]
                trainX[i + train_loader.batch_size] = pos_dic[0]
                trainX[i + (train_loader.batch_size * 2)] = neg_dic[0]
                trainY[i] = cidx
                trainY[i + train_loader.batch_size] = pos_dic[3]
                trainY[i + (train_loader.batch_size * 2)] = neg_dic[3]

            #print(trainY)
            trainX = trainX.cuda()
            trainY = trainY.cuda()

            score, feat = model(trainX, trainY)
            loss = loss_func(score, feat, trainY)
            loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == trainY).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for n_iter, (img, vid, camid, _, _) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))

            cmc, mAP, _, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                    r, cmc[r - 1]))
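
Example #12 builds each batch as [anchor | positive | negative] stacked along the batch dimension and then passes only (score, feat, trainY) to loss_func. If that loss is a label-driven batch-hard triplet loss (an assumption, since the function is not shown here), a minimal sketch would be:

import torch
import torch.nn.functional as F

def batch_hard_triplet_loss(feat, labels, margin=0.3):
    """Sketch of a label-driven batch-hard triplet loss; assumes every batch
    contains at least two identities, as the anchor/positive/negative packing
    in Example #12 guarantees."""
    dist = torch.cdist(feat, feat, p=2)                     # pairwise L2 distances
    same = labels.unsqueeze(0) == labels.unsqueeze(1)       # positive-pair mask
    hardest_pos = (dist * same.float()).max(dim=1).values   # farthest positive per anchor
    hardest_neg = dist.masked_fill(same, float('inf')).min(dim=1).values
    return F.relu(hardest_pos - hardest_neg + margin).mean()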
Example #13
def do_train(cfg, model, center_criterion, train_loader, train_loader_b,
             val_loader, optimizer, optimizer_center, scheduler, loss_fn,
             num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model.to(device)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            model = nn.DataParallel(model)
        else:
            model.to(device)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        # model.to(device)
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    evaluator = R1_mAP_eval(num_query,
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)
    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))
    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        if epoch <= 80:
            loader = train_loader
        else:
            loader = train_loader_b
        for n_iter, (img, vid) in enumerate(loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            if 'bdb' in cfg.MODEL.NAME:
                score, score2, feat1, feat2 = model(img, target)
                loss = loss_fn([score, score2], [feat1, feat2], target)
            else:
                score, feat = model(img, target)
                if cfg.DATALOADER.SAMPLER == 'softmax':
                    loss = F.cross_entropy(score, target)
                else:
                    loss = loss_fn(score, feat, target, model)

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(loader), loss_meter.avg,
                            acc_meter.avg,
                            scheduler.get_lr()[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
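
Example #13 relies on NVIDIA apex (amp.initialize with opt_level 'O1' and amp.scale_loss), which has been superseded by torch.cuda.amp. A sketch of the same inner step with the built-in API, keeping the loss_fn(score, feat, target, model) signature from the example, would be:

import torch
from torch.cuda import amp

def amp_train_step(model, img, target, loss_fn, optimizer, scaler):
    """One mixed-precision step with torch.cuda.amp, sketching what the apex
    'O1' code path in Example #13 would look like with the built-in API."""
    optimizer.zero_grad()
    with amp.autocast():
        score, feat = model(img, target)
        loss = loss_fn(score, feat, target, model)
    scaler.scale(loss).backward()   # scale the loss so FP16 gradients do not underflow
    scaler.step(optimizer)          # unscales the gradients, then steps
    scaler.update()
    return loss, score

The GradScaler would be created once before the epoch loop, mirroring the scaler = amp.GradScaler() call in Example #11.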
Example #14
    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    evaluator = R1_mAP_eval(num_query, max_rank=50, feat_norm='yes')
    model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(-1))
    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            feat = model(img, target)
            loss, score = loss_func(feat, target)

            loss.backward()
            optimizer.step()
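
The fragment in Example #14 expects loss_func(feat, target) to return both the loss and the classification scores. That function is not shown; the hypothetical version below (a plain linear head plus cross-entropy, purely illustrative) only demonstrates the expected return shape.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ClassifierLoss(nn.Module):
    """Hypothetical loss_func matching `loss, score = loss_func(feat, target)`;
    the real head (e.g. an ArcFace/CosFace-style margin classifier) may differ."""

    def __init__(self, feat_dim, num_classes):
        super().__init__()
        self.classifier = nn.Linear(feat_dim, num_classes, bias=False)

    def forward(self, feat, target):
        score = self.classifier(feat)           # logits, usable for accuracy logging
        loss = F.cross_entropy(score, target)
        return loss, score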
Example #15
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS
    start_epoch = cfg.SOLVER.START_EPOCH

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter1 = AverageMeter()
    acc_meter2 = AverageMeter()
    # acc_cam = AverageMeter()
    evaluator = R1_mAP_eval(num_query,
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)
    if torch.cuda.device_count() > 1:
        model.module.base._freeze_stages()
    else:
        model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))
    # train
    for epoch in range(start_epoch, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter1.reset()
        acc_meter2.reset()
        # acc_cam.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid, _) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            # camid = camid.to(device)

            scores, feat = model(img, target)
            loss = loss_fn(scores, feat, target)

            loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            acc = [(score.max(1)[1] == target).float().mean()
                   for score in scores]
            # cam_acc = (cam_score.max(1)[1] == camid).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter1.update(acc[0].item(), 1)
            acc_meter2.update(acc[1].item(), 1)
            # acc_cam.update(cam_acc.item(), 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc1: {:.3f}, Acc2: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter1.avg, acc_meter2.avg,
                            scheduler.get_lr()[0]))
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            if torch.cuda.device_count() > 1:
                torch.save(
                    {
                        'state_dict': model.module.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    },
                    os.path.join(cfg.OUTPUT_DIR,
                                 cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
            else:
                torch.save(
                    {
                        'state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    },
                    os.path.join(cfg.OUTPUT_DIR,
                                 cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for n_iter, (img, vid, camid, _, _) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))

            cmc, mAP, _, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                    r, cmc[r - 1]))
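
Example #15 checkpoints both the model and the optimizer state into one dict. A matching resume snippet, assuming the 'state_dict'/'optimizer_state_dict' keys used above and unwrapping DataParallel when present, could look like this:

import torch

def resume_from_checkpoint(path, model, optimizer, device='cuda'):
    """Sketch of restoring the checkpoint format written in Example #15."""
    ckpt = torch.load(path, map_location=device)
    target = model.module if hasattr(model, 'module') else model  # unwrap DataParallel
    target.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return model, optimizer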
Example #16
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        #print("cuda个数", torch.cuda.device_count())
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    # train
    scaler = GradScaler()
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()

        model.train()
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.cuda(non_blocking=True)
            target = vid.cuda(non_blocking=True)
            with autocast():
                score, feat = model(img, target)
                loss = loss_fn(score, feat, target)
            scaler.scale(loss).backward()
            # optimizer.module.step()
            scaler.step(optimizer)
            #scaler.update()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                scaler.step(optimizer_center)
            scaler.update()
            acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
Example #17
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, writer):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS
    tmp_input_data = torch.rand(
        (10, 3, cfg.INPUT.SIZE_TRAIN[0], cfg.INPUT.SIZE_TRAIN[1]))
    writer.add_graph(model, (tmp_input_data, ))

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
            model = model.cuda()
        else:
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level='O1')

    loss_meter = AverageMeter()
    id_loss_meter = AverageMeter()
    tri_loss_meter = AverageMeter()
    cen_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    lr_meter = AverageMeter()
    if cfg.SOLVER.SWA:
        swa_model = torch.optim.swa_utils.AveragedModel(model)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        id_loss_meter.reset()
        tri_loss_meter.reset()
        cen_loss_meter.reset()
        lr_meter.reset()
        model.train()
        if cfg.SOLVER.GRADUAL_UNLOCK:
            model.base.gradual_unlock(cfg.SOLVER.MAX_EPOCHS, epoch)
        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)
            if cfg.DATASETS.MIXUP:
                img, target_a, target_b, lam = mixup_data(img, target)
            score, feat = model(img, target)

            if cfg.DATASETS.MIXUP:
                all_loss = mixup_criterion(loss_fn, score, feat, target_a,
                                           target_b, lam)
            else:
                all_loss = loss_fn(score, feat, target)
            loss, id_loss, tri_loss, cen_loss = all_loss
            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            loss_meter.update(loss.item(), img.shape[0])
            id_loss_meter.update(id_loss.item(), img.shape[0])
            if torch.is_tensor(tri_loss):
                tri_loss_meter.update(tri_loss.item(), img.shape[0])
            else:
                tri_loss_meter.update(tri_loss, 1)
            if torch.is_tensor(cen_loss):
                cen_loss_meter.update(cen_loss.item(), img.shape[0])
            else:
                cen_loss_meter.update(cen_loss, 1)
            acc = (score.max(1)[1] == target).float().mean()
            acc_meter.update(acc, 1)
            lr_meter.update(scheduler.get_last_lr()[0])

            writer.add_scalar('data/total_loss', loss_meter.avg,
                              (epoch - 1) * len(train_loader) + n_iter)
            writer.add_scalar('data/id_loss', id_loss_meter.avg,
                              (epoch - 1) * len(train_loader) + n_iter)
            writer.add_scalar('data/tri_loss', tri_loss_meter.avg,
                              (epoch - 1) * len(train_loader) + n_iter)
            writer.add_scalar('data/cen_loss', cen_loss_meter.avg,
                              (epoch - 1) * len(train_loader) + n_iter)
            writer.add_scalar('data/learning_rate', lr_meter.avg,
                              (epoch - 1) * len(train_loader) + n_iter)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_last_lr()[0]))
        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_{}.pth'.format(epoch))
            torch.save(model.state_dict(), src_path)
            try:
                dest_root = os.path.join(
                    '/mnt/nfs-internstorage/user/zjf/NAIC2020/models',
                    cfg.SAVE_FLAG)
                if not os.path.exists(dest_root):
                    os.mkdir(dest_root)
                dst_path = os.path.join(
                    dest_root, cfg.MODEL.NAME + '_{}.pth'.format(epoch))
                shutil.copy(src_path, dst_path)
            except Exception:
                print('Could not back up the checkpoint, skipping...')
        if cfg.SOLVER.SWA and epoch in cfg.SOLVER.SWA_START:
            swa_model.update_parameters(model)
            logger.info('swa combine the {} epoch model'.format(epoch))
    if cfg.SOLVER.SWA:
        try:
            swa_model.cpu()
            torch.optim.swa_utils.update_bn(train_loader, swa_model)
            swa_model.cuda()
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_swa.pth')
            torch.save(swa_model.state_dict(), src_path)
            logger.info('swa model is successfully saved.')
        except Exception:
            logger.info('swa model save failed.')
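
Example #17 calls mixup_data and mixup_criterion, which are not defined in this excerpt. The sketch below is a standard mixup implementation adapted to the loss signature used above, where loss_fn returns the tuple (loss, id_loss, tri_loss, cen_loss); it is an assumption about those helpers, not their actual source.

import numpy as np
import torch

def mixup_data(x, y, alpha=1.0):
    """Sketch of mixup: convex-combine images inside the batch and keep both
    label sets plus the mixing coefficient."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1.0 - lam) * x[index]
    return mixed_x, y, y[index], lam

def mixup_criterion(loss_fn, score, feat, y_a, y_b, lam):
    """Sketch: loss_fn is assumed to return (loss, id_loss, tri_loss, cen_loss)
    as in Example #17, so each term is mixed separately."""
    terms_a = loss_fn(score, feat, y_a)
    terms_b = loss_fn(score, feat, y_b)
    return tuple(lam * a + (1.0 - lam) * b for a, b in zip(terms_a, terms_b))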
Example #18
def do_train_xbm(cfg, model, center_criterion, train_loader, val_loader,
                 optimizer, optimizer_center, scheduler, loss_fn, num_query):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD

    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model.to(device)
            #model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
            model = nn.DataParallel(model)
        else:
            model.to(device)
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    xbm1 = XBM(6000, 2048)
    evaluator = R1_mAP_eval(num_query,
                            max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)
    # model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))
    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        evaluator.reset()
        scheduler.step()
        model.train()
        for n_iter, (img, vid, cps) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            cps = cps.to(device)
            target = vid.to(device)

            score, feat = model(img, target)
            if epoch >= 10:
                xbm1.enqueue_dequeue(feat.detach(), target.detach(),
                                     cps.detach())
                xbm1_feats, xbm1_targets, xbm1_cps = xbm1.get()
                loss = (torch.nn.functional.cross_entropy(score, target)
                        + TripletLoss()(feat, (target, cps))[0]
                        + TripletLoss_XBM()(feat, xbm1_feats, (target, cps),
                                            (xbm1_targets, xbm1_cps))[0])
            else:
                loss = torch.nn.functional.cross_entropy(
                    score, target) + TripletLoss()(feat, (target, cps))[0]
                # xbm_loss = TripletLoss()(xbm_feats, xbm_targets)
                # loss = (loss + xbm_loss[0])
            # loss = loss_fn(score, feat, target)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.backward()
            optimizer.step()
            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()
            if isinstance(score, list):
                acc = (score[0].max(1)[1] == target).float().mean()
            else:
                acc = (score.max(1)[1] == target).float().mean()
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))

        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
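
Example #18 depends on an XBM (cross-batch memory) buffer created as XBM(6000, 2048) and driven through enqueue_dequeue(feats, targets, cps) and get(). A minimal ring-buffer sketch consistent with that interface (the project's real implementation may handle wrap-around differently) is:

import torch

class XBM:
    """Ring-buffer sketch of a cross-batch memory bank matching the
    enqueue_dequeue()/get() calls in Example #18."""

    def __init__(self, size, feat_dim):
        self.size = size
        self.feats = torch.zeros(size, feat_dim).cuda()
        self.targets = torch.zeros(size, dtype=torch.long).cuda()
        self.cps = torch.zeros(size, dtype=torch.long).cuda()
        self.ptr = 0
        self.is_full = False

    def enqueue_dequeue(self, feats, targets, cps):
        n = feats.size(0)
        if self.ptr + n > self.size:   # wrap around once the buffer is full
            self.ptr = 0
            self.is_full = True
        self.feats[self.ptr:self.ptr + n] = feats
        self.targets[self.ptr:self.ptr + n] = targets
        self.cps[self.ptr:self.ptr + n] = cps
        self.ptr += n

    def get(self):
        end = self.size if self.is_full else self.ptr
        return self.feats[:end], self.targets[:end], self.cps[:end]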