Example #1
def validate(val_list, model, criterion):
    print("begin test")
    test_loader = listDataset(
        val_list,
        shuffle=False,  # keep validation deterministic
        transform=ST.Compose([
            ST.ToNumpy(),
            ST.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
        ]),
        train=False,  # no training-time augmentation during validation
        seen=model.seen,
        batch_size=args.batch_size,
        num_workers=args.workers,
    )

    model.eval()
    mae = 0
    for i, (img, target) in enumerate(test_loader):
        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        with flow.no_grad():
            output = model(img)  # model and input are already on cuda
        mae += abs(output.sum().numpy() - target.sum())
    mae = mae / len(test_loader)
    print(" * MAE {mae:.3f} ".format(mae=mae))
    return mae
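
Every snippet in this collection funnels its preprocessing through ST.Compose. The class itself is not shown here; as a reference, a minimal sketch of the torchvision-style chaining it presumably implements:

class Compose(object):
    """Apply a list of transforms in order."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        for t in self.transforms:
            img = t(img)
        return img
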
Example #2
def main():
    transform = ST.Compose(
        [
            ST.ToNumpyForVal(),
            ST.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    global args
    args = parser.parse_args()
    model = CSRNet()
    model = model.to("cuda")
    # checkpoint = flow.load('checkpoint/Shanghai_BestModelA/shanghaiA_bestmodel')
    checkpoint = flow.load(args.modelPath)
    model.load_state_dict(checkpoint)
    img = transform(Image.open(args.picPath).convert("RGB"))
    img = flow.Tensor(img)
    img = img.to("cuda")
    with flow.no_grad():
        output = model(img.unsqueeze(0))
    print("Predicted Count : ", int(output.to("cpu").sum().numpy()))
    # move the density map to host memory before converting to numpy
    temp = output.to("cpu").view(output.shape[2], output.shape[3])
    temp = temp.numpy()
    plt.title("Predicted Count")
    plt.imshow(temp, cmap=c.jet)
    plt.show()
    temp = h5py.File(args.picDensity, "r")
    temp_1 = np.asarray(temp["density"])
    plt.title("Original Count")
    plt.imshow(temp_1, cmap=c.jet)
    print("Original Count : ", int(np.sum(temp_1)) + 1)
    plt.show()
    print("Original Image")
    plt.title("Original Image")
    plt.imshow(plt.imread(args.picPath))
    plt.show()
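
ST.Normalize with the ImageNet statistics appears in every pipeline here. Assuming the usual per-channel convention, it computes (x - mean) / std for each channel; a self-contained sketch:

import numpy as np

def normalize(img, mean, std):
    # img: float array of shape (C, H, W) with values already in [0, 1]
    mean = np.asarray(mean, dtype=np.float32).reshape(-1, 1, 1)
    std = np.asarray(std, dtype=np.float32).reshape(-1, 1, 1)
    return (img - mean) / std

img = np.random.rand(3, 8, 8).astype(np.float32)
out = normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
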
Example #3
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    use_gpu = torch.cuda.is_available()

    sys.stdout = Logger(osp.join(args.resume, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                             batch_size=1,
                             shuffle=False,
                             num_workers=0,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(VideoDataset(
        dataset.gallery,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                               batch_size=1,
                               shuffle=False,
                               num_workers=0,
                               pin_memory=pin_memory,
                               drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    for epoch in args.test_epochs:
        model_path = osp.join(args.resume,
                              'checkpoint_ep' + str(epoch) + '.pth.tar')
        print("Loading checkpoint from '{}'".format(model_path))
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        if use_gpu: model = model.cuda()

        print("Evaluate")
        with torch.no_grad():
            test(model, queryloader, galleryloader, use_gpu)
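
Several examples redirect sys.stdout into a Logger so console output also lands in log_test.txt / log_train.txt. The class is not shown; a minimal sketch, assuming it simply tees writes to both destinations:

import sys

class Logger(object):
    def __init__(self, fpath, mode='w'):
        self.console = sys.stdout
        self.file = open(fpath, mode)

    def write(self, msg):
        self.console.write(msg)
        self.file.write(msg)

    def flush(self):
        self.console.flush()
        self.file.flush()
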
Example #4
def getDataSets(dataset):
    spatial_transform_test = ST.Compose([
                ST.Scale((args.height, args.width), interpolation=3),
                ST.ToNumpy(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    temporal_transform_test = TT.TemporalBeginCrop()
    queryset = VideoDataset(dataset.query, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test)

    galleryset = VideoDataset(dataset.gallery, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test)
    return queryset, galleryset
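
TT.TemporalBeginCrop serves as the deterministic test-time counterpart to the random training crop. A sketch of the usual behavior (keep the first size frame indices and loop-pad shorter clips); the default size here is an assumption:

class TemporalBeginCrop(object):
    def __init__(self, size=4):
        self.size = size

    def __call__(self, frame_indices):
        out = list(frame_indices[:self.size])
        n = len(out)
        while n and len(out) < self.size:  # loop-pad clips shorter than `size`
            out.append(out[len(out) % n])
        return out
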
Example #5
def getDataSets(dataset):
    spatial_transform_train = ST.Compose([
                ST.Scale((args.height, args.width), interpolation=3),
                ST.RandomHorizontalFlip(),
                ST.ToNumpy(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len, stride=args.sample_stride)
    spatial_transform_test = ST.Compose([
                ST.Scale((args.height, args.width), interpolation=3),
                ST.ToNumpy(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    temporal_transform_test = TT.TemporalBeginCrop()
    if args.dataset != 'mars':
        trainset = VideoDataset(dataset.train_dense, spatial_transform=spatial_transform_train, temporal_transform=temporal_transform_train)
    else:
        trainset = VideoDataset(dataset.train, spatial_transform=spatial_transform_train, temporal_transform=temporal_transform_train)
    queryset = VideoDataset(dataset.query, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test)

    galleryset = VideoDataset(dataset.gallery, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test)
    return trainset, queryset, galleryset
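
TT.TemporalRandomCrop(size, stride) does the training-time sampling. A sketch under the common convention (a random strided window of size frames, padded when the clip is too short); the exact stride semantics in this repo are an assumption:

import random

class TemporalRandomCrop(object):
    def __init__(self, size, stride=1):
        self.size = size
        self.stride = stride

    def __call__(self, frame_indices):
        span = (self.size - 1) * self.stride + 1
        if len(frame_indices) >= span:
            start = random.randint(0, len(frame_indices) - span)
            return list(frame_indices[start:start + span:self.stride])
        out = list(frame_indices[::self.stride])
        while out and len(out) < self.size:  # repeat the last frame to pad
            out.append(out[-1])
        return out
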
Example #6
def train(train_list, model, criterion, optimizer, epoch):
    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    train_loader = listDataset(
        train_list,
        shuffle=True,
        transform=ST.Compose([
            ST.ToNumpy(),
            ST.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224,
                                                          0.225]),
        ]),
        train=True,
        seen=model.seen,
        batch_size=args.batch_size,
        num_workers=args.workers,
    )
    model.train()
    end = time.time()
    for i, (img, target) in enumerate(train_loader):
        data_time.update(time.time() - end)
        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        # keep the model output as-is: re-wrapping it in a new flow.Tensor
        # would detach it from the autograd graph and block backprop
        output = model(img)
        target = flow.Tensor(target, device="cuda").unsqueeze(0)
        loss = criterion(output, target)
        losses.update(loss.numpy(), img.size(0))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            print("Epoch: [{0}][{1}/{2}]\t"
                  "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                  "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                  "Loss {loss.val:.4f} ({loss.avg:.4f})\t".format(
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                  ))
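
AverageMeter backs the running statistics in the log line above. A sketch matching the widely copied PyTorch ImageNet-example implementation, which this code presumably follows:

class AverageMeter(object):
    """Track the latest value and a running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
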
Example #7
def main():
    transform = ST.Compose([
        ST.ToNumpyForVal(),
        ST.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    global args
    args = parser.parse_args()
    root = "./dataset/"
    # now generate the ShanghaiA's ground truth
    part_A_train = os.path.join(root, "part_A_final/train_data", "images")
    part_A_test = os.path.join(root, "part_A_final/test_data", "images")
    part_B_train = os.path.join(root, "part_B_final/train_data", "images")
    part_B_test = os.path.join(root, "part_B_final/test_data", "images")
    path_sets = []
    if args.picSrc == "part_A_test":
        path_sets = [part_A_test]
    elif args.picSrc == "part_B_test":
        path_sets = [part_B_test]
    img_paths = []
    for path in path_sets:
        for img_path in glob.glob(os.path.join(path, "*.jpg")):
            img_paths.append(img_path)
    model = CSRNet()
    model = model.to("cuda")
    checkpoint = flow.load(args.modelPath)
    model.load_state_dict(checkpoint)
    MAE = []
    for i in range(len(img_paths)):
        img = transform(Image.open(img_paths[i]).convert("RGB"))
        img = np.asarray(img).astype(np.float32)
        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        gt_file = h5py.File(
            img_paths[i].replace(".jpg",
                                 ".h5").replace("images", "ground_truth"), "r")
        groundtruth = np.asarray(gt_file["density"])
        with flow.no_grad():
            output = model(img.unsqueeze(0))
        mae = abs(output.sum().numpy() - np.sum(groundtruth))
        MAE.append(mae)
    avg_MAE = sum(MAE) / len(MAE)
    print("test result: MAE:{:2f}".format(avg_MAE))
Example #8
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu: use_gpu = False
    
    # set a learning rate 
    if args.lr_factor == -1:
        args.lr_factor = random()
    args.lr = args.lr_factor * 10**-args.lr_base
    #print(f"Choose learning rate {args.lr}")

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    #assert torch.distributed.is_available()
    #print("Initializing DDP by nccl-tcp({}) rank({}) world_size({})".format(args.init_method, args.rank, args.world_size))
    #dist.init_process_group(backend='nccl', init_method=args.init_method, rank=args.rank, world_size=args.world_size)
        
    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
                ST.Scale((args.height, args.width), interpolation=3),
                ST.RandomHorizontalFlip(),
                ST.ToTensor(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]
    spatial_transform_train = ST.Compose(spatial_transform_train)

    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len, stride=args.sample_stride)
    #temporal_transform_train = TT.TemporalRandomCropPick(size=args.seq_len, stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
                ST.Scale((args.height, args.width), interpolation=3),
                ST.ToTensor(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    # both branches built the same sampler, and any other dataset would have
    # left `sampler` unbound below, so build it unconditionally
    sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)
    trainloader = DataLoader(
        VideoDataset(dataset_train, spatial_transform=spatial_transform_train, temporal_transform=temporal_transform_train),
        sampler=sampler,
        batch_size=args.train_batch, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=True,
    )
    '''
    for batch_idx, (vids, pids, camids, img_paths) in enumerate(trainloader):
        print(batch_idx, pids, camids, img_paths)
        break
    return
    '''
    dataset_query = dataset.query
    dataset_gallery = dataset.gallery
    if args.dataset == 'lsvid':
        dataset_query = dataset.val_query
        dataset_gallery = dataset.val_gallery
        print('process lsvid dataset')
        
    queryloader = DataLoader(
        VideoDataset(dataset_query, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )

    galleryloader = DataLoader(
        VideoDataset(dataset_gallery, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )
    
    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch, use_gpu=use_gpu, num_classes=dataset.num_train_pids, loss={'xent', 'htri'})
    #print(model)
    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    criterion_xent = nn.CrossEntropyLoss() 
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    criterion_htri_c = TripletInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    #criterion_htri_c = TripletWeightedInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu, alpha=args.cam_alpha)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.stepsize, gamma=args.gamma)
    start_epoch = args.start_epoch

    if use_gpu:
        model = nn.DataParallel(model).cuda()
        #model = model.cuda()
        #model = nn.parallel.DistributedDataParallel(model)

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    
    for epoch in range(start_epoch, args.max_epoch):
        #print("Set sampler seed to {}".format(args.seed*epoch))
        #sampler.set_seed(args.seed*epoch)
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, criterion_htri_c, optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()
        
        if (epoch+1) >= args.start_eval and (epoch+1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best: 
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'rank1': rank1,
                'epoch': epoch,
            }, is_best, osp.join(args.save_dir, 'checkpoint_ep' + str(epoch+1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(elapsed, train_time))
Example #9
            try:
                clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
                assert clip.size(1) == self.sample_duration
            except (RuntimeError, AssertionError):
                print("stack failed or clip is not right size", flush=True)
                print(len(clip), flush=True)
                print([cl.size() for cl in clip], flush=True)
                print(id, flush=True)
            clips.append(clip)

        return {
            'id': id,
            'duration': duration,
            'timestamps': timestamps,
            'fps': fps,
            'clips': clips
        }

    def __len__(self):
        return self.len


if __name__ == '__main__':
    sp = spt.Compose([spt.CornerCrop(size=224), spt.ToTensor()])
    tp = tpt.Compose([tpt.TemporalRandomCrop(16), tpt.LoopPadding(16)])
    dset = ActivityNetCaptions_Train('/ssd1/dsets/activitynet_captions',
                                     spatial_transform=sp,
                                     temporal_transform=tp)
    print(dset[0][0].size())
    print(dset[0][1])
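
The stack/permute at the top of this snippet converts a list of frames into the layout 3D CNNs expect: T tensors of C x H x W become a single C x T x H x W clip. A shape check:

import torch

frames = [torch.zeros(3, 224, 224) for _ in range(16)]  # T frames, each C x H x W
clip = torch.stack(frames, 0).permute(1, 0, 2, 3)       # T x C x H x W -> C x T x H x W
assert clip.shape == (3, 16, 224, 224)
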
Example #10
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu: use_gpu = False
    
    # set a learning rate 
    #if args.lr_factor == -1:
    #    args.lr_factor = random()
    #args.lr = args.lr_factor * 10**-args.lr_base
    #print(f"Choose learning rate {args.lr}")

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    #assert torch.distributed.is_available()
    #print("Initializing DDP by nccl-tcp({}) rank({}) world_size({})".format(args.init_method, args.rank, args.world_size))
    #dist.init_process_group(backend='nccl', init_method=args.init_method, rank=args.rank, world_size=args.world_size)
        
    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
                ST.Scale((args.height, args.width), interpolation=3),
                ST.RandomHorizontalFlip(),
                ST.ToTensor(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]
    spatial_transform_train = ST.Compose(spatial_transform_train)

    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len, stride=args.sample_stride)
    #temporal_transform_train = TT.TemporalRandomCropPick(size=args.seq_len, stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
                ST.Scale((args.height, args.width), interpolation=3),
                ST.ToTensor(),
                ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    #sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)
    if args.dataset == 'lsvid':
        sampler = RandomIdentityCameraSampler(dataset_train, num_instances=args.num_instances, num_cam=15)
    elif args.dataset == 'mars':
        sampler = RandomIdentityCameraSampler(dataset_train, num_instances=args.num_instances, num_cam=6)
    else:
        # the camera-aware sampler needs the camera count; only lsvid and mars
        # are wired up here, so fail loudly instead of leaving `sampler` unbound
        raise ValueError('unsupported dataset for camera-aware sampling: {}'.format(args.dataset))
    trainloader = DataLoader(
        VideoDataset(dataset_train, spatial_transform=spatial_transform_train, temporal_transform=temporal_transform_train),
        sampler=sampler,
        batch_size=args.train_batch, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=True,
    )
    '''
    for batch_idx, (vids, pids, camids, img_paths) in enumerate(trainloader):
        print(batch_idx, pids, camids, img_paths)
        break
    return
    '''
    dataset_query = dataset.query
    dataset_gallery = dataset.gallery
    if args.dataset == 'lsvid':
        dataset_query = dataset.val_query
        dataset_gallery = dataset.val_gallery
        print('process lsvid dataset')
        
    queryloader = DataLoader(
        VideoDataset(dataset_query, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )

    galleryloader = DataLoader(
        VideoDataset(dataset_gallery, spatial_transform=spatial_transform_test, temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )
    
    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch, use_gpu=use_gpu, num_classes=dataset.num_train_pids, loss={'xent', 'htri'}, transformer_num_heads=args.transformer_num_heads, transformer_num_layers=args.transformer_num_layers, attention_flatness=True)
    #print(model)
    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    criterion_xent = nn.CrossEntropyLoss() 
    criterion_flat = FlatnessLoss(reduction='batchmean', use_gpu=use_gpu)
    criterion_htri_c = TripletInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    #criterion_htri_c = TripletWeightedInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu, alpha=args.cam_alpha)

    #optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    linear_scaled_lr = args.lr * args.train_batch * len(args.gpu_devices.split(',')) / 512.0
    args.lr = linear_scaled_lr
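
The snippet ends with the linear scaling rule: the base learning rate is scaled by the effective batch size (per-process batch times GPU count) relative to a reference batch of 512. A worked example with hypothetical values:

base_lr = 5e-4     # hypothetical args.lr
train_batch = 32   # hypothetical args.train_batch
n_gpu = 2          # len(args.gpu_devices.split(','))

linear_scaled_lr = base_lr * train_batch * n_gpu / 512.0
print(linear_scaled_lr)  # 6.25e-05
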
Example #11
    args = parse_args()
    print(args)

    # load vocabulary
    vocab = Vocabulary(token_level=args.token_level)
    vocpath = os.path.join(args.root_path, args.vocabpath)
    if not os.path.exists(vocpath):
        print("didn't find vocab in {}! aborting".format(vocpath))
        sys.exit(1)  # exit nonzero on failure
    vocab.load(vocpath)
    vocab_size = len(vocab)

    # transforms
    sp = spt.Compose([spt.CornerCrop(size=args.imsize), spt.ToTensor()])
    tp = tpt.Compose([tpt.TemporalRandomCrop(args.clip_len), tpt.LoopPadding(args.clip_len)])

    # dataloading
    collatefn = functools.partial(collater, args.max_seqlen)
    dset = ActivityNetCaptions(args.root_path, args.meta_path, args.mode, vocab, args.framepath, spatial_transform=sp, temporal_transform=tp, sample_duration=args.clip_len)
    dloader = DataLoader(dset, batch_size=args.batch_size, shuffle=True, num_workers=args.n_cpu, collate_fn=collatefn, drop_last=True)
    max_it = int(len(dset) / args.batch_size)

    # models
    video_encoder, params = generate_model(args)
    # rewrite part of average pooling
    if args.langmethod == 'Transformer':
        scale = 16
        inter_time = int(args.clip_len/scale)
        video_encoder.avgpool = nn.AdaptiveAvgPool3d((inter_time, 1, 1))
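
Swapping avgpool for nn.AdaptiveAvgPool3d((inter_time, 1, 1)) keeps inter_time temporal steps instead of collapsing the whole clip to one vector, which is what the Transformer decoder consumes. A shape sketch with illustrative sizes:

import torch
import torch.nn as nn

pool = nn.AdaptiveAvgPool3d((4, 1, 1))  # keep 4 temporal steps, pool space away
feats = torch.randn(2, 512, 8, 7, 7)    # N x C x T x H x W from the 3D CNN
out = pool(feats)                       # -> 2 x 512 x 4 x 1 x 1
seq = out.flatten(2).permute(0, 2, 1)   # -> N x T' x C, one feature per time step
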
Example #12
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()

    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop()

    transform_test_img = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pin_memory = True if use_gpu else False

    if args.dataset == 'dukevid':
        trainloader = DataLoader(
            VideoDataset(dataset.train_dense,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train_dense,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True,
        )
    else:
        trainloader = DataLoader(
            VideoDataset(dataset.train,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True,
        )

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                             batch_size=args.test_batch,
                             shuffle=False,
                             num_workers=0,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(VideoDataset(
        dataset.gallery,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                               batch_size=args.test_batch,
                               shuffle=False,
                               num_workers=0,
                               pin_memory=pin_memory,
                               drop_last=False)

    queryimgloader = DataLoader(ImageDataset(dataset.query_img,
                                             transform=transform_test_img),
                                batch_size=args.img_test_batch,
                                shuffle=False,
                                num_workers=args.workers,
                                pin_memory=pin_memory,
                                drop_last=False)

    galleryimgloader = DataLoader(ImageDataset(dataset.gallery_img,
                                               transform=transform_test_img),
                                  batch_size=args.img_test_batch,
                                  shuffle=False,
                                  num_workers=args.workers,
                                  pin_memory=pin_memory,
                                  drop_last=False)

    print("Initializing model: {} and {}".format(args.vid_arch, args.img_arch))
    vid_model = models.init_model(name=args.vid_arch)
    img_model = models.init_model(name=args.img_arch)
    classifier = models.init_model(name='classifier',
                                   num_classes=dataset.num_train_pids)
    print("Video model size: {:.5f}M".format(
        sum(p.numel() for p in vid_model.parameters()) / 1000000.0))
    print("Image model size: {:.5f}M".format(
        sum(p.numel() for p in img_model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    criterion_tkp_f = FeatureBasedTKP(bp_to_vid=args.bp_to_vid)
    criterion_tkp_d = SimilarityBasedTKP(distance='euclidean',
                                         bp_to_vid=args.bp_to_vid)
    criterion_i2v = HeterogeneousTripletLoss(margin=0.3, distance='euclidean')
    optimizer = torch.optim.Adam([{
        'params': vid_model.parameters(),
        'lr': args.lr
    }, {
        'params': img_model.parameters(),
        'lr': args.lr
    }, {
        'params': classifier.parameters(),
        'lr': args.lr
    }],
                                 weight_decay=args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)
    start_epoch = args.start_epoch

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        vid_model.load_state_dict(checkpoint['vid_model_state_dict'])
        img_model.load_state_dict(checkpoint['img_model_state_dict'])
        classifier.load_state_dict(checkpoint['classifier_state_dict'])
        start_epoch = checkpoint['epoch']

    if use_gpu:
        vid_model = vid_model.cuda()
        img_model = img_model.cuda()
        classifier = classifier.cuda()

    if args.evaluate:
        print("Evaluate only")
        with torch.no_grad():
            test(vid_model, img_model, queryloader, galleryloader,
                 queryimgloader, galleryimgloader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, vid_model, img_model, classifier, criterion,
              criterion_tkp_f, criterion_tkp_d, criterion_i2v, optimizer,
              trainloader, use_gpu)
        torch.cuda.empty_cache()
        train_time += round(time.time() - start_train_time)

        # step the LR scheduler after the optimizer has run for the epoch,
        # as recent PyTorch versions expect
        scheduler.step()

        if (epoch + 1) >= args.start_eval and args.eval_step > 0 and (
                epoch + 1) % args.eval_step == 0 or (epoch +
                                                     1) == args.max_epoch:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(vid_model, img_model, queryloader, galleryloader,
                             queryimgloader, galleryimgloader, use_gpu)
                torch.cuda.empty_cache()
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            vid_model_state_dict = vid_model.state_dict()
            img_model_state_dict = img_model.state_dict()
            classifier_state_dict = classifier.state_dict()

            save_checkpoint(
                {
                    'vid_model_state_dict': vid_model_state_dict,
                    'img_model_state_dict': img_model_state_dict,
                    'classifier_state_dict': classifier_state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
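
save_checkpoint(state, is_best, fpath) is called with the same signature in several examples. A sketch of the common pattern (save the state dict, then copy the best model aside); the best-model filename is an assumption:

import os
import os.path as osp
import shutil

import torch

def save_checkpoint(state, is_best, fpath='checkpoint.pth.tar'):
    if osp.dirname(fpath):
        os.makedirs(osp.dirname(fpath), exist_ok=True)
    torch.save(state, fpath)
    if is_best:
        shutil.copy(fpath, osp.join(osp.dirname(fpath), 'best_model.pth.tar'))
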
Example #13
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu: use_gpu = False

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    spatial_transform_train = ST.Compose(spatial_transform_train)

    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    trainloader = DataLoader(
        VideoDataset(dataset_train,
                     spatial_transform=spatial_transform_train,
                     temporal_transform=temporal_transform_train),
        sampler=RandomIdentitySampler(dataset_train,
                                      num_instances=args.num_instances),
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                             batch_size=args.test_batch,
                             shuffle=False,
                             num_workers=args.workers,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(VideoDataset(
        dataset.gallery,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                               batch_size=args.test_batch,
                               shuffle=False,
                               num_workers=args.workers,
                               pin_memory=pin_memory,
                               drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    print(model)

    criterion_xent = nn.CrossEntropyLoss()
    criterion_htri = TripletLoss(margin=args.margin,
                                 distance=args.distance,
                                 use_gpu=use_gpu)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)
    start_epoch = args.start_epoch

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):

        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, optimizer,
              trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        if (epoch + 1) >= args.start_eval and (
                epoch + 1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
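
TripletLoss(margin, distance='euclidean') is the hard-mining triplet loss standard in re-ID. A sketch of the usual batch-hard formulation it presumably implements: for each anchor, take the farthest positive and the closest negative, and push their gap past the margin.

import torch
import torch.nn as nn

class TripletLoss(nn.Module):
    def __init__(self, margin=0.3):
        super().__init__()
        self.ranking_loss = nn.MarginRankingLoss(margin=margin)

    def forward(self, inputs, targets):
        n = inputs.size(0)
        dist = torch.cdist(inputs, inputs)  # pairwise euclidean distances
        same = targets.expand(n, n).eq(targets.expand(n, n).t())
        dist_ap = torch.stack([dist[i][same[i]].max() for i in range(n)])   # hardest positive
        dist_an = torch.stack([dist[i][~same[i]].min() for i in range(n)])  # hardest negative
        y = torch.ones_like(dist_an)
        return self.ranking_loss(dist_an, dist_ap, y)  # want dist_an > dist_ap + margin
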
Example #14
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu: use_gpu = False

    sys.stdout = Logger(osp.join(args.save_dir, 'log_test1.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset)

    # Data augmentation

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                             batch_size=1,
                             shuffle=False,
                             num_workers=0,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(VideoDataset(
        dataset.gallery,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                               batch_size=1,
                               shuffle=False,
                               num_workers=0,
                               pin_memory=pin_memory,
                               drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    model.eval()
    with torch.no_grad():
        evaluation(model, args, queryloader, galleryloader, use_gpu)
Example #15
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    print(torch.cuda.device_count())
    use_gpu = torch.cuda.is_available()

    sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    transform_test_img = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                             batch_size=args.test_batch,
                             shuffle=False,
                             num_workers=0,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(VideoDataset(
        dataset.gallery,
        spatial_transform=spatial_transform_test,
        temporal_transform=temporal_transform_test),
                               batch_size=args.test_batch,
                               shuffle=False,
                               num_workers=0,
                               pin_memory=pin_memory,
                               drop_last=False)

    queryimgloader = DataLoader(ImageDataset(dataset.query_img,
                                             transform=transform_test_img),
                                batch_size=args.img_test_batch,
                                shuffle=False,
                                num_workers=args.workers,
                                pin_memory=pin_memory,
                                drop_last=False)

    galleryimgloader = DataLoader(ImageDataset(dataset.gallery_img,
                                               transform=transform_test_img),
                                  batch_size=args.img_test_batch,
                                  shuffle=False,
                                  num_workers=args.workers,
                                  pin_memory=pin_memory,
                                  drop_last=False)

    print("Initializing model: {} and {}".format(args.vid_arch, args.img_arch))
    vid_model = models.init_model(name=args.vid_arch)
    img_model = models.init_model(name=args.img_arch)
    print("Video model size: {:.5f}M".format(
        sum(p.numel() for p in vid_model.parameters()) / 1000000.0))
    print("Image model size: {:.5f}M".format(
        sum(p.numel() for p in img_model.parameters()) / 1000000.0))

    print("Loading checkpoint from '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    vid_model.load_state_dict(checkpoint['vid_model_state_dict'])
    img_model.load_state_dict(checkpoint['img_model_state_dict'])

    if use_gpu:
        vid_model = vid_model.cuda()
        img_model = img_model.cuda()

    print("Evaluate")
    with torch.no_grad():
        test(vid_model, img_model, queryloader, galleryloader, queryimgloader,
             galleryimgloader, use_gpu)
Example #16
def train_lstm(args):
    # gpus
    device = torch.device(
        'cuda' if args.cuda and torch.cuda.is_available() else 'cpu')

    # load vocabulary
    annfiles = [os.path.join(args.root_path, pth) for pth in args.annpaths]
    text_proc = build_vocab(annfiles, args.min_freq, args.max_seqlen)
    vocab_size = len(text_proc.vocab)

    # transforms
    sp = spt.Compose([spt.CornerCrop(size=args.imsize), spt.ToTensor()])
    tp = tpt.Compose([
        tpt.TemporalRandomCrop(args.clip_len),
        tpt.LoopPadding(args.clip_len)
    ])

    # dataloading
    train_dset = ActivityNetCaptions_Train(args.root_path,
                                           ann_path='train_fps.json',
                                           sample_duration=args.clip_len,
                                           spatial_transform=sp,
                                           temporal_transform=tp)
    trainloader = DataLoader(train_dset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.n_cpu,
                             drop_last=True,
                             timeout=100)
    max_train_it = int(len(train_dset) / args.batch_size)
    val_dset = ActivityNetCaptions_Val(
        args.root_path,
        ann_path=['val_1_fps.json', 'val_2_fps.json'],
        sample_duration=args.clip_len,
        spatial_transform=sp,
        temporal_transform=tp)
    valloader = DataLoader(val_dset,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.n_cpu,
                           drop_last=True,
                           timeout=100)
    #max_val_it = int(len(val_dset) / args.batch_size)
    max_val_it = 10

    # models
    video_encoder = generate_3dcnn(args)
    caption_gen = generate_rnn(vocab_size, args)
    models = [video_encoder, caption_gen]

    # initialize pretrained embeddings
    if args.emb_init is not None:
        begin = time.time()
        print("initializing embeddings from {}...".format(args.emb_init))
        lookup = get_pretrained_from_txt(args.emb_init)
        first = next(iter(lookup.values()))
        try:
            assert len(first) == args.embedding_size
        except AssertionError:
            print("embedding size not compatible with pretrained embeddings.")
            print(
                "specified size {}, pretrained model includes size {}".format(
                    args.embedding_size, len(first)))
            sys.exit(1)
        matrix = torch.randn_like(caption_gen.emb.weight)
        for char, vec in lookup.items():
            if char in text_proc.vocab.stoi:
                idx = text_proc.vocab.stoi[char]  # avoid shadowing the builtin `id`
                matrix[idx, :] = torch.tensor(vec)
        caption_gen.init_embedding(matrix)
        print("{} | successfully initialized".format(
            sec2str(time.time() - begin), args.emb_init))

    # move models to device
    video_encoder = video_encoder.to(device)
    caption_gen = caption_gen.to(device)
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1 and args.dataparallel:
        video_encoder = nn.DataParallel(video_encoder)
        caption_gen = nn.DataParallel(caption_gen)
    else:
        n_gpu = 1
    print("using {} gpus...".format(n_gpu))

    # loss function
    criterion = nn.CrossEntropyLoss(ignore_index=text_proc.vocab.stoi['<pad>'])

    # optimizer, scheduler
    params = list(video_encoder.parameters()) + list(caption_gen.parameters())
    optimizer = optim.SGD([{
        "params": video_encoder.parameters(),
        "lr": args.lr_cnn,
        "momentum": args.momentum_cnn
    }, {
        "params": caption_gen.parameters(),
        "lr": args.lr_rnn,
        "momentum": args.momentum_rnn
    }],
                          weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='max',
                                                     factor=0.9,
                                                     patience=args.patience,
                                                     verbose=True)

    # count parameters
    num_params = sum(count_parameters(model) for model in models)
    print("# of params in model : {}".format(num_params))

    # joint training loop
    print("start training")
    begin = time.time()
    for ep in range(args.max_epochs):

        # train for epoch
        video_encoder, caption_gen, optimizer = train_epoch(
            trainloader,
            video_encoder,
            caption_gen,
            optimizer,
            criterion,
            device,
            text_proc,
            max_it=max_train_it,
            opt=args)

        # save models
        enc_save_dir = os.path.join(args.model_save_path, "encoder")
        enc_filename = "ep{:04d}.pth".format(ep + 1)
        if not os.path.exists(enc_save_dir):
            os.makedirs(enc_save_dir)
        enc_save_path = os.path.join(enc_save_dir, enc_filename)

        dec_save_dir = os.path.join(args.model_save_path, "decoder")
        dec_filename = "ep{:04d}.pth".format(ep + 1)
        dec_save_path = os.path.join(dec_save_dir, dec_filename)
        if not os.path.exists(dec_save_dir):
            os.makedirs(dec_save_dir)

        if n_gpu > 1 and args.dataparallel:
            torch.save(video_encoder.module.state_dict(), enc_save_path)
            torch.save(caption_gen.module.state_dict(), dec_save_path)
        else:
            torch.save(video_encoder.state_dict(), enc_save_path)
            torch.save(caption_gen.state_dict(), dec_save_path)

        print("saved encoder model to {}".format(enc_save_path))
        print("saved decoder model to {}".format(dec_save_path))

        # evaluate
        print("begin evaluation for epoch {} ...".format(ep + 1))
        nll, ppl, metrics = validate(valloader,
                                     video_encoder,
                                     caption_gen,
                                     criterion,
                                     device,
                                     text_proc,
                                     max_it=max_val_it,
                                     opt=args)
        if metrics is not None:
            scheduler.step(metrics["METEOR"])

        print(
            "training time {}, epoch {:04d}/{:04d} done, validation loss: {:.06f}, perplexity: {:.03f}"
            .format(sec2str(time.time() - begin), ep + 1, args.max_epochs, nll,
                    ppl))

    print("end training")