def __init__(self, cfg: DictConfig):
        """Record gulp-directory paths from *cfg* and build the train/test transforms.

        The shared tail (stack -> tensor -> normalize -> time-extraction) is
        composed once and appended to both the train and test pipelines.
        """
        super().__init__()
        self.cfg = cfg
        self.train_gulp_dir = Path(cfg.data.train_gulp_dir)
        self.val_gulp_dir = Path(cfg.data.val_gulp_dir)
        self.test_gulp_dir = Path(cfg.data.test_gulp_dir)

        # RGB frames carry 3 channels; otherwise each segment frame contributes
        # a 2-channel (u, v) flow pair.
        if cfg.modality == "RGB":
            channel_count = 3
        else:
            channel_count = 2 * cfg.data.segment_length

        # BGR channel order is only meaningful for RGB input, and only when the
        # preprocessing config explicitly asks for it.
        use_bgr = cfg.modality == "RGB" and cfg.data.preprocessing.get("bgr", False)
        shared_tail = Compose([
            Stack(bgr=use_bgr),
            ToTorchFormatTensor(div=cfg.data.preprocessing.rescale),
            GroupNormalize(
                mean=list(cfg.data.preprocessing.mean),
                std=list(cfg.data.preprocessing.std),
            ),
            ExtractTimeFromChannel(channel_count),
        ])

        # Training: random multi-scale crop + horizontal flip (flow-aware).
        self.train_transform = Compose([
            GroupMultiScaleCrop(
                cfg.data.preprocessing.input_size,
                cfg.data.train_augmentation.multiscale_crop_scales,
            ),
            GroupRandomHorizontalFlip(is_flow=cfg.modality == "Flow"),
            shared_tail,
        ])
        # Testing: deterministic rescale + center crop.
        self.test_transform = Compose([
            GroupScale(cfg.data.test_augmentation.rescale_size),
            GroupCenterCrop(cfg.data.preprocessing.input_size),
            shared_tail,
        ])
示例#2
0
def main():
    """Train an action-recognition Model on UCF-101 or HMDB-51.

    Parses hyper-parameters from the module-level argparse ``parser``, builds
    train/validation CoviarDataSet loaders, and runs an Adam training loop,
    checkpointing whenever top-1 precision improves (or every SAVE_FREQ epochs).
    Depends on module globals: parser, best_prec1, SAVE_FREQ, train(),
    validate(), adjust_learning_rate(), save_checkpoint().
    """
    global args
    global best_prec1
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    # Dataset name fixes the classifier's output size.
    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    model = Model(num_class,
                  args.num_segments,
                  args.representation,
                  base_model=args.arch)
    print(model)

    # Training loader: model.get_augmentation() supplies the random train-time
    # group transforms; accumulate toggles residual/mv accumulation.
    train_loader = torch.utils.data.DataLoader(CoviarDataSet(
        args.data_root,
        args.data_name,
        video_list=args.train_list,
        num_segments=args.num_segments,
        representation=args.representation,
        transform=model.get_augmentation(),
        is_train=True,
        accumulate=(not args.no_accumulation),
    ),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    # Validation loader: deterministic rescale + center crop, no shuffling.
    val_loader = torch.utils.data.DataLoader(CoviarDataSet(
        args.data_root,
        args.data_name,
        video_list=args.test_list,
        num_segments=args.num_segments,
        representation=args.representation,
        transform=torchvision.transforms.Compose([
            GroupScale(int(model.scale_size)),
            GroupCenterCrop(model.crop_size),
        ]),
        is_train=False,
        accumulate=(not args.no_accumulation),
    ),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # Per-parameter group multipliers: biases get no weight decay; early
    # conv1/bn1/data_bn layers train 10x slower for mv/residual inputs; the
    # final fc trains at full rate; everything else at 1%. 'lr_mult' and
    # 'decay_mult' are presumably consumed by adjust_learning_rate -- TODO
    # confirm against its definition.
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0

        if ('module.base_model.conv1' in key or 'module.base_model.bn1' in key
                or 'data_bn'
                in key) and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01

        params += [{
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult
        }]

    optimizer = torch.optim.Adam(params,
                                 weight_decay=args.weight_decay,
                                 eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(optimizer, epoch, args.lr_steps,
                                      args.lr_decay)

        train(train_loader, model, criterion, optimizer, epoch, cur_lr)

        # Evaluate periodically and always on the final epoch.
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1 = validate(val_loader, model, criterion)

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
def main(conf, test_set, test_part=-1):
    """Evaluate a trained TSN checkpoint on one EPIC test set and save scores.

    Args:
        conf: configuration object (gulp dirs, modality, arch, weights, ...).
        test_set: name of the test split directory under the gulp tree.
        test_part: shard index; values > 0 evaluate only that part and suffix
            the output filename accordingly, -1 evaluates the whole set.

    Side effects:
        Sets ``conf.num_classes`` and writes an ``.npz`` file with per-video
        segment indices and score arrays.
    """
    gulp_path = os.path.join(conf.gulp_test_dir, conf.modality.lower(), 'test',
                             test_set)
    gulp_path = os.path.realpath(gulp_path)
    gulp_path = Path(gulp_path)

    # NOTE(review): fixed a file-handle leak -- the original passed an
    # unclosed open() result straight to pickle.load().
    with open(conf.classes_map, "rb") as f:
        classes_map = pickle.load(f)
    conf.num_classes = count_num_classes(classes_map)

    net = TSN(conf.num_classes,
              1,
              conf.modality,
              base_model=conf.arch,
              consensus_type=conf.crop_fusion_type,
              dropout=conf.dropout)

    checkpoint = torch.load(conf.weights)
    print("Model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Checkpoint keys are prefixed by the DataParallel wrapper ('module.');
    # strip the first dotted component so they match the bare model.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    # 1 crop: rescale + center crop; 10 crops: standard corner/center
    # oversampling with flips.
    if conf.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif conf.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                conf.test_crops))

    class_type = 'verb+noun' if conf.class_type == 'action' else conf.class_type
    if conf.modality == 'Flow':
        dataset = EpicVideoFlowDataset(gulp_path=gulp_path,
                                       class_type=class_type)
    else:
        dataset = EpicVideoDataset(gulp_path=gulp_path, class_type=class_type)

    # Flow snippets stack 5 (u, v) frame pairs; RGB uses single frames.
    data_loader = torch.utils.data.DataLoader(EpicTSNTestDataset(
        dataset,
        classes_map,
        num_segments=conf.test_segments,
        new_length=1 if conf.modality == "RGB" else 5,
        modality=conf.modality,
        transform=torchvision.transforms.Compose([
            cropping,
            Stack(roll=conf.arch == 'BNInception'),
            ToTorchFormatTensor(div=conf.arch != 'BNInception'),
            GroupNormalize(net.input_mean, net.input_std),
        ]),
        part=test_part),
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=conf.workers * 2,
                                              pin_memory=True)

    net = torch.nn.DataParallel(net, device_ids=conf.gpus).cuda()
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    proc_start_time = time.time()
    for i, (keys, input_) in enumerate(data_loader):
        rst = eval_video(conf, (i, keys, input_), net)
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num,
            float(cnt_time) / (i + 1)))

    video_index = [x[0] for x in output]
    scores = [x[1] for x in output]

    save_scores = './{}/tsn_{}_{}_testset_{}_{}_lr_{}_model_{:03d}.npz'.format(
        conf.checkpoint, conf.class_type, conf.modality.lower(), test_set,
        conf.arch, conf.lr, checkpoint['epoch'])
    if test_part > 0:
        save_scores = save_scores.replace('.npz',
                                          '_part-{}.npz'.format(test_part))
    np.savez(save_scores, segment_indices=video_index, scores=scores)
示例#4
0
def main():
    """Train a CoViAR-style model (with configurable mv stack size).

    Parses hyper-parameters from the module-level argparse ``parser``, builds
    train/validation CoviarDataSet loaders, and runs an Adam training loop,
    checkpointing whenever top-1 precision improves (or every SAVE_FREQ
    epochs). Depends on module globals: parser, best_prec1, SAVE_FREQ,
    train(), validate(), adjust_learning_rate(), save_checkpoint().
    """
    global args
    global best_prec1
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    # Dataset name fixes the classifier's output size.
    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    # num_class: total number of classes
    # num_segments: number of TSN segments, default=3
    # representation: iframe, mv, residual
    # base_model: base architecture

    model = Model(num_class,
                  args.num_segments,
                  args.representation,
                  base_model=args.arch,
                  mv_stack_size=args.mv_stack_size)
    print(model)

    # dataset (Dataset) – dataset from which to load the data.
    # batch_size – how many samples per batch to load (default: 1).
    # shuffle – set to True to have the data reshuffled at every epoch.
    # num_workers – how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0)
    # pin_memory – If True, the data loader will copy tensors into CUDA pinned memory before returning them.

    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=model.get_augmentation(),
            # get_augmentation() =
            # GroupMultiScaleCrop + GroupRandomHorizontalFlip
            # GroupMultiScaleCrop contains stack mv

            # seems np.stack in resize_mv() called in GroupMultiScaleCrop
            # has the same effects as Stack() in TSN

            # -----------------------
            # TSN:
            # transform=torchvision.transforms.Compose([
            #     train_augmentation,                       # train_augmentation = model.get_augmentation(), same
            #     Stack(roll=args.arch == 'BNInception'),   # this line seems important
            #     ToTorchFormatTensor(div=args.arch != 'BNInception'),
            #     normalize, # used for RGBDiff
            # ])),
            # ----------------------
            is_train=True,
            accumulate=(not args.no_accumulation),
            mv_stack_size=args.mv_stack_size),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    # Validation loader: deterministic rescale + center crop, no shuffling.
    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=torchvision.transforms.
            Compose([  # seems important to stacking
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(
                    model.crop_size
                ),  # here they both use model.crop_size (instead of TSN's net.input_size in test_model.py)
            ]),  # this function contains stack

            # seems np.stack in resize_mv() called in GroupCenterCrop
            # has the same effects as Stack() in TSN

            # -----------------------
            # TSN:
            # transform=torchvision.transforms.Compose([
            #     GroupScale(int(scale_size)),
            #     GroupCenterCrop(crop_size),
            #     Stack(roll=args.arch == 'BNInception'),       # this line seems important
            #     ToTorchFormatTensor(div=args.arch != 'BNInception'),
            #     normalize,
            # ])),
            # -----------------------
            is_train=False,
            accumulate=(not args.no_accumulation),
            mv_stack_size=args.mv_stack_size),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # parallel gpu setting
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # Per-parameter group multipliers: biases get no weight decay; early
    # conv1/bn1/data_bn layers train 10x slower for mv/residual inputs; the
    # final fc trains at full rate; everything else at 1%. 'lr_mult' and
    # 'decay_mult' are presumably consumed by adjust_learning_rate -- TODO
    # confirm against its definition.
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0

        if ('module.base_model.conv1' in key or 'module.base_model.bn1' in key
                or 'data_bn'
                in key) and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01

        params += [{
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult
        }]

    optimizer = torch.optim.Adam(params,
                                 weight_decay=args.weight_decay,
                                 eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(optimizer, epoch, args.lr_steps,
                                      args.lr_decay)

        train(train_loader, model, criterion, optimizer, epoch, cur_lr)

        # Evaluate periodically and always on the final epoch.
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1 = validate(val_loader, model, criterion)

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
示例#5
0
def main():
    """Train a CoViAR-style model on ucf101 / hmdb51 / a custom 2-class set.

    Parses hyper-parameters from the module-level argparse ``parser``, builds
    train/validation CoviarDataSet loaders, runs an Adam training loop, dumps
    loss/accuracy history to ``train_history/*.npz``, and checkpoints the best
    model. Depends on module globals: parser, best_prec1, SAVE_FREQ,
    worker_init_fn, train(), validate(), adjust_learning_rate(),
    save_checkpoint(), and the history lists train_loss/train_prec/train_lr/
    valid_loss/valid_prec (presumably appended to by train()/validate() --
    TODO confirm they are defined at module level).
    """
    global args
    global best_prec1
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    # Dataset name fixes the classifier's output size.
    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'mine':
        num_class = 2
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    model = Model(num_class,
                  args.num_segments,
                  args.representation,
                  base_model=args.arch)
    print(model)

    # Clip-level transforms for 3D-ResNet backbones. Currently unused: the
    # loaders below pass the group transforms instead (see the commented-out
    # alternatives at the transform= arguments).
    if 'resnet3D' in args.arch:
        train_crop_min_ratio = 0.75
        train_crop_min_scale = 0.25
        mean = [0.4345, 0.4051, 0.3775]
        std = [0.2768, 0.2713, 0.2737]
        value_scale = 1

        train_transform = Compose([
            RandomResizedCrop(
                model.crop_size, (train_crop_min_scale, 1.0),
                (train_crop_min_ratio, 1.0 / train_crop_min_ratio)),
            RandomHorizontalFlip(),
            ToTensor(),
            ScaleValue(value_scale),
            Normalize(mean, std)
        ])
        # NOTE(review): renamed from the misspelled `test_trainsform`
        # (local-only; it is never used outside this block).
        test_transform = Compose([
            Resize(model.crop_size),
            CenterCrop(model.crop_size),
            ToTensor(),  # range [0, 255] -> [0.0,1.0]
            ScaleValue(1),
            Normalize(mean, std)
        ])

    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=model.get_augmentation(),  #train_transform,
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size)
            ]),  #test_transform,
            # NOTE(review): was is_train=True; validation should sample
            # deterministically, matching every other trainer in this file.
            is_train=False,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # Per-parameter group multipliers: biases get no weight decay; early
    # conv1/bn1/data_bn layers train 10x slower for mv/residual inputs; the
    # final fc trains at full rate; everything else at 1%.
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0

        if ('module.base_model.conv1' in key or 'module.base_model.bn1' in key
                or 'data_bn'
                in key) and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01

        params += [{
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult
        }]

    #optimizer = torch.optim.SGD(params, weight_decay=0.001, momentum=0.9, nesterov=False)
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10)
    optimizer = torch.optim.Adam(params,
                                 weight_decay=args.weight_decay,
                                 eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(optimizer, epoch, args.lr_steps,
                                      args.lr_decay)
        #cur_lr = get_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, cur_lr)
        #prec1, prev_val_loss = validate(val_loader, model, criterion)
        #scheduler.step(prev_val_loss)

        # Evaluate periodically and always on the final epoch.
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, _ = validate(val_loader, model, criterion)

            # Record training history for later plotting.
            np.savez("train_history/train_history.npz",
                     loss=np.array(train_loss),
                     top1=np.array(train_prec),
                     lr=np.array(train_lr))
            np.savez("train_history/valid_history.npz",
                     loss=np.array(valid_loss),
                     top1=np.array(valid_prec))

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
示例#6
0
def main():
    """Run multi-crop test-time evaluation of a trained CoViAR model.

    Loads the checkpoint named by args.weights, evaluates every video in
    args.test_list with args.test_crops x args.test_segments crops, prints
    overall accuracy, and optionally saves reordered per-video scores.
    Depends on module globals: num_class, args, and the imported transforms.
    """
    net = Model(num_class,
                args.test_segments,
                args.representation,
                base_model=args.arch)

    checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Strip the DataParallel 'module.' prefix so keys match the bare model.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    # 1 crop: rescale + center crop; 10 crops: corner/center oversampling
    # (mv-aware, since motion vectors must be flipped consistently).
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size,
                            net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(CoviarDataSet(
        args.data_root,
        args.data_name,
        video_list=args.test_list,
        num_segments=args.test_segments,
        representation=args.representation,
        transform=cropping,
        is_train=False,
        accumulate=(not args.no_accumulation),
    ),
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    # NOTE(review): device list length is tied to args.workers here, not to
    # the number of available GPUs -- looks suspicious; verify intent.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    data_gen = enumerate(data_loader)

    total_num = len(data_loader.dataset)
    output = []

    # Forward one video's crops/segments and average the scores.
    # NOTE(review): Variable(volatile=True) is the pre-0.4 PyTorch inference
    # idiom; on modern torch use `with torch.no_grad():` instead.
    def forward_video(data):
        input_var = torch.autograd.Variable(data, volatile=True)
        scores = net(input_var)
        scores = scores.view((-1, args.test_segments * args.test_crops) +
                             scores.size()[1:])
        scores = torch.mean(scores, dim=1)
        return scores.data.cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in data_gen:
        video_scores = forward_video(data)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num,
                float(cnt_time) / (i + 1)))

    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels))) /
        len(video_pred) * 100.0, len(video_pred)))

    if args.save_scores is not None:

        # Reorder the outputs into sorted-video-name order before saving.
        name_list = [x.strip().split()[0] for x in open(args.test_list)]
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i in range(len(output)):
            idx = order_dict[name_list[i]]
            reorder_output[idx] = output[i]
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name_list[i]

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
示例#7
0
    # Move to GPU if available and set to evaluation
    model.eval()
    model.to(device)

    # Define the transform
    batch_size = 1
    snippet_length = 1  # Number of frames composing the snippet, 1 for RGB, 5 for optical flow
    snippet_channels = 3  # Number of channels in a frame, 3 for RGB, 2 for optical flow
    height, width = 224, 224
    # NOTE(review): batch_size / snippet_length / snippet_channels / height /
    # width are defined but not used anywhere in this visible span.

    crop_count = 10

    if crop_count == 1:
        cropping = Compose([
            GroupScale(model.scale_size),
            GroupCenterCrop(model.input_size),
        ])
    elif crop_count == 10:
        cropping = GroupOverSample(model.input_size, model.scale_size)
    else:
        raise ValueError("Only 1 and 10 crop_count are supported while we got {}".format(crop_count))

    # NOTE(review): `base_model == base_model` is always True and
    # `base_model != base_model` is always False, so roll is forced on and
    # div forced off. This looks like a mangled copy of the usual
    # `arch == 'BNInception'` checks (compare the other snippets in this
    # file) -- TODO confirm the intended comparison.
    transform = Compose([
        cropping,
        Stack(roll=base_model == base_model),
        ToTorchFormatTensor(div=base_model != base_model),
        GroupNormalize(model.input_mean, model.input_std),
    ])

    pred_verb_indices = []
    pred_noun_indices = []
示例#8
0
def main():
    """Evaluate a flow-estimating CoViAR model (optional attention) on a test set.

    Loads the checkpoint named by args.weights, forwards mv + residual inputs
    through the network (which also generates flow), averages scores over
    crops x segments, prints accuracy, and optionally saves reordered scores.
    Depends on module globals: num_class, args, and the imported transforms.
    """
    # define the model
    net = Model(num_class,
                args.test_segments,
                args.representation,
                base_model=args.arch,
                new_length=args.new_length,
                use_databn=args.use_databn,
                gen_flow_or_delta=args.gen_flow_or_delta,
                gen_flow_ds_factor=args.gen_flow_ds_factor,
                arch_estimator=args.arch_estimator,
                att=args.att)

    # load the trained model (map to CPU so it loads regardless of the
    # device it was saved from)
    checkpoint = torch.load(args.weights,
                            map_location=lambda storage, loc: storage)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Strip the DataParallel 'module.' prefix so keys match the bare model.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict, strict=False)

    # setup the data loader: 1 crop = rescale + center crop, 10 crops =
    # corner/center oversampling
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.crop_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(CoviarDataSet(
        args.data_root,
        args.flow_root,
        args.data_name,
        video_list=args.test_list,
        num_segments=args.test_segments,
        representation=args.representation,
        new_length=args.new_length,
        flow_ds_factor=args.flow_ds_factor,
        upsample_interp=args.upsample_interp,
        transform=cropping,
        is_train=False,
        accumulate=(not args.no_accumulation),
        gop=args.gop,
        flow_folder=args.data_flow,
        viz=args.viz),
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    # deploy model on gpu
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net.cuda(devices[0])
    #net.base_model.cuda(devices[-1])
    net = torch.nn.DataParallel(net, device_ids=devices)

    # switch to inference model and start to iterate over the test set
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    # Forward one video's crops/segments and average the scores. With att=1
    # the network additionally returns an attention map over the flow.
    # NOTE(review): Variable(volatile=True) is the pre-0.4 PyTorch inference
    # idiom; on modern torch wrap the loop in `with torch.no_grad():`.
    def forward_video(input_mv, input_residual, att=0):
        input_mv_var = torch.autograd.Variable(input_mv, volatile=True)
        input_residual_var = torch.autograd.Variable(input_residual,
                                                     volatile=True)
        if att == 0:
            scores, gen_flow = net(input_mv_var, input_residual_var)
        if att == 1:
            scores, gen_flow, att_flow = net(input_mv_var, input_residual_var)
        scores = scores.view((-1, args.test_segments * args.test_crops) +
                             scores.size()[1:])
        scores = torch.mean(scores, dim=1)
        if att == 0:
            return scores.data.cpu().numpy().copy(), gen_flow
        if att == 1:
            return scores.data.cpu().numpy().copy(), gen_flow, att_flow

    proc_start_time = time.time()

    # iterate over the whole test set
    for i, (input_flow, input_mv, input_residual,
            label) in enumerate(data_loader):
        # NOTE(review): fixed a SyntaxError -- `async` became a reserved
        # keyword in Python 3.7; the kwarg was renamed to `non_blocking`.
        input_mv = input_mv.cuda(args.gpus[-1], non_blocking=True)
        input_residual = input_residual.cuda(args.gpus[0], non_blocking=True)
        input_flow = input_flow.cuda(args.gpus[-1], non_blocking=True)

        # input_flow shape: [batch_size, num_crops*num_segments, 2, 224, 224]
        if args.att == 0:
            video_scores, gen_flow = forward_video(input_mv, input_residual)
        if args.att == 1:
            video_scores, gen_flow, att_flow = forward_video(
                input_mv, input_residual, args.att)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num,
                float(cnt_time) / (i + 1)))

    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels))) /
        len(video_pred) * 100.0, len(video_pred)))

    if args.save_scores is not None:

        # Reorder the outputs into sorted-video-name order before saving.
        name_list = [x.strip().split()[0] for x in open(args.test_list)]
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i, out in enumerate(output):
            idx = order_dict[name_list[i]]
            reorder_output[idx] = out
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name_list[i]

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
示例#9
0
def main():
    """Evaluate a trained model on the test split.

    Builds the network, restores weights from ``args.weights``, runs every
    test video through the model with 1 or 10 crops per segment, prints the
    overall top-1 accuracy, and optionally saves per-video scores to
    ``args.save_scores``.
    """
    # NOTE(review): `num_class` is not defined in this function and is not a
    # declared global here -- confirm it is set at module level (e.g. from
    # args.data_name) before main() is called.
    net = Model(num_class,
                args.test_segments,
                args.representation,
                base_model=args.arch,
                mv_stack_size=args.mv_stack_size)

    # Checkpoints were written from a DataParallel-wrapped model, so every
    # state-dict key carries a leading "module." -- strip it before loading.
    checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    # --test-crops selects how many crops per segment:
    #   1  = a single center crop,
    #   10 = 5 crops for both horizontal flips (TSN-style oversampling).
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size,
                            net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.test_segments,
            representation=args.representation,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
            mv_stack_size=args.mv_stack_size),
        batch_size=1,
        # Keep test-list order so the save_scores re-ordering below is valid.
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # NOTE(review): devices are indexed by args.workers, which only works
    # when workers <= number of available GPU ids -- confirm intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    data_gen = enumerate(data_loader)

    total_num = len(data_loader.dataset)
    output = []

    def forward_video(data):
        """Score one video: forward pass, then average over segments*crops.

        `data` is [1, num_segments * num_crops, C, H, W]; returns a numpy
        array of per-class scores for the whole video.
        """
        # FIX: `volatile=True` Variables were removed in PyTorch 0.4;
        # torch.no_grad() is the supported way to disable autograd
        # bookkeeping during inference.
        with torch.no_grad():
            scores = net(data)
            # [segments * crops, num_class] -> [videos, segments * crops,
            # num_class] so the consensus mean can be taken per video.
            scores = scores.view((-1, args.test_segments * args.test_crops) +
                                 scores.size()[1:])
            scores = torch.mean(scores, dim=1)
            return scores.data.cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in data_gen:
        video_scores = forward_video(data)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num,
                float(cnt_time) / (i + 1)))

    # Top-1 prediction per video and overall accuracy.
    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels))) /
        len(video_pred) * 100.0, len(video_pred)))

    if args.save_scores is not None:
        # Re-order results into sorted video-name order before saving so
        # score files from different runs line up.
        name_list = [x.strip().split()[0] for x in open(args.test_list)]
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i in range(len(output)):
            idx = order_dict[name_list[i]]
            reorder_output[idx] = output[i]
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name_list[i]

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
示例#10
0
def main():
    """Evaluate a binary (Copy / Not Copy) model on the test split.

    Restores the checkpoint, scores every test pair, accumulates accuracy,
    writes a classification report to TensorBoard, and optionally pickles
    the raw scores and labels under ``args.save_scores``.
    """
    writter = SummaryWriter('./log/test', comment='')

    net = Model(2, args.num_segments, args.representation,
                base_model=args.arch)

    checkpoint = torch.load(args.weights)
    print("model epoch {} lowest loss {}".format(checkpoint['epoch'], checkpoint['loss_min']))
    # Checkpoints come from a DataParallel model -- strip the "module." prefix.
    base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint['state_dict'].items())}
    net.load_state_dict(base_dict)

    # 1 crop = center crop only; 10 crops = 5 crops x 2 horizontal flips.
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size, net.scale_size, is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError("Only 1 and 10 crops are supported, but got {}.".format(args.test_crops))

    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
        ),
        batch_size=1, shuffle=False,
        num_workers=args.workers * 2, pin_memory=True)

    devices = [torch.device("cuda:%d" % device) for device in args.gpus]
    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    total_num = len(data_loader.dataset)
    scores = []
    labels = []
    proc_start_time = time.time()
    correct_nums = 0

    for i, (input_pairs, label) in enumerate(data_loader):
        # FIX: the original used `with torch.no_grad:` (missing parentheses),
        # which fails at runtime -- the context manager must be instantiated.
        with torch.no_grad():
            input_pairs[0] = input_pairs[0].float().to(devices[0])
            input_pairs[1] = input_pairs[1].float().to(devices[0])
            label = label.float().to(devices[0])

            outputs, y = net(input_pairs)
            _, predicts = torch.max(y, 1)
            scores.append(y.detach().cpu().numpy())
            labels.append(label.detach().cpu().numpy())
            correct_nums += (predicts == label.clone().long()).sum()

            cnt_time = time.time() - proc_start_time
            if (i + 1) % 100 == 0:
                print('video {} done, total {}/{}, average {} sec/video'.format(i, i + 1,
                                                                                total_num,
                                                                                float(cnt_time) / (i + 1)))
    # FIX: `scores` is a list of [1, num_class] arrays; np.argmax(scores, 1)
    # would reduce over the singleton batch axis and return all zeros.
    # Concatenate to [N, num_class] first, then argmax over the class axis.
    predits = np.argmax(np.concatenate(scores, axis=0), axis=1)
    # FIX: np.long was deprecated and removed in NumPy 1.24 -- use np.int64.
    labels = np.around(np.concatenate(labels, axis=0)).astype(np.int64).ravel()

    acc = 100.0 * float(correct_nums) / len(data_loader.dataset)
    target_names = ['Copy', 'Not Copy']
    writter.add_text('Accuracy', '%.3f%%' % acc)
    # FIX: SummaryWriter.add_text requires a tag as its first argument.
    writter.add_text('Classification Report',
                     classification_report(labels, predits, target_names=target_names))
    print(('Validating Results: accuracy: {accuracy:.3f}%'.format(accuracy=acc)))

    if args.save_scores is not None:
        with open(args.save_scores + '_scores.pkl', 'wb') as fp:
            pickle.dump(scores, fp)
        with open(args.save_scores + '_labels.pkl', 'wb') as fp:
            pickle.dump(labels, fp)
示例#11
0
def main():
    """Entry point for training the classification + flow-generation model.

    Parses command-line arguments, builds the model (optionally warm-started
    from ``--weights``), sets up train/val loaders, creates three Adam
    optimizers (classifier, flow generator, discriminator), optionally
    resumes from a checkpoint, then runs the train/validate loop and saves
    checkpoints of the best model.
    """
    # loading input arguments for training
    global args
    global best_prec1
    global start_epoch
    start_epoch = 0
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    # Map the dataset name to its number of action classes.
    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'kinetics400':
        num_class = 400
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    # define the model architecture
    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch,
                  new_length=args.new_length,
                  use_databn=args.use_databn,
                  gen_flow_or_delta=args.gen_flow_or_delta,
                  gen_flow_ds_factor=args.gen_flow_ds_factor,
                  arch_estimator=args.arch_estimator,
                  arch_d=args.arch_d,
                  att=args.att)
    print(model)

    # Warm-start from a pre-trained checkpoint.  Keys are stripped of the
    # DataParallel "module." prefix; strict=False tolerates missing/extra
    # keys (e.g. a different head).
    if args.weights is not None:
        checkpoint = torch.load(args.weights, map_location=lambda storage, loc: storage)
        print("model epoch {} best prec@1: {}".format(checkpoint['epoch'], checkpoint['best_prec1']))
        base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
        model.load_state_dict(base_dict, strict=False)

    # define the data loader for reading training data
    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.flow_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            new_length=args.new_length,
            flow_ds_factor=args.flow_ds_factor,
            upsample_interp=args.upsample_interp,
            transform=model.get_augmentation(),
            is_train=True,
            accumulate=(not args.no_accumulation),
            gop=args.gop,
            flow_folder=args.data_flow,
            mv_minmaxnorm=args.mv_minmaxnorm,
            ),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    # define the data loader for reading val data (deterministic center crop)
    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.flow_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            new_length=args.new_length,
            flow_ds_factor=args.flow_ds_factor,
            upsample_interp=args.upsample_interp,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size),
                ]),
            is_train=False,
            accumulate=(not args.no_accumulation),
            gop=args.gop,
            flow_folder=args.data_flow,
            mv_minmaxnorm=args.mv_minmaxnorm,
            ),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda(args.gpus[0])
    cudnn.benchmark = True

    # Partition parameters into three groups -- classifier backbone, flow
    # generator, discriminator -- each with its own lr multiplier and no
    # weight decay on biases.
    params_dict = dict(model.named_parameters())
    params_cls = []
    params_gf = []
    params_d = []
    for key, value in params_dict.items():
        if 'base_model' in key:
            decay_mult = 0.0 if 'bias' in key else 1.0
            lr_mult = args.lr_cls_mult # for cls, just finetune. if '.fc.' in key: lr_mult = 1.0
            params_cls += [{'params': value, 'lr': args.lr, 'lr_mult': lr_mult, 'decay_mult': decay_mult}]
        if 'gen_flow_model' in key:
            decay_mult = 0.0 if 'bias' in key else 1.0
            lr_mult = args.lr_mse_mult
            params_gf += [{'params': value, 'lr': args.lr, 'lr_mult': lr_mult, 'decay_mult': decay_mult}]
        if 'discriminator' in key:
            decay_mult = 0.0 if 'bias' in key else 1.0
            lr_mult = args.lr_d_mult
            params_d += [{'params': value, 'lr': args.lr, 'lr_mult': lr_mult, 'decay_mult': decay_mult}]

    optimizer_cls = torch.optim.Adam(
        params_cls,
        weight_decay=args.weight_decay,
        eps=0.001)

    optimizer_gf = torch.optim.Adam(
        params_gf,
        weight_decay=args.weight_decay,
        eps=0.001)

    optimizer_d = torch.optim.Adam(
        params_d,
        weight_decay=args.weight_decay,
        eps=0.001)

    # Resume training from a previous checkpoint: restores model weights,
    # epoch, best precision and (when present) all three optimizer states,
    # moving optimizer tensors back onto the training GPU.
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
            start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            if 'optimizer_cls' in checkpoint.keys():
                optimizer_cls.load_state_dict(checkpoint['optimizer_cls'])
                optimizer_gf.load_state_dict(checkpoint['optimizer_gf'])
                optimizer_d.load_state_dict(checkpoint['optimizer_d'])
                def load_opt_update_cuda(optimizer, cuda_id):
                    # Optimizer state was loaded onto CPU (map_location);
                    # push every tensor back to the training GPU.
                    for state in optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda(cuda_id)
                load_opt_update_cuda(optimizer_cls, args.gpus[0])
                load_opt_update_cuda(optimizer_gf, args.gpus[0])
                load_opt_update_cuda(optimizer_d, args.gpus[0])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))

    # Define loss functions: cross-entropy for classification plus a
    # configurable reconstruction loss for the generated flow.
    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpus[0])
    if args.loss_mse == 'MSELoss':
        criterion_mse = torch.nn.MSELoss().cuda(args.gpus[0])
    elif args.loss_mse == 'SmoothL1Loss':
        criterion_mse = torch.nn.SmoothL1Loss().cuda(args.gpus[0])
    elif args.loss_mse == 'L1':
        criterion_mse = torch.nn.L1Loss().cuda(args.gpus[0])
    else:
        # FIX: an unrecognized --loss-mse previously left criterion_mse
        # undefined and crashed later with a NameError inside training.
        raise ValueError('Unknown reconstruction loss ' + args.loss_mse)

    # Main train / validate loop.
    # NOTE(review): best_prec1 is only assigned here on resume -- confirm it
    # is initialized at module level before the first validation.
    for epoch in range(start_epoch, args.epochs):
        # determine the learning rate for the current epoch
        cur_lr_cls = adjust_learning_rate(optimizer_cls, epoch, args.lr_steps, args.lr_decay) #, freeze=True, epoch_thre=args.epoch_thre)
        cur_lr_gf = adjust_learning_rate(optimizer_gf, epoch, args.lr_steps, args.lr_decay)
        cur_lr_d = adjust_learning_rate(optimizer_d, epoch, args.lr_steps, args.lr_decay)

        # perform training
        train(train_loader, model, criterion, criterion_mse, optimizer_cls,
            optimizer_gf, optimizer_d, epoch, cur_lr_cls, cur_lr_gf, cur_lr_d, args.lr_cls, args.lr_adv_g, args.lr_adv_d, args.lr_mse, args.att)

        # perform validation if needed
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1 = validate(val_loader, model, criterion, criterion_mse, args.lr_cls, args.lr_adv_g, args.lr_adv_d, args.lr_mse, args.att)
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer_cls': optimizer_cls.state_dict(),
                        'optimizer_gf': optimizer_gf.state_dict(),
                        'optimizer_d': optimizer_d.state_dict(),
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
示例#12
0
def main():
    """Evaluate a trained image classifier on the test list.

    Loads weights from ``args.weights``, scores every test image with 1 or
    10 crops, and prints the overall top-1 accuracy.  (This function may
    continue past the visible end of this chunk.)
    """
    # NOTE(review): `num_class` is not defined in this function -- confirm it
    # is set at module level before main() is called.
    net = Model(num_class, base_model=args.arch)

    checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Checkpoints come from a DataParallel model -- strip the "module." prefix.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    # --test-crops: 1 = single center crop; 10 = 5 crops x 2 horizontal flips.
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size,
                            net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(FoodDataSet(
        args.data_root,
        img_list=args.test_list,
        transform=cropping,
        is_train=False,
    ),
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    # NOTE(review): devices are indexed by args.workers, which only works
    # when workers <= number of GPU ids -- confirm intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    data_gen = enumerate(data_loader)

    total_num = len(data_loader.dataset)
    output = []

    def forward_img(data):
        """
        Args:
            data (Tensor): size [batch_size, c, h, w]

        Returns:
            scores (Tensor) : size [batch_size, num_class]

        """
        with torch.no_grad():
            # NOTE(review): Variable(volatile=True) is redundant inside
            # torch.no_grad() and the `volatile` kwarg was removed in
            # PyTorch 0.4 -- confirm against the pinned torch version.
            input_var = torch.autograd.Variable(data, volatile=True)
            scores = net(input_var)
            # [crops, num_class] -> [1, crops, num_class], then mean over crops.
            scores = scores.view((-1, args.test_crops) + scores.size()[1:])
            scores = torch.mean(scores, dim=1)
            return scores.data.cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in data_gen:
        # data = [1, c, h ,w], label = [1]
        img_scores = forward_img(data)
        output.append((img_scores[0], label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('image {} done, total {}/{}, average {} sec/image'.format(
                i, i + 1, total_num,
                float(cnt_time) / (i + 1)))

    # Top-1 prediction per image and overall accuracy.
    img_pred = [np.argmax(x[0]) for x in output]
    img_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(img_pred) == np.array(img_labels))) /
        len(img_pred) * 100.0, len(img_pred)))