def __init__(self, cfg: DictConfig):
    """Store the gulp directories and build the train/test transform pipelines.

    Args:
        cfg: Configuration object. This constructor reads ``cfg.modality``
            and, under ``cfg.data``, the gulp directories, ``segment_length``,
            ``preprocessing``, ``train_augmentation`` and
            ``test_augmentation`` entries.
    """
    super().__init__()
    self.train_gulp_dir = Path(cfg.data.train_gulp_dir)
    self.val_gulp_dir = Path(cfg.data.val_gulp_dir)
    self.test_gulp_dir = Path(cfg.data.test_gulp_dir)
    self.cfg = cfg

    data_cfg = self.cfg.data
    preprocessing = data_cfg.preprocessing
    is_rgb = self.cfg.modality == "RGB"

    # RGB uses 3 channels; any other modality uses 2 channels per frame of
    # the segment.
    if is_rgb:
        channel_count = 3
    else:
        channel_count = 2 * data_cfg.segment_length

    # Steps shared by the training and the test pipeline.
    shared_steps = [
        # The "bgr" flag is only honoured for RGB input.
        Stack(bgr=is_rgb and preprocessing.get("bgr", False)),
        ToTorchFormatTensor(div=preprocessing.rescale),
        GroupNormalize(
            mean=list(preprocessing.mean),
            std=list(preprocessing.std),
        ),
        ExtractTimeFromChannel(channel_count),
    ]
    common_transform = Compose(shared_steps)

    multiscale_crop = GroupMultiScaleCrop(
        preprocessing.input_size,
        data_cfg.train_augmentation.multiscale_crop_scales,
    )
    random_flip = GroupRandomHorizontalFlip(
        is_flow=self.cfg.modality == "Flow")
    self.train_transform = Compose(
        [multiscale_crop, random_flip, common_transform])

    rescale = GroupScale(data_cfg.test_augmentation.rescale_size)
    center_crop = GroupCenterCrop(preprocessing.input_size)
    self.test_transform = Compose([rescale, center_crop, common_transform])
def main():
    """Parse the CLI arguments, train ``Model`` and checkpoint it.

    Supports the ``ucf101`` and ``hmdb51`` datasets. Validation runs every
    ``args.eval_freq`` epochs and on the final epoch; a checkpoint is written
    after a validation pass when the precision improved or the epoch is a
    multiple of ``SAVE_FREQ``.

    Raises:
        ValueError: if ``args.data_name`` is not a known dataset.
    """
    global args
    global best_prec1

    args = parser.parse_args()
    print('Training arguments:')
    for name, setting in vars(args).items():
        print('\t{}: {}'.format(name, setting))

    class_counts = {'ucf101': 101, 'hmdb51': 51}
    if args.data_name not in class_counts:
        raise ValueError('Unknown dataset ' + args.data_name)
    num_class = class_counts[args.data_name]

    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch)
    print(model)

    train_set = CoviarDataSet(
        args.data_root,
        args.data_name,
        video_list=args.train_list,
        num_segments=args.num_segments,
        representation=args.representation,
        transform=model.get_augmentation(),
        is_train=True,
        accumulate=(not args.no_accumulation),
    )
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    val_transform = torchvision.transforms.Compose([
        GroupScale(int(model.scale_size)),
        GroupCenterCrop(model.crop_size),
    ])
    val_set = CoviarDataSet(
        args.data_root,
        args.data_name,
        video_list=args.test_list,
        num_segments=args.num_segments,
        representation=args.representation,
        transform=val_transform,
        is_train=False,
        accumulate=(not args.no_accumulation),
    )
    val_loader = torch.utils.data.DataLoader(
        val_set,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # One parameter group per tensor, each tagged with its own learning-rate
    # and weight-decay multipliers.
    input_layer_tags = ('module.base_model.conv1',
                        'module.base_model.bn1',
                        'data_bn')
    params = []
    for key, value in dict(model.named_parameters()).items():
        is_input_layer = any(tag in key for tag in input_layer_tags)
        if is_input_layer and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01
        params.append({
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': 0.0 if 'bias' in key else 1.0,
        })

    optimizer = torch.optim.Adam(
        params, weight_decay=args.weight_decay, eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    final_epoch = args.epochs - 1
    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(
            optimizer, epoch, args.lr_steps, args.lr_decay)
        train(train_loader, model, criterion, optimizer, epoch, cur_lr)

        # Skip validation unless it is due this epoch or this is the last one.
        if epoch % args.eval_freq != 0 and epoch != final_epoch:
            continue

        prec1 = validate(val_loader, model, criterion)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        if not (is_best or epoch % SAVE_FREQ == 0):
            continue

        snapshot = {
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }
        save_checkpoint(snapshot, is_best, filename='checkpoint.pth.tar')
def main(conf, test_set, test_part=-1):
    """Score one test set with a trained TSN and save the results as ``.npz``.

    Args:
        conf: Run configuration. Read here: ``gulp_test_dir``, ``modality``,
            ``classes_map``, ``arch``, ``crop_fusion_type``, ``dropout``,
            ``weights``, ``test_crops``, ``class_type``, ``test_segments``,
            ``workers``, ``gpus``, ``checkpoint`` and ``lr``.
            ``conf.num_classes`` is overwritten with the class count derived
            from the classes map.
        test_set: Name of the test-set sub-directory; also embedded in the
            output file name.
        test_part: Forwarded to ``EpicTSNTestDataset`` as ``part``. When it is
            greater than zero, ``_part-<n>`` is appended to the output name.

    Raises:
        ValueError: if ``conf.test_crops`` is neither 1 nor 10.
    """
    gulp_path = os.path.join(conf.gulp_test_dir, conf.modality.lower(),
                             'test', test_set)
    gulp_path = Path(os.path.realpath(gulp_path))

    # Close the handle deterministically instead of leaking it.
    # NOTE: pickle must only be used on trusted files.
    with open(conf.classes_map, "rb") as classes_file:
        classes_map = pickle.load(classes_file)
    conf.num_classes = count_num_classes(classes_map)

    net = TSN(conf.num_classes, 1, conf.modality,
              base_model=conf.arch,
              consensus_type=conf.crop_fusion_type,
              dropout=conf.dropout)

    checkpoint = torch.load(conf.weights)
    print("Model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Drop the first dotted component of every key before loading
    # (presumably the 'module.' prefix added by DataParallel -- TODO confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict)

    if conf.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif conf.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                conf.test_crops))

    class_type = 'verb+noun' if conf.class_type == 'action' else conf.class_type
    if conf.modality == 'Flow':
        dataset = EpicVideoFlowDataset(gulp_path=gulp_path,
                                       class_type=class_type)
    else:
        dataset = EpicVideoDataset(gulp_path=gulp_path, class_type=class_type)

    data_loader = torch.utils.data.DataLoader(
        EpicTSNTestDataset(
            dataset,
            classes_map,
            num_segments=conf.test_segments,
            new_length=1 if conf.modality == "RGB" else 5,
            modality=conf.modality,
            transform=torchvision.transforms.Compose([
                cropping,
                Stack(roll=conf.arch == 'BNInception'),
                ToTorchFormatTensor(div=conf.arch != 'BNInception'),
                GroupNormalize(net.input_mean, net.input_std),
            ]),
            part=test_part),
        batch_size=1, shuffle=False,
        num_workers=conf.workers * 2, pin_memory=True)

    net = torch.nn.DataParallel(net, device_ids=conf.gpus).cuda()
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    proc_start_time = time.time()
    for i, (keys, input_) in enumerate(data_loader):
        rst = eval_video(conf, (i, keys, input_), net)
        # The first element of the result is dropped; the rest is kept.
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num, float(cnt_time) / (i + 1)))

    video_index = [x[0] for x in output]
    scores = [x[1] for x in output]

    save_scores = './{}/tsn_{}_{}_testset_{}_{}_lr_{}_model_{:03d}.npz'.format(
        conf.checkpoint, conf.class_type, conf.modality.lower(), test_set,
        conf.arch, conf.lr, checkpoint['epoch'])
    if test_part > 0:
        save_scores = save_scores.replace('.npz',
                                          '_part-{}.npz'.format(test_part))
    np.savez(save_scores, segment_indices=video_index, scores=scores)
def main():
    """Parse the CLI arguments, train ``Model`` and checkpoint it.

    Same flow as a plain training run, with ``args.mv_stack_size`` forwarded
    to both the model and the datasets. Validation runs every
    ``args.eval_freq`` epochs and on the final epoch.

    Raises:
        ValueError: if ``args.data_name`` is not ``ucf101`` or ``hmdb51``.
    """
    global args
    global best_prec1

    args = parser.parse_args()
    print('Training arguments:')
    for name, setting in vars(args).items():
        print('\t{}: {}'.format(name, setting))

    known_datasets = {'ucf101': 101, 'hmdb51': 51}
    if args.data_name not in known_datasets:
        raise ValueError('Unknown dataset ' + args.data_name)
    num_class = known_datasets[args.data_name]

    # Positional arguments: class count, number of segments, representation.
    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch,
                  mv_stack_size=args.mv_stack_size)
    print(model)

    def build_dataset(video_list, transform, is_train):
        """Create a CoviarDataSet that differs only in list/transform/mode."""
        return CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=video_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=transform,
            is_train=is_train,
            accumulate=(not args.no_accumulation),
            mv_stack_size=args.mv_stack_size)

    # NOTE(review): the original author's notes say that, unlike TSN, no
    # Stack()/ToTorchFormatTensor()/normalize steps are appended to these
    # transforms because the group transforms already stack the frames --
    # confirm against the transform implementations.
    train_loader = torch.utils.data.DataLoader(
        build_dataset(args.train_list, model.get_augmentation(), True),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    val_transform = torchvision.transforms.Compose([
        GroupScale(int(model.scale_size)),
        GroupCenterCrop(model.crop_size),
    ])
    val_loader = torch.utils.data.DataLoader(
        build_dataset(args.test_list, val_transform, False),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # Multi-GPU wrapper.
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    def lr_multiplier(key):
        """Pick the learning-rate multiplier from the parameter name."""
        touches_input = ('module.base_model.conv1' in key
                         or 'module.base_model.bn1' in key
                         or 'data_bn' in key)
        if touches_input and args.representation in ['mv', 'residual']:
            return 0.1
        if '.fc.' in key:
            return 1.0
        return 0.01

    params = [
        {
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_multiplier(key),
            'decay_mult': 0.0 if 'bias' in key else 1.0,
        }
        for key, value in dict(model.named_parameters()).items()
    ]

    optimizer = torch.optim.Adam(
        params, weight_decay=args.weight_decay, eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(
            optimizer, epoch, args.lr_steps, args.lr_decay)
        train(train_loader, model, criterion, optimizer, epoch, cur_lr)

        validation_due = (epoch % args.eval_freq == 0
                          or epoch == args.epochs - 1)
        if not validation_due:
            continue

        prec1 = validate(val_loader, model, criterion)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        if is_best or epoch % SAVE_FREQ == 0:
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }
            save_checkpoint(state, is_best, filename='checkpoint.pth.tar')
def main():
    """Parse the CLI arguments, train ``Model`` and record the history.

    Supports the ``ucf101``, ``hmdb51`` and ``mine`` (2 classes) datasets.
    After every validation pass the module-level history lists
    (``train_loss``, ``train_prec``, ``train_lr``, ``valid_loss``,
    ``valid_prec``) are written to ``train_history/*.npz``.

    Raises:
        ValueError: if ``args.data_name`` is not a known dataset.
    """
    import os  # local import: only needed to create the history directory

    global args
    global best_prec1

    args = parser.parse_args()
    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'mine':
        num_class = 2
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch)
    print(model)

    if 'resnet3D' in args.arch:
        # NOTE(review): these two pipelines are built but not passed to the
        # datasets below, which use the Group* transforms instead. They are
        # kept so the alternative can be switched back on.
        train_crop_min_ratio = 0.75
        train_crop_min_scale = 0.25
        mean = [0.4345, 0.4051, 0.3775]
        std = [0.2768, 0.2713, 0.2737]
        value_scale = 1

        train_transform = Compose([
            RandomResizedCrop(
                model.crop_size, (train_crop_min_scale, 1.0),
                (train_crop_min_ratio, 1.0 / train_crop_min_ratio)),
            RandomHorizontalFlip(),
            ToTensor(),
            ScaleValue(value_scale),
            Normalize(mean, std)
        ])
        test_transform = Compose([
            Resize(model.crop_size),
            CenterCrop(model.crop_size),
            ToTensor(),  # range [0, 255] -> [0.0, 1.0]
            ScaleValue(1),
            Normalize(mean, std)
        ])

    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=model.get_augmentation(),  # alternative: train_transform
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True,
        worker_init_fn=worker_init_fn)

    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size)
            ]),  # alternative: test_transform
            # NOTE(review): the validation set is built with is_train=True;
            # left unchanged here -- confirm this is intentional.
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True,
        worker_init_fn=worker_init_fn)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # One parameter group per tensor, tagged with lr / weight-decay
    # multipliers.
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0

        if ('module.base_model.conv1' in key
                or 'module.base_model.bn1' in key
                or 'data_bn' in key) and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01

        params.append({
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult,
        })

    optimizer = torch.optim.Adam(
        params, weight_decay=args.weight_decay, eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    # np.savez does not create missing directories; without this the run
    # would crash at the first history dump, after a full training epoch.
    os.makedirs("train_history", exist_ok=True)

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(
            optimizer, epoch, args.lr_steps, args.lr_decay)
        train(train_loader, model, criterion, optimizer, epoch, cur_lr)

        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, _ = validate(val_loader, model, criterion)

            # Record the training history.
            np.savez("train_history/train_history.npz",
                     loss=np.array(train_loss),
                     top1=np.array(train_prec),
                     lr=np.array(train_lr))
            np.savez("train_history/valid_history.npz",
                     loss=np.array(valid_loss),
                     top1=np.array(valid_prec))

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
def main():
    """Evaluate a trained ``Model`` on the test list and report accuracy.

    Loads ``args.weights``, scores every video in ``args.test_list`` with
    1 or 10 crops per segment, prints the accuracy and, when
    ``args.save_scores`` is set, saves scores/labels/names reordered by
    sorted video name.

    Raises:
        ValueError: if ``args.test_crops`` is neither 1 nor 10.
    """
    net = Model(num_class, args.test_segments, args.representation,
                base_model=args.arch)

    checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Drop the first dotted component of every key before loading
    # (presumably the 'module.' prefix added by DataParallel -- TODO confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict)

    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size, net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.test_segments,
            representation=args.representation,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
        ),
        batch_size=1, shuffle=False,
        num_workers=args.workers * 2, pin_memory=True)

    # NOTE(review): the number of devices is tied to args.workers, so
    # args.workers must not exceed len(args.gpus) -- confirm this is intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    def forward_video(data):
        """Return the scores of one video averaged over segments and crops."""
        # NOTE(review): `volatile` only disables autograd on PyTorch < 0.4;
        # on newer releases the replacement is `with torch.no_grad():`.
        input_var = torch.autograd.Variable(data, volatile=True)
        scores = net(input_var)
        scores = scores.view(
            (-1, args.test_segments * args.test_crops) + scores.size()[1:])
        scores = torch.mean(scores, dim=1)
        return scores.data.cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in enumerate(data_loader):
        video_scores = forward_video(data)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num, float(cnt_time) / (i + 1)))

    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels)))
        / len(video_pred) * 100.0,
        len(video_pred)))

    if args.save_scores is not None:
        # Read the list inside a context manager so the handle is closed.
        with open(args.test_list) as list_file:
            name_list = [line.strip().split()[0] for line in list_file]

        # Results are stored at the position of the video name in sorted order.
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i, video_output in enumerate(output):
            name = name_list[i]
            idx = order_dict[name]
            reorder_output[idx] = video_output
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
# Switch the model to evaluation mode and move it to `device`.
model.eval()
model.to(device)

# Define the transform
batch_size = 1
snippet_length = 1  # Number of frames composing the snippet, 1 for RGB, 5 for optical flow
snippet_channels = 3  # Number of channels in a frame, 3 for RGB, 2 for optical flow
height, width = 224, 224

crop_count = 10

if crop_count == 1:
    cropping = Compose([
        GroupScale(model.scale_size),
        GroupCenterCrop(model.input_size),
    ])
elif crop_count == 10:
    cropping = GroupOverSample(model.input_size, model.scale_size)
else:
    raise ValueError("Only 1 and 10 crop_count are supported while we got {}".format(crop_count))

# The original compared `base_model` with itself, which made `roll` always
# True and `div` always False regardless of the backbone. The flags are meant
# to depend on whether the backbone is BNInception.
# NOTE(review): confirm 'BNInception' is the exact name used for `base_model`.
transform = Compose([
    cropping,
    Stack(roll=base_model == 'BNInception'),
    ToTorchFormatTensor(div=base_model != 'BNInception'),
    GroupNormalize(model.input_mean, model.input_std),
])

pred_verb_indices = []
pred_noun_indices = []
def main():
    """Evaluate a trained flow-generating ``Model`` and report accuracy.

    Loads ``args.weights`` (non-strictly), scores every video in
    ``args.test_list`` from its motion-vector and residual inputs, prints the
    accuracy and, when ``args.save_scores`` is set, saves
    scores/labels/names reordered by sorted video name.

    Raises:
        ValueError: if ``args.test_crops`` is neither 1 nor 10.
    """
    # define the model
    net = Model(num_class, args.test_segments, args.representation,
                base_model=args.arch,
                new_length=args.new_length,
                use_databn=args.use_databn,
                gen_flow_or_delta=args.gen_flow_or_delta,
                gen_flow_ds_factor=args.gen_flow_ds_factor,
                arch_estimator=args.arch_estimator,
                att=args.att)

    # load the trained model onto the CPU first
    checkpoint = torch.load(args.weights,
                            map_location=lambda storage, loc: storage)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Drop the first dotted component of every key before loading
    # (presumably the 'module.' prefix added by DataParallel -- TODO confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict, strict=False)

    # setup the data loader
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.crop_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.flow_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.test_segments,
            representation=args.representation,
            new_length=args.new_length,
            flow_ds_factor=args.flow_ds_factor,
            upsample_interp=args.upsample_interp,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
            gop=args.gop,
            flow_folder=args.data_flow,
            viz=args.viz),
        batch_size=1, shuffle=False,
        num_workers=args.workers * 2, pin_memory=True)

    # deploy model on gpu
    # NOTE(review): the number of devices is tied to args.workers, so
    # args.workers must not exceed len(args.gpus) -- confirm this is intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net.cuda(devices[0])
    net = torch.nn.DataParallel(net, device_ids=devices)

    # switch to inference mode and start to iterate over the test set
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    def forward_video(input_mv, input_residual, att=0):
        """Score one video; also return the generated (and attention) flow.

        Returns ``(scores, gen_flow)`` when ``att == 0`` and
        ``(scores, gen_flow, att_flow)`` when ``att == 1``.
        """
        # torch.no_grad() replaces Variable(..., volatile=True), which no
        # longer disables autograd on PyTorch >= 0.4.
        with torch.no_grad():
            if att == 0:
                scores, gen_flow = net(input_mv, input_residual)
            if att == 1:
                scores, gen_flow, att_flow = net(input_mv, input_residual)
            scores = scores.view(
                (-1, args.test_segments * args.test_crops) + scores.size()[1:])
            scores = torch.mean(scores, dim=1)
        if att == 0:
            return scores.data.cpu().numpy().copy(), gen_flow
        if att == 1:
            return scores.data.cpu().numpy().copy(), gen_flow, att_flow

    proc_start_time = time.time()

    # iterate over the whole test set
    for i, (input_flow, input_mv, input_residual, label) in enumerate(data_loader):
        # `async` is a reserved word since Python 3.7; `non_blocking` is the
        # current name of the same option.
        # NOTE(review): args.gpus is indexed here even though the device
        # selection above allows args.gpus to be None.
        input_mv = input_mv.cuda(args.gpus[-1], non_blocking=True)
        input_residual = input_residual.cuda(args.gpus[0], non_blocking=True)
        input_flow = input_flow.cuda(args.gpus[-1], non_blocking=True)

        if args.att == 0:
            video_scores, gen_flow = forward_video(input_mv, input_residual)
        if args.att == 1:
            video_scores, gen_flow, att_flow = forward_video(
                input_mv, input_residual, args.att)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num, float(cnt_time) / (i + 1)))

    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels)))
        / len(video_pred) * 100.0,
        len(video_pred)))

    if args.save_scores is not None:
        # Read the list inside a context manager so the handle is closed.
        with open(args.test_list) as list_file:
            name_list = [line.strip().split()[0] for line in list_file]

        # Results are stored at the position of the video name in sorted order.
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i, video_output in enumerate(output):
            name = name_list[i]
            idx = order_dict[name]
            reorder_output[idx] = video_output
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
def main():
    """Evaluate a trained ``Model`` (stacked motion vectors) on the test list.

    Loads ``args.weights``, scores every video in ``args.test_list`` with
    1 or 10 crops per segment, prints the accuracy and, when
    ``args.save_scores`` is set, saves scores/labels/names reordered by
    sorted video name.

    ``Model`` receives the class count, ``args.test_segments``, the
    representation, the base architecture and ``args.mv_stack_size``.

    Raises:
        ValueError: if ``args.test_crops`` is neither 1 nor 10.
    """
    net = Model(num_class, args.test_segments, args.representation,
                base_model=args.arch,
                mv_stack_size=args.mv_stack_size)

    checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Drop the first dotted component of every key before loading
    # (presumably the 'module.' prefix added by DataParallel -- TODO confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict)

    # --test-crops sets the number of crops per segment: 1 uses a single
    # center crop, 10 uses GroupOverSample.
    # NOTE(review): TSN passes net.input_size where this code passes
    # net.crop_size; the original author's notes say the two are the same.
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size, net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    # NOTE(review): unlike TSN, no Stack()/ToTorchFormatTensor()/normalize
    # steps are appended to `cropping`; the original author's notes say the
    # group transforms already stack the frames -- confirm.
    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.test_segments,
            representation=args.representation,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
            mv_stack_size=args.mv_stack_size),
        batch_size=1, shuffle=False,
        # Deliberately args.workers here (an earlier revision used
        # args.workers * 2).
        num_workers=args.workers, pin_memory=True)

    # NOTE(review): the number of devices is tied to args.workers, so
    # args.workers must not exceed len(args.gpus) -- confirm this is intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    def forward_video(data):
        """Return the scores of one video averaged over segments and crops.

        The author's debug notes record an input of shape
        [1, 25, 10, 224, 224] and raw network scores of shape [25, 101]
        during testing.
        """
        # NOTE(review): `volatile` only disables autograd on PyTorch < 0.4;
        # on newer releases the replacement is `with torch.no_grad():`.
        input_var = torch.autograd.Variable(data, volatile=True)
        scores = net(input_var)
        # Group the per-segment/per-crop rows of each video, then average.
        scores = scores.view(
            (-1, args.test_segments * args.test_crops) + scores.size()[1:])
        scores = torch.mean(scores, dim=1)
        return scores.data.cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in enumerate(data_loader):
        video_scores = forward_video(data)
        output.append((video_scores, label[0]))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num, float(cnt_time) / (i + 1)))

    video_pred = [np.argmax(x[0]) for x in output]
    video_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(video_pred) == np.array(video_labels)))
        / len(video_pred) * 100.0,
        len(video_pred)))

    if args.save_scores is not None:
        # Read the list inside a context manager so the handle is closed.
        with open(args.test_list) as list_file:
            name_list = [line.strip().split()[0] for line in list_file]

        # Results are stored at the position of the video name in sorted order.
        order_dict = {e: i for i, e in enumerate(sorted(name_list))}

        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_name = [None] * len(output)

        for i, video_output in enumerate(output):
            name = name_list[i]
            idx = order_dict[name]
            reorder_output[idx] = video_output
            reorder_label[idx] = video_labels[i]
            reorder_name[idx] = name

        np.savez(args.save_scores,
                 scores=reorder_output,
                 labels=reorder_label,
                 names=reorder_name)
def main():
    """Evaluate the two-class ('Copy' / 'Not Copy') model on the test list.

    Loads ``args.weights``, runs every input pair through the network, logs
    the accuracy and a classification report to TensorBoard
    (``./log/test``) and, when ``args.save_scores`` is set, pickles the raw
    scores and the labels.

    Raises:
        ValueError: if ``args.test_crops`` is neither 1 nor 10.
    """
    writter = SummaryWriter('./log/test', comment='')
    net = Model(2, args.num_segments, args.representation,
                base_model=args.arch)

    checkpoint = torch.load(args.weights)
    print("model epoch {} lowest loss {}".format(checkpoint['epoch'],
                                                 checkpoint['loss_min']))

    # Drop the first dotted component of every key before loading
    # (presumably the 'module.' prefix added by DataParallel -- TODO confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict)

    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size, net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError("Only 1 and 10 crops are supported, but got {}.".format(args.test_crops))

    data_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=cropping,
            is_train=False,
            accumulate=(not args.no_accumulation),
        ),
        batch_size=1, shuffle=False,
        num_workers=args.workers * 2, pin_memory=True)

    devices = [torch.device("cuda:%d" % device) for device in args.gpus]
    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    total_num = len(data_loader.dataset)
    scores = []
    labels = []
    proc_start_time = time.time()
    correct_nums = 0

    for i, (input_pairs, label) in enumerate(data_loader):
        # torch.no_grad must be *called* to obtain the context manager.
        with torch.no_grad():
            input_pairs[0] = input_pairs[0].float().to(devices[0])
            input_pairs[1] = input_pairs[1].float().to(devices[0])
            label = label.float().to(devices[0])

            _outputs, y = net(input_pairs)
            _, predicts = torch.max(y, 1)

        scores.append(y.detach().cpu().numpy())
        labels.append(label.detach().cpu().numpy())
        # .item() keeps the running count a plain Python int rather than a
        # GPU tensor, so the accuracy below is an ordinary float.
        correct_nums += (predicts == label.long()).sum().item()

        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num, float(cnt_time) / (i + 1)))

    # Each entry of `scores` holds one batch of class scores (class axis 1,
    # as used by torch.max above). Join the batches before taking the argmax;
    # np.argmax(scores, 1) would reduce over the stacked batch axis instead.
    predits = np.argmax(np.concatenate(scores, axis=0), 1)
    # `np.long` was an alias of the builtin int and is gone in NumPy 1.24.
    labels = np.around(labels).astype(int).ravel()

    acc = 100.0 * correct_nums / len(data_loader.dataset)
    target_names = ['Copy', 'Not Copy']

    writter.add_text('Accuracy', '%.3f%%' % acc)
    # add_text requires a tag as well as the text.
    writter.add_text('Classification report',
                     classification_report(labels, predits,
                                           target_names=target_names))
    writter.close()  # flush pending events to disk
    print(('Validating Results: accuracy: {accuracy:.3f}%'.format(accuracy=acc)))

    if args.save_scores is not None:
        with open(args.save_scores + '_scores.pkl', 'wb') as fp:
            pickle.dump(scores, fp)
        with open(args.save_scores + '_labels.pkl', 'wb') as fp:
            pickle.dump(labels, fp)
def main():
    """Parse the command line, build model/data/optimizers and run training.

    Steps, in order:

    1. Parse arguments into the module-level ``args`` and map the dataset
       name to its number of classes.
    2. Build the model, optionally initialising it from ``args.weights``.
    3. Build the training and validation data loaders.
    4. Wrap the model in ``DataParallel`` and create three Adam optimizers,
       one per parameter group selected by substring of the parameter name
       (``base_model``, ``gen_flow_model``, ``discriminator``).
    5. Optionally resume model and optimizer state from ``args.resume``.
    6. Train for ``args.epochs`` epochs, validating every ``args.eval_freq``
       epochs (and on the last one) and saving checkpoints.

    Updates the globals ``args``, ``best_prec1`` and ``start_epoch``.

    Raises:
        ValueError: if ``args.data_name`` or ``args.loss_mse`` is not one of
            the supported values.
    """
    # loading input arguments for training
    global args
    global best_prec1
    global start_epoch
    start_epoch = 0
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'kinetics400':
        num_class = 400
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    # define the model architecture
    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch,
                  new_length=args.new_length,
                  use_databn=args.use_databn,
                  gen_flow_or_delta=args.gen_flow_or_delta,
                  gen_flow_ds_factor=args.gen_flow_ds_factor,
                  arch_estimator=args.arch_estimator,
                  arch_d=args.arch_d,
                  att=args.att)
    print(model)

    # load the pre-trained model
    if args.weights is not None:
        checkpoint = torch.load(args.weights,
                                map_location=lambda storage, loc: storage)
        print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                      checkpoint['best_prec1']))
        # Drop the first dotted component of every key (presumably the
        # ``module.`` prefix added by DataParallel -- confirm).
        base_dict = {'.'.join(k.split('.')[1:]): v
                     for k, v in checkpoint['state_dict'].items()}
        # strict=False: keys missing from / extra in the checkpoint are
        # tolerated.
        model.load_state_dict(base_dict, strict=False)

    # define the data loader for reading training data
    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.flow_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            new_length=args.new_length,
            flow_ds_factor=args.flow_ds_factor,
            upsample_interp=args.upsample_interp,
            transform=model.get_augmentation(),
            is_train=True,
            accumulate=(not args.no_accumulation),
            gop=args.gop,
            flow_folder=args.data_flow,
            mv_minmaxnorm=args.mv_minmaxnorm,
        ),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    # define the data loader for reading val data
    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.flow_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            new_length=args.new_length,
            flow_ds_factor=args.flow_ds_factor,
            upsample_interp=args.upsample_interp,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size),
            ]),
            is_train=False,
            accumulate=(not args.no_accumulation),
            gop=args.gop,
            flow_folder=args.data_flow,
            mv_minmaxnorm=args.mv_minmaxnorm,
        ),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda(args.gpus[0])
    cudnn.benchmark = True

    # define optimizer and specify the corresponding parameters
    params_cls = []
    params_gf = []
    params_d = []
    for key, value in model.named_parameters():
        # Biases are excluded from weight decay in every group.
        decay_mult = 0.0 if 'bias' in key else 1.0

        # The three tests are deliberately independent ``if``s: a name that
        # matches several substrings is added to each matching group.
        if 'base_model' in key:
            # for cls, just finetune; the '.fc.' parameters keep the full
            # learning rate.
            lr_mult = 1.0 if '.fc.' in key else args.lr_cls_mult
            params_cls.append({'params': value, 'lr': args.lr,
                               'lr_mult': lr_mult, 'decay_mult': decay_mult})

        if 'gen_flow_model' in key:
            params_gf.append({'params': value, 'lr': args.lr,
                              'lr_mult': args.lr_mse_mult,
                              'decay_mult': decay_mult})

        if 'discriminator' in key:
            params_d.append({'params': value, 'lr': args.lr,
                             'lr_mult': args.lr_d_mult,
                             'decay_mult': decay_mult})

    optimizer_cls = torch.optim.Adam(
        params_cls,
        weight_decay=args.weight_decay,
        eps=0.001)
    optimizer_gf = torch.optim.Adam(
        params_gf,
        weight_decay=args.weight_decay,
        eps=0.001)
    optimizer_d = torch.optim.Adam(
        params_d,
        weight_decay=args.weight_decay,
        eps=0.001)

    # resume training from previous checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage)
            start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])

            if 'optimizer_cls' in checkpoint:
                optimizer_cls.load_state_dict(checkpoint['optimizer_cls'])
                optimizer_gf.load_state_dict(checkpoint['optimizer_gf'])
                optimizer_d.load_state_dict(checkpoint['optimizer_d'])

                def load_opt_update_cuda(optimizer, cuda_id):
                    """Move every tensor in the optimizer state to ``cuda_id``."""
                    # The checkpoint was mapped to CPU storage above, so the
                    # restored optimizer tensors must be moved to the GPU.
                    for state in optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda(cuda_id)

                load_opt_update_cuda(optimizer_cls, args.gpus[0])
                load_opt_update_cuda(optimizer_gf, args.gpus[0])
                load_opt_update_cuda(optimizer_d, args.gpus[0])

            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            # Make it visible that training restarts from scratch instead of
            # silently ignoring a mistyped path.
            print("=> no checkpoint found at '{}'".format(args.resume))

    # define several loss functions
    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpus[0])
    if args.loss_mse == 'MSELoss':
        criterion_mse = torch.nn.MSELoss().cuda(args.gpus[0])
    elif args.loss_mse == 'SmoothL1Loss':
        criterion_mse = torch.nn.SmoothL1Loss().cuda(args.gpus[0])
    elif args.loss_mse == 'L1':
        criterion_mse = torch.nn.L1Loss().cuda(args.gpus[0])
    else:
        # Without this branch ``criterion_mse`` stays unbound and the first
        # train() call fails with an unrelated-looking NameError.
        raise ValueError('Unknown loss_mse ' + str(args.loss_mse))

    # finally done with setup and start to train model
    for epoch in range(start_epoch, args.epochs):
        # determine the learning rate for the current epoch
        cur_lr_cls = adjust_learning_rate(optimizer_cls, epoch,
                                          args.lr_steps, args.lr_decay)
        cur_lr_gf = adjust_learning_rate(optimizer_gf, epoch,
                                         args.lr_steps, args.lr_decay)
        cur_lr_d = adjust_learning_rate(optimizer_d, epoch,
                                        args.lr_steps, args.lr_decay)

        # perform training
        train(train_loader, model, criterion, criterion_mse,
              optimizer_cls, optimizer_gf, optimizer_d, epoch,
              cur_lr_cls, cur_lr_gf, cur_lr_d,
              args.lr_cls, args.lr_adv_g, args.lr_adv_d, args.lr_mse,
              args.att)

        # perform validation if needed
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1 = validate(val_loader, model, criterion, criterion_mse,
                             args.lr_cls, args.lr_adv_g, args.lr_adv_d,
                             args.lr_mse, args.att)
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        # Stored as the epoch to resume *from*.
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer_cls': optimizer_cls.state_dict(),
                        'optimizer_gf': optimizer_gf.state_dict(),
                        'optimizer_d': optimizer_d.state_dict(),
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
def main():
    """Evaluate a trained image classifier on the test list and print accuracy.

    Loads the checkpoint at ``args.weights``, feeds every image of
    ``args.test_list`` through the network with batch size 1 (averaging the
    scores over ``args.test_crops`` crops) and prints the top-1 accuracy.

    Raises:
        ValueError: if ``args.test_crops`` is neither 1 nor 10.
    """
    net = Model(num_class, base_model=args.arch)

    # Load onto the CPU first so the checkpoint does not depend on the GPU
    # layout it was saved with; the network is moved to the GPU below.
    checkpoint = torch.load(args.weights,
                            map_location=lambda storage, loc: storage)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))

    # Drop the first dotted component of every key (presumably the
    # ``module.`` prefix added by DataParallel at training time -- confirm).
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in checkpoint['state_dict'].items()
    }
    net.load_state_dict(base_dict)

    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.crop_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.crop_size,
                            net.scale_size,
                            is_mv=(args.representation == 'mv'))
        ])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported, but got {}.".format(
                args.test_crops))

    data_loader = torch.utils.data.DataLoader(FoodDataSet(
        args.data_root,
        img_list=args.test_list,
        transform=cropping,
        is_train=False,
    ),
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    # NOTE(review): the number of devices is taken from ``args.workers``
    # (also used for the DataLoader worker count).  This raises IndexError
    # when ``args.workers`` exceeds ``len(args.gpus)`` or is 0 -- confirm
    # whether ``list(args.gpus)`` was intended.
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    def forward_img(data):
        """Return crop-averaged class scores for one batch.

        Args:
            data (Tensor): size [batch_size, c, h, w]
        Returns:
            scores (ndarray) : size [batch_size, num_class]
        """
        # ``torch.no_grad()`` already disables autograd; the removed
        # ``Variable(..., volatile=True)`` wrapper is not needed.
        with torch.no_grad():
            scores = net(data)
            # Group the rows by crop and average over the crop axis.
            scores = scores.view((-1, args.test_crops) + scores.size()[1:])
            scores = torch.mean(scores, dim=1)
            return scores.detach().cpu().numpy().copy()

    proc_start_time = time.time()

    for i, (data, label) in enumerate(data_loader):
        # data = [1, c, h ,w], label = [1]
        img_scores = forward_img(data)
        # Store the label as a plain int so the comparison below works on
        # ordinary NumPy integer arrays.
        output.append((img_scores[0], int(label[0])))
        cnt_time = time.time() - proc_start_time
        if (i + 1) % 100 == 0:
            print('image {} done, total {}/{}, average {} sec/image'.format(
                i, i + 1, total_num,
                float(cnt_time) / (i + 1)))

    img_pred = [np.argmax(x[0]) for x in output]
    img_labels = [x[1] for x in output]

    print('Accuracy {:.02f}% ({})'.format(
        float(np.sum(np.array(img_pred) == np.array(img_labels))) /
        len(img_pred) * 100.0, len(img_pred)))