Example #1
def predict(clip, model):
    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    spatial_transform = Compose([
        Scale((150, 150)),
        #Scale(int(opt.sample_size / opt.scale_in_test)),
        #CornerCrop(opt.sample_size, opt.crop_position_in_test),
        ToTensor(opt.norm_value), norm_method
    ])
    if spatial_transform is not None:
        # spatial_transform.randomize_parameters()
        clip = [spatial_transform(img) for img in clip] 
    clip = torch.stack(clip, dim=0)
    clip = clip.unsqueeze(0)
    with torch.no_grad():
        print(clip.shape)
        outputs = model(clip)
        outputs = F.softmax(outputs, dim=1)
    print(outputs)
    scores, idx = torch.topk(outputs, k=1)
    mask = scores > 0.6
    preds = idx[mask]
    return preds
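
A minimal usage sketch for the predict() helper above, assuming the global opt object it reads is already configured and that model is a loaded 3D CNN; the clip length and frame size below are illustrative only.

# Hypothetical call: build a dummy 16-frame clip of 150x150 RGB PIL images
# (the format the spatial transform expects) and run it through predict().
from PIL import Image
import numpy as np

frames = [Image.fromarray(np.zeros((150, 150, 3), dtype=np.uint8))
          for _ in range(16)]
preds = predict(frames, model)  # top-1 class index, kept only if its softmax score exceeds 0.6
print(preds)
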
Example #2
def pre_process_frame(frame, opt):
    # Convert from BGR opencv channel layout to RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Convert to pillow format for easy pre-processing
    frame = Image.fromarray(frame)

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    spatial_transforms_det = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(opt.norm_value), norm_method
    ])

    # Use torchvision transforms for compatibility with SSAR model
    spatial_transforms_clf = transforms.Compose([
        transforms.Resize(opt.sample_size_clf),
        transforms.ToTensor(),
        transforms.Normalize(opt.mean_clf, opt.std_clf)
    ])

    det_frame = spatial_transforms_det(frame)
    clf_frame = spatial_transforms_clf(frame)
    return det_frame, clf_frame
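
An illustrative call to pre_process_frame(), assuming opt carries the fields referenced above (sample_size, mean, norm_value, and the *_clf variants) and that the frame comes straight from OpenCV in BGR order; the video path is a placeholder.

# Hypothetical usage: read one frame and produce the detector and classifier tensors.
import cv2

cap = cv2.VideoCapture('input.mp4')  # placeholder video source
ok, frame = cap.read()
if ok:
    det_frame, clf_frame = pre_process_frame(frame, opt)
    print(det_frame.shape, clf_frame.shape)
cap.release()
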
Example #3
    def __init__(self, model_file, sample_duration, model_type, cuda_id=0):

        self.opt = parse_opts()

        self.opt.model = model_type

        self.opt.root_path = './C3D_ResNet/data'

        self.opt.resume_path = os.path.join(self.opt.root_path, model_file)
        self.opt.pretrain_path = os.path.join(self.opt.root_path,
                                              'models/resnet-18-kinetics.pth')

        self.opt.cuda_id = cuda_id
        self.opt.dataset = 'ucf101'
        self.opt.n_classes = 400
        self.opt.n_finetune_classes = 3
        self.opt.ft_begin_index = 4
        self.opt.model_depth = 18
        self.opt.resnet_shortcut = 'A'
        self.opt.sample_duration = sample_duration
        self.opt.batch_size = 1
        self.opt.n_threads = 1
        self.opt.checkpoint = 5

        self.opt.arch = '{}-{}'.format(self.opt.model, self.opt.model_depth)
        self.opt.mean = get_mean(self.opt.norm_value,
                                 dataset=self.opt.mean_dataset)
        self.opt.std = get_std(self.opt.norm_value)
        # print(self.opt)

        print('Loading C3D action-recognition model..')

        self.model, parameters = generate_model(self.opt)
        # print(self.model)

        if self.opt.no_mean_norm and not self.opt.std_norm:
            norm_method = Normalize([0, 0, 0], [1, 1, 1])
        elif not self.opt.std_norm:
            norm_method = Normalize(self.opt.mean, [1, 1, 1])
        else:
            norm_method = Normalize(self.opt.mean, self.opt.std)

        if self.opt.resume_path:
            print('    loading checkpoint {}'.format(self.opt.resume_path))
            checkpoint = torch.load(self.opt.resume_path)
            # assert self.opt.arch == checkpoint['arch']

            self.opt.begin_epoch = checkpoint['epoch']
            self.model.load_state_dict(checkpoint['state_dict'])

        self.spatial_transform = Compose([
            ScaleQC(int(self.opt.sample_size / self.opt.scale_in_test)),
            CornerCrop(self.opt.sample_size, self.opt.crop_position_in_test),
            ToTensor(self.opt.norm_value), norm_method
        ])

        self.target_transform = ClassLabel()

        self.model.eval()
Example #4
def model_process(count, model):
    opt = parse_opts()

    if opt.root_path != '':
        opt.video_path = os.path.join(opt.root_path, opt.video_path)
        opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
        opt.result_path = os.path.join(opt.root_path, opt.result_path)
        if opt.resume_path:
            opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
        if opt.pretrain_path:
            opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
    opt.scales = [opt.initial_scale]
    for i in range(1, opt.n_scales):
        opt.scales.append(opt.scales[-1] * opt.scale_step)
    #opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
    opt.std = get_std(opt.norm_value)
    #print(opt)
    #print(opt.result_path)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)

    #print(model)
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    print('testing is run')

    if opt.test:
        spatial_transform = Compose([
            Scale(int(opt.sample_size / opt.scale_in_test)),
            CornerCrop(opt.sample_size, opt.crop_position_in_test),
            ToTensor(opt.norm_value), norm_method
        ])
        temporal_transform = LoopPadding(opt.sample_duration)
        target_transform = VideoID()

        test_data = get_test_set(opt, spatial_transform, temporal_transform,
                                 target_transform)

        test_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=opt.batch_size,
                                                  shuffle=False,
                                                  num_workers=opt.n_threads,
                                                  pin_memory=True)

        tester.test(count, test_loader, model, opt, test_data.class_names)
Example #5
File: main.py  Project: shuxiao0312/STRG
def get_normalize_method(mean, std, no_mean_norm, no_std_norm):
    if no_mean_norm:
        if no_std_norm:
            return Normalize([0, 0, 0], [1, 1, 1])
        else:
            return Normalize([0, 0, 0], std)
    else:
        if no_std_norm:
            return Normalize(mean, [1, 1, 1])
        else:
            return Normalize(mean, std)
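
A small sketch of how get_normalize_method() resolves the four flag combinations; the statistics below are illustrative ImageNet values, not taken from the STRG project.

# Hypothetical check of the branches above.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

full = get_normalize_method(mean, std, no_mean_norm=False, no_std_norm=False)      # Normalize(mean, std)
mean_only = get_normalize_method(mean, std, no_mean_norm=False, no_std_norm=True)  # Normalize(mean, [1, 1, 1])
none = get_normalize_method(mean, std, no_mean_norm=True, no_std_norm=True)        # Normalize([0, 0, 0], [1, 1, 1]), a no-op
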
Example #6
def get_loaders(opt):
    """ Make dataloaders for train and validation sets
	"""
    # train loader
    norm_method = Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    spatial_transform = Compose([
        Scale((opt.sample_size, opt.sample_size)),
        Resize(256),
        CenterCrop(224),
        ToTensor(), norm_method
    ])
    temporal_transform = TemporalRandomCrop(25)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True)

    # validation loader
    target_transform = ClassLabel()
    temporal_transform = LoopPadding(25)
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.num_workers,
                                             pin_memory=True)
    return train_loader, val_loader
Example #7
def get_dataloader(opt):

    mean = [110.63666788 / 255, 103.16065604 / 255, 96.29023126 / 255]
    std = [1, 1, 1]

    norm_method = Normalize(mean, std)

    spatial_transform = Compose(
        [Scale(112),
         CornerCrop(112, 'c'),
         ToTensor(255), norm_method])

    temporal_transform = LoopPadding(16)
    target_transform = ClassLabel()

    test_data = SurgicalDataset(os.path.abspath(opt.frames_path),
                                os.path.abspath(
                                    opt.video_phase_annotation_path),
                                opt.class_names,
                                spatial_transform=spatial_transform,
                                temporal_transform=temporal_transform,
                                target_transform=target_transform,
                                sample_duration=16)

    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=4,
                                              pin_memory=True)

    return test_loader
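
A hedged sketch of iterating over the loader returned above; what each batch contains depends on SurgicalDataset, so the (clip, target) unpacking here is an assumption.

# Hypothetical iteration: inspect the first batch only.
loader = get_dataloader(opt)
for clip, target in loader:
    print(clip.shape, target)  # e.g. (1, 3, 16, 112, 112) given the transforms above
    break
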
Example #8
 def __init__(self,
              root_dir,
              spatial_transform=None,
              seqLen=20,
              train=True,
              mulSeg=False,
              numSeg=1,
              fmt='.png',
              regression=True,
              numOrdClass=12):
     normalize = Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
     self.images, self.labels, self.numFrames = gen_split(
         root_dir, 5, train)  # see above
     self.main_spatial_transform = spatial_transform  # data-augmentation transform
     self.spatial_transform_rgb = Compose(
         [self.main_spatial_transform,
          ToTensor(), normalize])
     if not regression:
         self.spatial_transform_mmaps = Compose([
             self.main_spatial_transform,
             Scale(7),
             ToTensor(),
             Binary(0.4)
         ])
     else:
         self.spatial_transform_mmaps = Compose(
             [self.main_spatial_transform,
              Scale(7), ToTensor()])
     self.train = train
     self.mulSeg = mulSeg
     self.numSeg = numSeg
     self.seqLen = seqLen
     self.fmt = fmt
     self.numOrdClass = numOrdClass
Example #9
def classify_video(video_dir, video_name, class_names, model, opt):
    # print("video_dir: {}, video_name: {}".format(video_dir,video_name));
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir, spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size,
                                              shuffle=False, num_workers=opt.n_threads, pin_memory=True)

    video_outputs = []
    # video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        inputs = Variable(inputs, volatile=True)
        outputs = model(inputs)

        video_outputs.append(outputs.cpu().data)
        # video_segments.append(segments)

    if len(video_outputs) != 0:
        video_outputs = torch.cat(video_outputs)
        return video_outputs.numpy()
    else:
        return None
Example #10
def extract_feature(opt, video_dir, C3D_model):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    load_image_fn = None
    data = Video(opt, video_dir, load_image_fn,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size,
                                              shuffle=False, num_workers=opt.n_threads, pin_memory=True)

    c3d_features = []
    for i, clip in enumerate(data_loader):

        print(clip.mean())

        ## c3d feats
        clip = clip.to(opt.device)
        with torch.no_grad():
            c3d_outputs = C3D_model(clip)

        # collect the features
        c3d_features.append(c3d_outputs.cpu().data) # torch.Size([8, 512, 14, 14])

    c3d_features = torch.cat(c3d_features, 0)  # c3d feature of one video


    return c3d_features.cpu().numpy()
Example #11
def get_loaders(opt):
	""" Make dataloaders for train and validation sets
	"""
	# train loader
	opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
	if opt.no_mean_norm and not opt.std_norm:
		norm_method = Normalize([0, 0, 0], [1, 1, 1])
	elif not opt.std_norm:
		norm_method = Normalize(opt.mean, [1, 1, 1])
	else:
		norm_method = Normalize(opt.mean, opt.std)
	spatial_transform = Compose([
		# crop_method,
		Scale((opt.sample_size, opt.sample_size)),
		# RandomHorizontalFlip(),
		ToTensor(opt.norm_value), norm_method
	])
	temporal_transform = TemporalRandomCrop(16)
	target_transform = ClassLabel()
	training_data = get_training_set(opt, spatial_transform,
									 temporal_transform, target_transform)
	train_loader = torch.utils.data.DataLoader(
		training_data,
		batch_size=opt.batch_size,
		shuffle=True,
		num_workers=opt.num_workers,
		pin_memory=True)

	# validation loader
	spatial_transform = Compose([
		Scale((opt.sample_size, opt.sample_size)),
		# CenterCrop(opt.sample_size),
		ToTensor(opt.norm_value), norm_method
	])
	target_transform = ClassLabel()
	temporal_transform = LoopPadding(16)
	validation_data = get_validation_set(
		opt, spatial_transform, temporal_transform, target_transform)
	val_loader = torch.utils.data.DataLoader(
		validation_data,
		batch_size=opt.batch_size,
		shuffle=False,
		num_workers=opt.num_workers,
		pin_memory=True)
	return train_loader, val_loader
Example #12
def classify_video(video_dir,
                   video_name,
                   class_names,
                   model,
                   opt,
                   annotation_digit=5):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    print('reading file from: ', video_dir, 'file name: ', video_name)

    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        inputs = Variable(inputs, volatile=True)
        outputs = model(inputs)

        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)

    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {'video': video_name, 'clips': []}

    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }

        if opt.mode == 'score':
            clip_results['label'] = class_names[max_indices[i]]
            clip_results['scores'] = video_outputs[i].tolist()
        elif opt.mode == 'feature':
            clip_results['features'] = video_outputs[i].tolist()
            clip_results['ground_truth_annotation'] = annotation_digit

        results['clips'].append(clip_results)

    return results
Example #13
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    print('video_name, class_names', video_name)
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    video_outputs = []
    video_segments = []
    print('Running on video', video_dir)

    #print ('Data loader size', len(data_loader))
    for i, (inputs, segments) in enumerate(data_loader):
        inputs = Variable(inputs, volatile=True)
        print(i, inputs.size(), segments.shape)
        outputs = model(inputs)

        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)

    #print('Video outputs and segments', video_outputs)
    results = {'video': video_name, 'clips': []}
    if len(video_outputs) > 0:
        print('Video outputs and segments: ', video_outputs[0].shape)

        video_outputs = torch.cat(video_outputs)
        video_segments = torch.cat(video_segments)

        _, max_indices = video_outputs.max(dim=1)
        print('Video outputs', video_outputs.size())
        for i in range(video_outputs.size(0)):
            clip_results = {
                'segment': video_segments[i].tolist(),
            }

            if opt.mode == 'score':
                clip_results['label'] = class_names[max_indices[i]]
                clip_results['scores'] = video_outputs[i].tolist()
            elif opt.mode == 'feature':
                clip_results['features'] = video_outputs[i].tolist()

            results['clips'].append(clip_results)

    return results
Example #14
def main_run(dataset, model_state_dict, dataset_dir, seqLen, memSize, stackSize):

    if dataset == 'gtea61':
        num_classes = 61
    elif dataset == 'gtea71':
        num_classes = 71
    elif dataset == 'gtea_gaze':
        num_classes = 44
    elif dataset == 'egtea':
        num_classes = 106

    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    normalize = Normalize(mean=mean, std=std)
    spatial_transform = Compose([Scale(256), CenterCrop(224)])
    spatial_transform2 = Compose([Scale((7, 7)), ToTensor()])
    sequence = True


    vid_seq_test = makeDataset(dataset_dir, spatial_transform2, spatial_transform=spatial_transform, stackSize=stackSize, fmt='.png', phase='Test', seqLen=seqLen)

    test_loader = torch.utils.data.DataLoader(vid_seq_test, batch_size=1,
                            shuffle=False, num_workers=2, pin_memory=True)

    model = attentionModel(num_classes=num_classes, mem_size=memSize)
    model.load_state_dict(torch.load(model_state_dict))
    
    for params in model.parameters():
        params.requires_grad = False
    
    model.train(False)
    model.cuda()
    test_samples = vid_seq_test.__len__()
    print('Number of samples = {}'.format(test_samples))
    print('Evaluating...')
    numCorr = 0
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        #for j, (inputs, targets) in enumerate(test_loader):
        for flowX, flowY, inputs, targets in test_loader:
            inputVariable = Variable(inputs.permute(1, 0, 2, 3, 4).cuda())
            output_label, _ , flowXprediction , flowYprediction = model(inputVariable)
            
            _, predicted = torch.max(output_label.data, 1)
            numCorr += (predicted == targets.cuda()).sum()
            true_labels.append(targets)
            predicted_labels.append(predicted.cpu())
    
    test_accuracy = torch.true_divide(numCorr, test_samples) * 100
    test_accuracy = 'Test Accuracy = {}%'.format(test_accuracy)
    print(test_accuracy)
Example #15
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir, spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration,
                 stride=opt.stride)
    data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size,
                                              shuffle=False, num_workers=opt.n_threads, pin_memory=True)

    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        inputs = Variable(inputs, volatile=True)
        outputs = model(inputs)

        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)

    if len(video_outputs) == 0:
        with open("error.list", 'a') as fout:
            fout.write("{}\n".format(video_name))
        return {}

    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)

    results = {
        'video': video_name,
        'clips': []
    }

    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }

        if opt.mode == 'score':
            clip_results['label'] = class_names[max_indices[i]]
            clip_results['scores'] = video_outputs[i].tolist()
        elif opt.mode == 'feature':
            clip_results['features'] = video_outputs[i].tolist()

        results['clips'].append(clip_results)

    return results
Example #16
def extract_feature(opt, video_dir, C3D_model, load_image_fn, C2D_model,
                    c2d_shape, duration):
    assert opt.mode in ['score', 'feature']
    C, H, W = c2d_shape

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)

    opt.num_segments = max(int(duration / opt.clip_len), 1)
    data = Video(opt,
                 video_dir,
                 load_image_fn,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)

    c3d_features = []
    c2d_features = []
    for i, (clip, frames_npy_data) in enumerate(data_loader):

        ## c3d feats
        clip = clip.to(opt.device)
        with torch.no_grad():
            c3d_outputs = C3D_model(clip)

        frames = frames_npy_data.to(opt.device)
        with torch.no_grad():
            c2d_outputs = C2D_model(frames).squeeze()
            if len(c2d_outputs.shape) == 1:
                c2d_outputs = c2d_outputs.unsqueeze(0)

        # collect the features
        c3d_features.append(c3d_outputs.cpu().data)
        c2d_features.append(c2d_outputs.cpu().data)

    try:
        c3d_features = torch.cat(c3d_features)  # c3d feature of one video
        c2d_features = torch.cat(c2d_features)  # c3d feature of one video
    except Exception:
        return None, None

    return c3d_features.cpu().numpy(), c2d_features.cpu().numpy()
Example #17
def get_cam_visualisation(self,
                          resnet,
                          weight_softmax,
                          input_pil_image,
                          preprocess_for_viz=None,
                          preprocess_for_model=None):
    if preprocess_for_viz is None:
        preprocess_for_viz = Compose([
            Scale(256),
            CenterCrop(224),
        ])
    if preprocess_for_model is None:
        normalize = Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])

        preprocess_for_model = Compose(
            [Scale(256), CenterCrop(224),
             ToTensor(), normalize])

    tensor_image = preprocess_for_model(input_pil_image)
    pil_image = preprocess_for_viz(input_pil_image)

    logit, feature_conv, _ = resnet(tensor_image.unsqueeze(0).cuda())

    bz, nc, h, w = feature_conv.size()
    feature_conv = feature_conv.view(bz, nc, h * w)

    h_x = F.softmax(logit, dim=1).data
    probs, idx = h_x.sort(1, True)

    cam_img = torch.bmm(weight_softmax[idx[:, 0]].unsqueeze(1),
                        feature_conv).squeeze(1)
    cam_img = F.softmax(cam_img, 1).data
    cam_img = cam_img.cpu()
    cam_img = cam_img.reshape(h, w)
    cam_img = cam_img - torch.min(cam_img)
    cam_img = cam_img / torch.max(cam_img)

    cam_img = np.uint8(255 * cam_img)
    img = np.uint8(pil_image)

    output_cam = cv2.resize(cam_img, pil_image.size)
    heatmap = cv2.applyColorMap(output_cam, cv2.COLORMAP_JET)
    img = cv2.cvtColor(np.uint8(img), cv2.COLOR_RGB2BGR)

    result = heatmap * 0.4 + img * 0.6
    result = cv2.cvtColor(np.uint8(result), cv2.COLOR_BGR2RGB)

    return Image.fromarray(result)
Example #18
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    video_outputs = []
    video_segments = []

    with torch.no_grad():
        for i, (inputs, segments) in enumerate(data_loader):
            inputs = Variable(inputs)
            outputs = model(inputs)

            video_outputs.append(outputs.cpu().data)
            video_segments.append(segments)

        video_outputs = torch.cat(video_outputs)
        video_segments = torch.cat(video_segments)
        results = {'video': video_name, 'clips': []}

        os.mkdir('features/' + video_name.split('.')[0])

        mypath = 'features/' + video_name.split('.')[0] + '/'

        _, max_indices = video_outputs.max(dim=1)
        for i in range(video_outputs.size(0)):

            with open(mypath + str(i) + '.txt', 'w+') as f:

                f.write(' '.join(map(str, video_outputs[i].tolist())))

        return results
Example #19
def extract_feats(file_path, net, filenames, frame_num, batch_size, save_path):
    """Extract 3D features (saved in .npy) for a video. """
    net.eval()
    mean = get_mean(255, dataset='kinetics')
    std = get_std(255)
    transform = Compose([
        trn.ToPILImage(),
        Scale(112),
        CornerCrop(112, 'c'),
        ToTensor(),
        Normalize(mean, std)
    ])

    print("Network loaded")
    # Read videos and extract features in batches
    # (start_idx and end_idx are assumed to be defined elsewhere in the script)
    for file in filenames[start_idx:end_idx]:
        feat_file = os.path.join(save_path, file[:-4] + '.npy')
        if os.path.exists(feat_file):
            continue
        vid = imageio.get_reader(os.path.join(file_path, file), 'ffmpeg')

        curr_frames = []
        for frame in vid:
            if len(frame.shape) < 3:
                frame = np.repeat(frame[:, :, np.newaxis], 3, axis=2)  # grayscale -> 3-channel
            curr_frames.append(transform(frame).unsqueeze(0))
        curr_frames = torch.cat(curr_frames, dim=0)
        print("Shape of frames: {0}".format(curr_frames.shape))
        idx = np.linspace(0, len(curr_frames) - 1, frame_num).astype(int)
        print("Captured {} clips: {}".format(len(idx), curr_frames.shape))

        curr_feats = []
        for i in range(0, len(idx), batch_size):
            curr_batch = [
                curr_frames[x - 8:x + 8, ...].unsqueeze(0)
                for x in idx[i:i + batch_size]
            ]
            curr_batch = torch.cat(curr_batch, dim=0).cuda()
            out = net(curr_batch.transpose(1, 2).cuda())
            curr_feats.append(out.detach().cpu())
            print("Appended {} features {}".format(i + 1, out.shape))
        curr_feats = torch.cat(curr_feats, 0)
        del out
        #set_trace()
        np.save(feat_file, curr_feats.numpy())
        print("Saved file {}\nExiting".format(file[:-4] + '.npy'))
Example #20
def classify_video(video_dir, video_name, model, opt):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    video_outputs = []
    video_segments = []

    for i, (inputs, segments) in enumerate(data_loader):
        inputs = Variable(inputs, volatile=True)
        outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)

    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    # results = {
    #     'video': video_name,
    #     'clips': []
    # }
    clips = []
    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }

        clip_results['features'] = video_outputs[i].tolist()
        clips.append(clip_results)

    return video_name, clips
Example #21
def classify_video(video_dir, video_name, model, opt):
    assert opt.mode in ['score', 'feature']

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=False)

    video_outputs = []
    video_segments = []

    with torch.no_grad():
        for i, (inputs, segments) in enumerate(data_loader):

            inputs = Variable(inputs)

            outputs = model(inputs)

            video_outputs.append(outputs.cpu().data)
            video_segments.append(segments)

    if video_outputs:
        video_outputs = torch.cat(video_outputs)
        video_segments = torch.cat(video_segments)

    results = dict()
    results['video'] = video_name
    results['features'] = video_outputs
    results['clips'] = video_segments

    return results
Example #22
def eval(model):
    crop_method = GroupRandomScaleCenterCrop(size=(224, 224))
    norm = Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    spatial_transform = Compose(
        [crop_method,
            GroupRandomHorizontalFlip(),
            ToTensor(), norm])
    temporal_transform = RandomCrop(size=16, stride=1)
    target_transform = Label()
    val_data = RWF2000('/content/RWF_2000/frames/',
                     '/content/Action_Recognition' + '/RWF-2000.json', 'validation',
                     spatial_transform, temporal_transform, target_transform, 'rwf-2000')
    # print(len(val_data))
    val_loader = DataLoader(val_data,
                            batch_size=16,
                            shuffle=False,
                            num_workers=4,
                            pin_memory=True)
    criterion = nn.CrossEntropyLoss()
    val_loss, val_acc = val(val_loader, model, criterion)
Example #23
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode == 'feature'

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)

    video_outputs = []
    video_segments = []
    with torch.no_grad():

        for i, (inputs, segments) in enumerate(data_loader):
            inputs = Variable(inputs)
            outputs = model(inputs)

            video_outputs.append(outputs.cpu().data)
            video_segments.append(segments)

    video_outputs = torch.cat(video_outputs)
    # video_segments = torch.cat(video_segments)
    results = []

    for i in range(video_outputs.size(0)):
        clip_results = np.expand_dims(video_outputs[i].numpy(), axis=0)

        results.append(clip_results)
    results = np.concatenate(results, axis=0)
    return results
Example #24
    def __init__(self,
                 root_dir,
                 spatial_transform=None,
                 seqLen=20,
                 train=True,
                 mulSeg=False,
                 numSeg=1,
                 fmt='.png',
                 phase='train',
                 regressor=False):

        self.images, self.maps, self.labels, self.numFrames = gen_split(
            root_dir, 5, phase)
        normalize = Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        self.spatial_transform0 = spatial_transform
        self.spatial_rgb = Compose(
            [self.spatial_transform0,
             ToTensor(), normalize])

        if not (regressor):
            self.spatial_transform_map = Compose(
                [self.spatial_transform0,
                 Scale(7),
                 ToTensor(),
                 Binary(0.4)])
        else:
            self.spatial_transform_map = Compose(
                [self.spatial_transform0,
                 Scale(7), ToTensor()])

        self.train = train
        self.mulSeg = mulSeg
        self.numSeg = numSeg
        self.seqLen = seqLen
        self.fmt = fmt
Example #25
def main_run(numEpochs, lr, stepSize, decayRate, trainBatchSize, seqLen,
             memSize, evalInterval, evalMode, numWorkers, outDir,
             fightsDir_train, noFightsDir_train, fightsDir_test,
             noFightsDir_test):

    train_dataset_dir_fights = fightsDir_train
    train_dataset_dir_noFights = noFightsDir_train
    test_dataset_dir_fights = fightsDir_test
    test_dataset_dir_noFights = noFightsDir_test

    trainDataset, trainLabels, trainNumFrames = make_split(
        train_dataset_dir_fights, train_dataset_dir_noFights)
    testDataset, testLabels, testNumFrames = make_split(
        test_dataset_dir_fights, test_dataset_dir_noFights)

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize = Normalize(mean=mean, std=std)
    spatial_transform = Compose([
        Scale(256),
        RandomHorizontalFlip(),
        MultiScaleCornerCrop([1, 0.875, 0.75, 0.65625], 224),
        ToTensor(), normalize
    ])

    vidSeqTrain = VideoDataset(trainDataset,
                               trainLabels,
                               trainNumFrames,
                               spatial_transform=spatial_transform,
                               seqLen=seqLen)

    trainLoader = torch.utils.data.DataLoader(vidSeqTrain,
                                              batch_size=trainBatchSize,
                                              shuffle=True,
                                              num_workers=numWorkers,
                                              pin_memory=True,
                                              drop_last=True)

    if evalMode == 'centerCrop':
        test_spatial_transform = Compose(
            [Scale(256), CenterCrop(224),
             ToTensor(), normalize])
        testBatchSize = 1
    elif evalMode == 'tenCrops':
        test_spatial_transform = Compose(
            [Scale(256), TenCrops(size=224, mean=mean, std=std)])
        testBatchSize = 1
    elif evalMode == 'fiveCrops':
        test_spatial_transform = Compose(
            [Scale(256), FiveCrops(size=224, mean=mean, std=std)])
        testBatchSize = 1
    elif evalMode == 'horFlip':
        test_spatial_transform = Compose([
            Scale(256),
            CenterCrop(224),
            FlippedImagesTest(mean=mean, std=std)
        ])
        testBatchSize = 1

    vidSeqTest = VideoDataset(testDataset,
                              testLabels,
                              testNumFrames,
                              seqLen=seqLen,
                              spatial_transform=test_spatial_transform)

    testLoader = torch.utils.data.DataLoader(vidSeqTest,
                                             batch_size=testBatchSize,
                                             shuffle=False,
                                             num_workers=int(numWorkers / 2),
                                             pin_memory=True)

    numTrainInstances = vidSeqTrain.__len__()
    numTestInstances = vidSeqTest.__len__()

    print('Number of training samples = {}'.format(numTrainInstances))
    print('Number of testing samples = {}'.format(numTestInstances))

    modelFolder = './experiments_' + outDir  # Dir for saving models and log files
    # Create the dir
    if os.path.exists(modelFolder):
        print(modelFolder + ' exists!!!')
        sys.exit()
    else:
        os.makedirs(modelFolder)
    # Log files
    writer = SummaryWriter(modelFolder)
    trainLogLoss = open((modelFolder + '/trainLogLoss.txt'), 'w')
    trainLogAcc = open((modelFolder + '/trainLogAcc.txt'), 'w')
    testLogLoss = open((modelFolder + '/testLogLoss.txt'), 'w')
    testLogAcc = open((modelFolder + '/testLogAcc.txt'), 'w')

    model = ViolenceModel(mem_size=memSize)

    trainParams = []
    for params in model.parameters():
        params.requires_grad = True
        trainParams += [params]
    model.train(True)
    model.cuda()

    lossFn = nn.CrossEntropyLoss()
    optimizerFn = torch.optim.RMSprop(trainParams, lr=lr)
    optimScheduler = torch.optim.lr_scheduler.StepLR(optimizerFn, stepSize,
                                                     decayRate)

    minAccuracy = 50

    for epoch in range(numEpochs):
        optimScheduler.step()
        epochLoss = 0
        numCorrTrain = 0
        iterPerEpoch = 0
        model.train(True)
        print('Epoch = {}'.format(epoch + 1))
        writer.add_scalar('lr', optimizerFn.param_groups[0]['lr'], epoch + 1)
        for i, (inputs, targets) in enumerate(trainLoader):
            iterPerEpoch += 1
            optimizerFn.zero_grad()
            inputVariable1 = Variable(inputs.permute(1, 0, 2, 3, 4).cuda())
            labelVariable = Variable(targets.cuda())
            outputLabel = model(inputVariable1)
            loss = lossFn(outputLabel, labelVariable)
            loss.backward()
            optimizerFn.step()
            outputProb = torch.nn.Softmax(dim=1)(outputLabel)
            _, predicted = torch.max(outputProb.data, 1)
            numCorrTrain += (predicted == targets.cuda()).sum()
            epochLoss += loss.data[0]
        avgLoss = epochLoss / iterPerEpoch
        trainAccuracy = (numCorrTrain / numTrainInstances) * 100
        print('Training: Loss = {} | Accuracy = {}% '.format(
            avgLoss, trainAccuracy))
        writer.add_scalar('train/epochLoss', avgLoss, epoch + 1)
        writer.add_scalar('train/accuracy', trainAccuracy, epoch + 1)
        trainLogLoss.write('Training loss after {} epoch = {}\n'.format(
            epoch + 1, avgLoss))
        trainLogAcc.write('Training accuracy after {} epoch = {}\n'.format(
            epoch + 1, trainAccuracy))

        if (epoch + 1) % evalInterval == 0:
            model.train(False)
            print('Evaluating...')
            testLossEpoch = 0
            testIter = 0
            numCorrTest = 0
            for j, (inputs, targets) in enumerate(testLoader):
                testIter += 1
                if evalMode == 'centerCrop':
                    inputVariable1 = Variable(inputs.permute(1, 0, 2, 3,
                                                             4).cuda(),
                                              volatile=True)
                else:
                    inputVariable1 = Variable(inputs[0].cuda(), volatile=True)
                labelVariable = Variable(targets.cuda(non_blocking=True), volatile=True)
                outputLabel = model(inputVariable1)
                outputLabel_mean = torch.mean(outputLabel, 0, True)
                testLoss = lossFn(outputLabel_mean, labelVariable)
                testLossEpoch += testLoss.data[0]
                _, predicted = torch.max(outputLabel_mean.data, 1)
                numCorrTest += (predicted == targets[0]).sum()
            testAccuracy = (numCorrTest / numTestInstances) * 100
            avgTestLoss = testLossEpoch / testIter
            print('Testing: Loss = {} | Accuracy = {}% '.format(
                avgTestLoss, testAccuracy))
            writer.add_scalar('test/epochloss', avgTestLoss, epoch + 1)
            writer.add_scalar('test/accuracy', testAccuracy, epoch + 1)
            testLogLoss.write('Test Loss after {} epochs = {}\n'.format(
                epoch + 1, avgTestLoss))
            testLogAcc.write('Test Accuracy after {} epochs = {}%\n'.format(
                epoch + 1, testAccuracy))
            if testAccuracy > minAccuracy:
                savePathClassifier = (modelFolder + '/bestModel.pth')
                torch.save(model, savePathClassifier)
                minAccuracy = testAccuracy
    trainLogAcc.close()
    testLogAcc.close()
    trainLogLoss.close()
    testLogLoss.close()
    writer.export_scalars_to_json(modelFolder + "/all_scalars.json")
    writer.close()
    return True
Example #26
    opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
    opt.std = get_std(opt.norm_value)
    print(opt)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)

    model, parameters = generate_model(opt)
    # print(model)
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    if opt.no_mean_norm and not opt.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not opt.std_norm:
        norm_method = Normalize(opt.mean, [1, 1, 1])
    else:
        norm_method = Normalize(opt.mean, opt.std)

    if not opt.no_train:
        assert opt.train_crop in ['random', 'corner', 'center']
        if opt.train_crop == 'random':
            crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'corner':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'center':
            crop_method = MultiScaleCornerCrop(opt.scales,
                                               opt.sample_size,
                                               crop_positions=['c'])
Example #27
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)

    model = generate_model(opt)
    print(model)
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    if not opt.no_train:
        spatial_transform = Compose([
            MultiScaleCornerCrop(opt.scales, opt.sample_size),
            RandomHorizontalFlip(),
            ToTensor(opt.norm_value),
            Normalize(opt.mean, [1, 1, 1])
        ])
        temporal_transform = TemporalRandomCrop(opt.sample_duration)
        target_transform = ClassLabel()
        if opt.dataset == 'kinetics':
            training_data = Kinetics(opt.video_path,
                                     opt.annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform)
        else:
            training_data = ActivityNet(opt.video_path,
                                        opt.annotation_path,
                                        'training',
                                        spatial_transform=spatial_transform,
Example #28
    opt.arch = 'resnet-{}'.format(opt.model_depth)
    opt.mean = get_mean()
    opt.std = get_std()
    print(opt, flush=True)
    with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
        json.dump(vars(opt), opt_file)

    torch.manual_seed(opt.manual_seed)

    model, parameters, arch_parameters = generate_model(opt)
    print(model, flush=True)
    criterion = nn.CrossEntropyLoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()

    norm_method = Normalize(opt.mean, opt.std)

    if not opt.no_train:
        assert opt.train_crop in ['random', 'corner', 'center']
        if opt.train_crop == 'random':
            crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'corner':
            crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
        elif opt.train_crop == 'center':
            crop_method = MultiScaleCornerCrop(opt.scales,
                                               opt.sample_size,
                                               crop_positions=['c'])
        spatial_transform = Compose([
            crop_method,
            RandomHorizontalFlip(opt.dataset),
            ToTensor(), norm_method
Example #29
def main_run(dataset, stage, trainDatasetDir, valDatasetDir, stage1_dict,
             stackSize, out_dir, seqLen, trainBatchSize, valBatchSize,
             numEpochs, lr1, decay_factor, decay_step, memSize, alphaX,
             alphaY):

    if dataset == 'gtea61':
        num_classes = 61
    elif dataset == 'gtea71':
        num_classes = 71
    elif dataset == 'gtea_gaze':
        num_classes = 44
    elif dataset == 'egtea':
        num_classes = 106
    else:
        print('Dataset not found')
        sys.exit()

    model_folder = os.path.join(
        './', out_dir, 'attConvLSTM', str(seqLen),
        'stage' + str(stage))  # Dir for saving models and log files
    # Create the dir
    if os.path.exists(model_folder):
        print('Directory {} exists!'.format(model_folder))
        sys.exit()
    os.makedirs(model_folder)

    # Log files
    writer = SummaryWriter(model_folder)
    train_log_loss = open((model_folder + '/train_log_loss.txt'), 'w')
    train_log_acc = open((model_folder + '/train_log_acc.txt'), 'w')
    val_log_loss = open((model_folder + '/val_log_loss.txt'), 'w')
    val_log_acc = open((model_folder + '/val_log_acc.txt'), 'w')

    # Data loader
    normalize = Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
    spatial_transform = Compose([
        Scale(256),
        RandomHorizontalFlip(),
        MultiScaleCornerCrop([1, 0.875, 0.75, 0.65625], 224)
    ])
    spatial_transform2 = Compose([Scale((7, 7)), ToTensor()])

    vid_seq_train = makeDataset(trainDatasetDir,
                                spatial_transform2,
                                spatial_transform=spatial_transform,
                                sequence=False,
                                numSeg=1,
                                stackSize=stackSize,
                                fmt='.png',
                                seqLen=seqLen)

    trainInstances = vid_seq_train.__len__()

    train_loader = torch.utils.data.DataLoader(vid_seq_train,
                                               batch_size=trainBatchSize,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)

    if valDatasetDir is not None:
        vid_seq_val = makeDataset(valDatasetDir,
                                  spatial_transform2,
                                  spatial_transform=Compose(
                                      [Scale(256), CenterCrop(224)]),
                                  sequence=False,
                                  numSeg=1,
                                  stackSize=stackSize,
                                  fmt='.png',
                                  phase='Test',
                                  seqLen=seqLen)
        valInstances = vid_seq_val.__len__()

        val_loader = torch.utils.data.DataLoader(vid_seq_val,
                                                 batch_size=valBatchSize,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=True)

    train_params = []
    if stage == 1:
        model = attentionModel(num_classes=num_classes, mem_size=memSize)
        model.train(False)
        for params in model.parameters():
            params.requires_grad = False
    else:  # stage == 2
        model = attentionModel(num_classes=num_classes, mem_size=memSize)
        model.load_state_dict(torch.load(stage1_dict), strict=False)
        model.train(False)

        for params in model.parameters():
            params.requires_grad = False
        #
        for params in model.resNet.layer4[0].conv1.parameters():
            params.requires_grad = True
            train_params += [params]

        for params in model.resNet.layer4[0].conv2.parameters():
            params.requires_grad = True
            train_params += [params]

        for params in model.resNet.layer4[1].conv1.parameters():
            params.requires_grad = True
            train_params += [params]

        for params in model.resNet.layer4[1].conv2.parameters():
            params.requires_grad = True
            train_params += [params]

        for params in model.resNet.layer4[2].conv1.parameters():
            params.requires_grad = True
            train_params += [params]

        for params in model.resNet.layer4[2].conv2.parameters():
            params.requires_grad = True
            train_params += [params]
        #
        for params in model.resNet.fc.parameters():
            params.requires_grad = True
            train_params += [params]

        model.resNet.layer4[0].conv1.train(True)
        model.resNet.layer4[0].conv2.train(True)
        model.resNet.layer4[1].conv1.train(True)
        model.resNet.layer4[1].conv2.train(True)
        model.resNet.layer4[2].conv1.train(True)
        model.resNet.layer4[2].conv2.train(True)
        model.resNet.fc.train(True)

    for params in model.lstm_cell.parameters():
        params.requires_grad = True
        train_params += [params]

    for params in model.classifier.parameters():
        params.requires_grad = True
        train_params += [params]

    model.lstm_cell.train(True)

    model.classifier.train(True)
    model.cuda()

    loss_fn = nn.CrossEntropyLoss()
    loss_fn_regression = nn.MSELoss()  # Loss function for the regression model

    optimizer_fn = torch.optim.Adam(train_params,
                                    lr=lr1,
                                    weight_decay=4e-5,
                                    eps=1e-4)

    optim_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer_fn, milestones=decay_step, gamma=decay_factor)

    train_iter = 0
    min_accuracy = 0
    for epoch in range(numEpochs):
        epoch_loss = 0
        numCorrTrain = 0
        x_loss = 0
        y_loss = 0
        trainSamples = 0
        iterPerEpoch = 0

        model.lstm_cell.train(True)
        model.classifier.train(True)
        writer.add_scalar('lr', optimizer_fn.param_groups[0]['lr'], epoch + 1)

        if stage == 2:
            model.resNet.layer4[0].conv1.train(True)
            model.resNet.layer4[0].conv2.train(True)
            model.resNet.layer4[1].conv1.train(True)
            model.resNet.layer4[1].conv2.train(True)
            model.resNet.layer4[2].conv1.train(True)
            model.resNet.layer4[2].conv2.train(True)
            model.resNet.fc.train(True)

        #for i, (inputs, targets) in enumerate(train_loader):
        for flowX, flowY, inputs, targets in train_loader:
            train_iter += 1
            iterPerEpoch += 1
            optimizer_fn.zero_grad()

            flowX = flowX.cuda()
            flowY = flowY.cuda()

            inputVariable = Variable(inputs.permute(1, 0, 2, 3, 4).cuda())
            labelVariable = Variable(targets.cuda())
            trainSamples += inputs.size(0)

            output_label, _, flowXprediction, flowYprediction = model(
                inputVariable)

            #Reshaping predictions and inputs in order
            #to correctly regress on the inputs
            flowXprediction = flowXprediction.view(-1)
            flowX = torch.reshape(flowX, (-1, )).float()

            flowYprediction = flowYprediction.view(-1)
            flowY = torch.reshape(flowY, (-1, )).float()

            #print(f'Prediction: {flowXprediction.size()}')
            #print(f'Input : {flowX.size()}')

            #sys.exit()

            lossX = alphaX * loss_fn_regression(flowXprediction, flowX)
            lossY = alphaY * loss_fn_regression(flowYprediction, flowY)
            loss = loss_fn(output_label, labelVariable)

            #Weighting the loss of the ss task
            #by multiplying it by alpha
            total_loss = loss + lossX + lossY
            total_loss.backward()

            optimizer_fn.step()
            _, predicted = torch.max(output_label.data, 1)
            numCorrTrain += (predicted == targets.cuda()).sum()
            x_loss += lossX.item()
            y_loss += lossY.item()
            epoch_loss += loss.item()

        optim_scheduler.step()
        avg_x_loss = x_loss / iterPerEpoch
        avg_y_loss = y_loss / iterPerEpoch
        avg_loss = epoch_loss / iterPerEpoch
        trainAccuracy = torch.true_divide(numCorrTrain, trainSamples) * 100

        print('Train: Epoch = {} | Loss = {} | Accuracy = {}'.format(
            epoch + 1, avg_loss, trainAccuracy))
        print('X loss after {} epoch = {}% '.format(epoch + 1, avg_x_loss))
        print('Y loss after {} epoch = {}% '.format(epoch + 1, avg_y_loss))

        writer.add_scalar('train/epoch_loss', avg_loss, epoch + 1)
        writer.add_scalar('train/accuracy', trainAccuracy, epoch + 1)
        writer.add_scalar('x_train_loss', avg_x_loss, epoch + 1)
        writer.add_scalar('y_train_loss', avg_y_loss, epoch + 1)

        train_log_loss.write('Training X loss after {} epoch= {}'.format(
            epoch + 1, avg_x_loss))
        train_log_loss.write('Training Y loss after {} epoch= {}'.format(
            epoch + 1, avg_y_loss))
        train_log_loss.write('Training loss after {} epoch = {}\n'.format(
            epoch + 1, avg_loss))
        train_log_acc.write('Training accuracy after {} epoch = {}\n'.format(
            epoch + 1, trainAccuracy))

        if valDatasetDir is not None:
            model.train(False)
            val_loss_epoch = 0
            val_iter = 0
            val_x_loss = 0
            val_y_loss = 0
            val_samples = 0
            numCorr = 0
            mmap_loss = 0

            with torch.no_grad():
                #for j, (inputs, targets) in enumerate(val_loader):
                for flowX, flowY, inputs, targets in val_loader:
                    val_iter += 1
                    val_samples += inputs.size(0)

                    flowX = flowX.cuda()
                    flowY = flowY.cuda()

                    inputVariable = Variable(
                        inputs.permute(1, 0, 2, 3, 4).cuda())
                    labelVariable = Variable(targets.cuda(non_blocking=True))
                    #labelVariable = Variable(targets.cuda())

                    output_label, _, flowXprediction, flowYprediction = model(
                        inputVariable)

                    #Reshaping predictions and inputs in order
                    #to correctly regress on the inputs
                    flowXprediction = flowXprediction.view(-1)
                    flowX = torch.reshape(flowX, (-1, )).float()

                    flowYprediction = flowYprediction.view(-1)
                    flowY = torch.reshape(flowY, (-1, )).float()

                    lossX = alphaX * loss_fn_regression(flowXprediction, flowX)
                    lossY = alphaY * loss_fn_regression(flowYprediction, flowY)

                    val_loss = loss_fn(output_label, labelVariable)
                    val_loss_epoch += val_loss.item()
                    val_x_loss += lossX.item()
                    val_y_loss += lossY.item()

                    _, predicted = torch.max(output_label.data, 1)
                    numCorr += (predicted == targets.cuda()).sum()

            avg_x_val_loss = val_x_loss / val_iter
            avg_y_val_loss = val_y_loss / val_iter
            val_accuracy = torch.true_divide(numCorr, val_samples) * 100
            avg_val_loss = val_loss_epoch / val_iter

            print('Val X Loss after {} epochs, loss = {}'.format(
                epoch + 1, avg_x_val_loss))
            print('Val Y Loss after {} epochs, loss = {}'.format(
                epoch + 1, avg_y_val_loss))
            print('Val: Epoch = {} | Loss {} | Accuracy = {}'.format(
                epoch + 1, avg_val_loss, val_accuracy))

            writer.add_scalar('val x/epoch_loss', avg_x_val_loss, epoch + 1)
            writer.add_scalar('val y/epoch_loss', avg_y_val_loss, epoch + 1)
            writer.add_scalar('val/epoch_loss', avg_val_loss, epoch + 1)
            writer.add_scalar('val/accuracy', val_accuracy, epoch + 1)
            val_log_loss.write('Val X Loss after {} epochs = {}\n'.format(
                epoch + 1, avg_x_val_loss))
            val_log_loss.write('Val Y Loss after {} epochs = {}\n'.format(
                epoch + 1, avg_y_val_loss))
            val_log_loss.write('Val Loss after {} epochs = {}\n'.format(
                epoch + 1, avg_val_loss))
            val_log_acc.write('Val Accuracy after {} epochs = {}%\n'.format(
                epoch + 1, val_accuracy))

            if val_accuracy > min_accuracy:
                save_path_model = (model_folder + '/model_rgb_state_dict.pth')
                torch.save(model.state_dict(), save_path_model)
                min_accuracy = val_accuracy

    train_log_loss.close()
    train_log_acc.close()
    val_log_acc.close()
    val_log_loss.close()
    writer.export_scalars_to_json(model_folder + "/all_scalars.json")
    writer.close()
Example #30
        class_to_name[i] = class_to_name[i].replace(' ', '-')
    if args.dataset == 'ucf101':
        num_class = 101
        args.n_classes = 101
        img_prefix = 'image_'
    else:
        num_class = 174
        args.n_classes = 174
        img_prefix = ''

    whole_model, parameters = generate_model(args)
    print(whole_model)
    # input('...')

    if args.no_mean_norm and not args.std_norm:
        norm_method = Normalize([0, 0, 0], [1, 1, 1])
    elif not args.std_norm:
        norm_method = Normalize(args.mean, [1, 1, 1])
    else:
        norm_method = Normalize(args.mean, args.std)

    spatial_transform = Compose([
        Scale(args.sample_size),
        CenterCrop(args.sample_size),
        ToTensor(args.norm_value), norm_method
    ])
    # if not args.test_temp_crop == 'sparse':
    if args.compared_temp_transform == 'shuffle':
        temp_transform = ShuffleFrames(args.sample_duration)
    else:
        temp_transform = ReverseFrames(args.sample_duration)