Exemplo n.º 1
0
def main(DATASET,
         LABELS,
         CLASS_IDS,
         BATCH_SIZE,
         ANNOTATION_FILE,
         SEQ_SIZE=16,
         STEP=16,
         nstrokes=-1,
         N_EPOCHS=25,
         base_name=""):
    '''
    Train a ConvLSTM stroke classifier and evaluate it on the validation split.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    ANNOTATION_FILE : str
        file handed to attn_utils.get_cluster_labels() to obtain the label
        keys / values used for sampling, training and prediction
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    nstrokes : int
        not used by this function; kept so existing callers keep working
    N_EPOCHS : int
        number of training epochs
    base_name : str
        directory used for the cached sample weights and passed on to
        save_model_checkpoint() / load_weights(). An empty string means the
        current working directory.

    Returns:
    --------
    model : the trained model, after its checkpoint has been saved and reloaded

    '''
    # os.makedirs("") raises, so only create the directory when one is given.
    if base_name:
        os.makedirs(base_name, exist_ok=True)
    seed = 1234
    attn_utils.seed_everything(seed)

    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    train_transform = transforms.Compose([
        videotransforms.RandomCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((32 * multiple, 32 * multiple)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    test_transform = transforms.Compose([
        videotransforms.CenterCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((32 * multiple, 32 * multiple)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    train_dataset = CricketStrokesDataset(train_lst,
                                          DATASET,
                                          LABELS,
                                          CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP,
                                          train=True,
                                          framewiseTransform=False,
                                          transform=train_transform)
    val_dataset = CricketStrokesDataset(val_lst,
                                        DATASET,
                                        LABELS,
                                        CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP,
                                        train=False,
                                        framewiseTransform=False,
                                        transform=test_transform)

    ###########################################################################

    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)

    num_classes = len(set(labs_values))

    # Weighted sampler for class imbalance. The weights are cached on disk;
    # the cache key is only (#classes, #training clips), so delete the file
    # if the labels change without changing either of those.
    weights_path = os.path.join(
        base_name,
        "weights_c" + str(num_classes) + "_" + str(len(train_dataset)) + ".pkl")
    if os.path.isfile(weights_path):
        # NOTE: pickle is only safe because this is a locally generated cache.
        with open(weights_path, "rb") as fp:
            samples_weight = pickle.load(fp)
    else:
        samples_weight = attn_utils.get_sample_weights(train_dataset,
                                                       labs_keys, labs_values,
                                                       train_lst)
        with open(weights_path, "wb") as fp:
            pickle.dump(samples_weight, fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # Seed NumPy in this process. Writing `worker_init_fn=np.random.seed(12)`
    # does exactly this and then hands None to the DataLoader, so the seeding
    # is spelled out here and no worker_init_fn is passed.
    np.random.seed(12)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              sampler=sampler)

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    # The validation split is exposed under the "test" key.
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    model = convrnn.ConvLSTM(input_channels=3,
                             hidden_channels=[128, 64, 64, 32, 32],
                             kernel_size=3,
                             num_classes=num_classes,
                             multiple=multiple,
                             step=5,
                             effective_step=[4])
    model = model.to(device)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Collect (and display) every trainable parameter
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t {}".format(name))

    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)

    # Decay LR by a factor of 0.1 every 10 epochs
    lr_scheduler = StepLR(optimizer_ft, step_size=10, gamma=0.1)

    ###########################################################################
    # Training the model
    start = time.time()

    model = train_model(model,
                        data_loaders,
                        criterion,
                        optimizer_ft,
                        lr_scheduler,
                        labs_keys,
                        labs_values,
                        num_epochs=N_EPOCHS)

    end = time.time()

    # save the trained model, then read the checkpoint back
    save_model_checkpoint(base_name, model, N_EPOCHS, "Adam")
    model = load_weights(base_name, model, N_EPOCHS, "Adam")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    ###########################################################################

    print("Predicting ...")
    predict(model, data_loaders, labs_keys, labs_values, phase='test')

    print("#Parameters : {} ".format(autoenc_utils.count_parameters(model)))

    return model
Exemplo n.º 2
0
def extract_3DCNN_feats(DATASET, LABELS, BATCH_SIZE, SEQ_SIZE=16, STEP=16,
                        foldno=1, train=True, nclasses=51, model_path=None,
                        nstrokes=-1):
    '''
    Extract clip features with the Clip2Vec extractor and group them per video.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing the labels / split files
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    foldno : int
        passed to hmdb.HMDB51 as `fold`
    train : bool
        passed to hmdb.HMDB51 as `train`
    nclasses : int
        passed to Clip2Vec together with model_path
    model_path : str or None
        passed to Clip2Vec
    nstrokes : int
        partial extraction: stop after the first batch at whose end at least
        this many videos have been completed. Negative => whole dataset.

    Returns:
    --------
    traj_dict : dict mapping each name in stroke_names to an np.ndarray built
        from that video's list of clip features
    stroke_names : list of names, in the order the videos were completed

    '''

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([
        T.CenterCrop(224),
        T.ToPILClip(),
        T.Resize((112, 112)),
        T.ToHMDBTensor(),
    ])
    part_dataset = hmdb.HMDB51(DATASET,
                               LABELS,
                               SEQ_SIZE,
                               step_between_clips=STEP,
                               fold=foldno,
                               train=train,
                               transform=clip_transform)

    data_loader = DataLoader(dataset=part_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Clip2Vec(model_path, nclasses)
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        len(data_loader), BATCH_SIZE))
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, start_pts, end_pts,
              _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using 3D ResNet (For SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)

        # split the batch into per-video tensors with their names
        inputs_lst, batch_stroke_names = separate_video_tensors(
            inputs, vid_path)

        for name, enc_input in zip(batch_stroke_names, inputs_lst):
            # A name change means the previous video is complete: flush it.
            # (On the very first item stroke_traj is empty, so nothing is flushed.)
            if name != prev_stroke and stroke_traj:
                num_strokes += 1
                trajectories.append(stroke_traj)
                stroke_names.append(prev_stroke)
                stroke_traj = []

            enc_output = enc_input.squeeze(1).cpu().detach().numpy()
            # one feature vector per clip of this video
            stroke_traj.extend(enc_output[i, :]
                               for i in range(enc_output.shape[0]))
            prev_stroke = name

        # `>=` rather than `==`: one batch may complete several videos and
        # step over the requested count, which would otherwise never stop.
        if nstrokes >= 0 and num_strokes >= nstrokes:
            break

    # The trailing video is only flushed for a full-dataset run; on a partial
    # run it may be incomplete.
    if stroke_traj and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(prev_stroke)

    # group_strokewise not needed here as videos are trimmed. Was needed for strokes
    # to generate a lists of stroke feature lists for an untrimmed video.

    traj_dict = {vid: np.array(traj)
                 for vid, traj in zip(stroke_names, trajectories)}
    return traj_dict, stroke_names
Exemplo n.º 3
0
def extract_I3D_feats(DATASET, LABELS, BATCH_SIZE, SEQ_SIZE=16, STEP=16,
                      foldno=1, train=True, nclasses=51, model_path=None,
                      nstrokes=-1):
    '''
    Extract clip features with an InceptionI3d network and group them per video.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing the labels / split files
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (must be >= 16)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    foldno : int
        passed to hmdb.HMDB51 as `fold`
    train : bool
        passed to hmdb.HMDB51 as `train`
    nclasses : int
        number of output classes the I3D network is built with; it has to
        match the checkpoint in model_path
    model_path : str or None
        state-dict file to load into the network; None keeps the freshly
        constructed weights
    nstrokes : int
        partial extraction: stop after the first batch at whose end at least
        this many videos have been completed. Negative => whole dataset.

    Returns:
    --------
    traj_dict : dict mapping each name in stroke_names to an np.ndarray built
        from that video's list of clip features
    stroke_names : list of names, in the order the videos were completed

    '''

    # Only a centre crop: the raw clips are fed to the network
    clip_transform = transforms.Compose([
        T.CenterCrop(224),
    ])
    part_dataset = hmdb.HMDB51(DATASET,
                               LABELS,
                               SEQ_SIZE,
                               step_between_clips=STEP,
                               fold=foldno,
                               train=train,
                               transform=clip_transform)

    data_loader = DataLoader(dataset=part_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    i3d = InceptionI3d(nclasses, in_channels=3)
    if model_path is not None:
        # Load on CPU first so the checkpoint's original device does not matter.
        i3d.load_state_dict(torch.load(model_path, map_location="cpu"))

    i3d = i3d.cuda()
    # Inference only: freeze BatchNorm statistics / disable dropout.
    i3d.eval()

    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        len(data_loader), BATCH_SIZE))
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, start_pts, end_pts,
              _) in enumerate(data_loader):
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using I3D (For SL >= 16)
        inputs = inputs.permute(0, 4, 1, 2, 3).float().cuda()
        with torch.no_grad():  # no autograd graph needed for extraction
            inputs = i3d.extract_features(
                inputs)  # returned (B, 1024, 1, 1, 1) tensor

        # split the batch into per-video tensors with their names
        inputs_lst, batch_stroke_names = separate_video_tensors(
            inputs.cpu(), vid_path)

        for name, enc_input in zip(batch_stroke_names, inputs_lst):
            # A name change means the previous video is complete: flush it.
            # (On the very first item stroke_traj is empty, so nothing is flushed.)
            if name != prev_stroke and stroke_traj:
                num_strokes += 1
                trajectories.append(stroke_traj)
                stroke_names.append(prev_stroke)
                stroke_traj = []

            # drop the three trailing singleton dims
            enc_output = enc_input.squeeze(4).squeeze(3).squeeze(
                2).cpu().detach().numpy()
            # one feature vector per clip of this video
            stroke_traj.extend(enc_output[i]
                               for i in range(enc_output.shape[0]))
            prev_stroke = name

        # `>=` rather than `==`: one batch may complete several videos and
        # step over the requested count, which would otherwise never stop.
        if nstrokes >= 0 and num_strokes >= nstrokes:
            break

    # The trailing video is only flushed for a full-dataset run; on a partial
    # run it may be incomplete.
    if stroke_traj and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(prev_stroke)

    # group_strokewise not needed here as videos are trimmed. Was needed for strokes
    # to generate a lists of stroke feature lists for an untrimmed video.

    traj_dict = {vid: np.array(traj)
                 for vid, traj in zip(stroke_names, trajectories)}
    return traj_dict, stroke_names
Exemplo n.º 4
0
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='', batch_size=32, save_model='i3dIter1k_'):
    '''
    Restore a fine-tuned I3D checkpoint and run prediction on the test split.

    NOTE: the training loop is currently disabled (kept below as a comment),
    so this function only builds the data loaders and the network, loads the
    checkpoint `<log_path>/<save_model><num_epochs zero-padded to 6>.pt` and
    calls predict().

    Parameters:
    -----------
    init_lr : float
        learning rate of the SGD optimizer (only used by the disabled loop)
    max_steps : float
        not used by the active code
    mode : str
        'flow' builds a 2-channel network from the flow ImageNet weights;
        anything else builds a 3-channel network from the RGB weights
    root : str
        not used by the active code
    batch_size : int
        batch size of both data loaders
    save_model : str
        filename prefix of the checkpoint inside log_path

    '''
    num_epochs = 30
    seed_everything()
    os.makedirs(log_path, exist_ok=True)

    # setup dataset
    train_transforms = transforms.Compose([T.RandomCrop(224),
                                           T.ToPILClip(),
                                           T.Resize((224, 224)),
                                           T.ToTensor(),
                                           T.Normalize(),
                                           ])
    test_transforms = transforms.Compose([T.CenterCrop(224),
                                          T.ToPILClip(),
                                          T.Resize((224, 224)),
                                          T.ToTensor(),
                                          T.Normalize(),
                                          ])

    dataset = HMDB51(DATASET, LABELS, 16, step_between_clips=1,
                     fold=1, train=True, transform=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    val_dataset = HMDB51(DATASET, LABELS, 16, step_between_clips=1,
                         fold=1, train=False, transform=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}

    # setup the model: start from the 400-class pretrained weights, then swap
    # the classifier for a 51-class one. map_location keeps the load working
    # when the weights were saved on a device this machine does not have.
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('/home/arpan/VisionWorkspace/pytorch-i3d/models/flow_imagenet.pt',
                                       map_location=device))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('/home/arpan/VisionWorkspace/pytorch-i3d/models/rgb_imagenet.pt',
                                       map_location=device))
    i3d.replace_logits(51)
    i3d = i3d.to(device)

    # Optimizer / scheduler are only consumed by the disabled training loop.
    optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9, weight_decay=0.0000001)
    # Decay LR by a factor of 0.1 every 10 epochs
    lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    start = time.time()

    # ----------------------------------------------------------------------
    # Disabled training loop, kept for reference:
    #
    #    for epoch in range(num_epochs):
    #        print('Epoch {}/{}'.format(epoch+1, num_epochs))
    #        print('-' * 10)
    #
    #        # Each epoch has a training and validation phase
    #        for phase in ['train', 'test']:
    #            i3d.train(phase == 'train')
    #            tot_loss = 0.0
    #            running_corrects = 0
    #            count = [0.] * 51
    #
    #            for bno, (inputs, vid_path, start_pts, end_pts, labels) in enumerate(dataloaders[phase]):
    #                inputs = inputs.permute(0, 2, 1, 3, 4).float()      # for PIL and ToTensor
    #                inputs = inputs.to(device)
    #                labels = labels.to(device)
    #
    #                for k, v in Counter(labels.tolist()).items():
    #                    count[k] += v
    #
    #                optimizer.zero_grad()
    #                per_frame_logits = i3d(inputs)  # get B x N_CLASSES X 1
    #                per_frame_logits = per_frame_logits.squeeze(2)
    #                loss = F.cross_entropy(per_frame_logits, labels)
    #                tot_loss += loss.item()
    #
    #                # backward + optimize only if in training phase
    #                if phase == 'train':
    #                    loss.backward()
    #                    optimizer.step()
    #
    #                running_corrects += torch.sum(torch.max(per_frame_logits, 1)[1] == labels.data)
    #                if bno == 1000:
    #                    break
    #            if phase == 'train':
    #                lr_sched.step()
    #                print("Category Weights : {}".format(count))
    #            epoch_loss = tot_loss / (16*(bno+1))
    #            epoch_acc = running_corrects.double() / (16*(bno+1))
    #            print('{} Loss: {:.6f} Acc: {:.6f} LR: {}'.format(phase, epoch_loss, epoch_acc,
    #                  lr_sched.get_last_lr()[0]))
    #
    #            if (epoch+1) % 10 == 0:
    #                torch.save(i3d.state_dict(), os.path.join(log_path, save_model+str(epoch+1).zfill(6)+'.pt'))
    # ----------------------------------------------------------------------

    # Restore the checkpoint written after the final epoch of a previous run.
    i3d.load_state_dict(torch.load(os.path.join(log_path, save_model+str(num_epochs).zfill(6)+'.pt'),
                                   map_location=device))

    # With training disabled this interval only covers the checkpoint load.
    end = time.time()
    print("Total Execution time for {} epoch : {}".format(num_epochs, (end-start)))

    ###########################################################################

    # Predictions

    predict(i3d, dataloaders, 16, 'test')