def main(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, ANNOTATION_FILE, SEQ_SIZE=16,
         STEP=16, nstrokes=-1, N_EPOCHS=25, base_name=""):
    '''
    Train a ConvLSTM classifier on stroke clips, checkpoint it and evaluate it.

    The dataset files are split into train / val / test lists, clip datasets
    are built for the train and val lists, the training set is sampled with a
    WeightedRandomSampler (weights cached as a pickle inside `base_name`), and
    a convrnn.ConvLSTM is trained with Adam + StepLR. The trained model is
    saved, reloaded from the checkpoint and evaluated on the validation loader.

    Relies on the module-level names `multiple` and `device`.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    ANNOTATION_FILE :
        annotation source handed to attn_utils.get_cluster_labels, which
        yields the label keys and values used for weighting and evaluation
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    nstrokes : int
        not used by this function; kept for interface compatibility
    N_EPOCHS : int
        no. of training epochs, also passed to the checkpoint save / load helpers
    base_name : str
        directory for the cached sampler weights and the model checkpoint.
        An empty string means "current directory" (no directory is created).

    Returns:
    --------
    the model as reloaded from the saved checkpoint
    '''
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when one was actually requested; exist_ok avoids the isdir/makedirs race.
    if base_name:
        os.makedirs(base_name, exist_ok=True)

    seed = 1234
    attn_utils.seed_everything(seed)

    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    frame_size = (32 * multiple, 32 * multiple)
    train_transform = transforms.Compose([
        videotransforms.RandomCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize(frame_size),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    test_transform = transforms.Compose([
        videotransforms.CenterCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize(frame_size),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    train_dataset = CricketStrokesDataset(train_lst, DATASET, LABELS, CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP, train=True,
                                          framewiseTransform=False,
                                          transform=train_transform)
    val_dataset = CricketStrokesDataset(val_lst, DATASET, LABELS, CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP, train=False,
                                        framewiseTransform=False,
                                        transform=test_transform)

    ###########################################################################
    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)
    num_classes = len(set(labs_values))

    # Weighted sampler for class imbalance. The weights are cached on disk,
    # keyed by the number of classes and the number of training clips.
    weights_path = os.path.join(
        base_name,
        "weights_c" + str(num_classes) + "_" + str(len(train_dataset)) + ".pkl")
    if os.path.isfile(weights_path):
        # NOTE: the pickle is a cache written by this function; do not point
        # base_name at a directory containing untrusted files.
        with open(weights_path, "rb") as fp:
            samples_weight = pickle.load(fp)
    else:
        samples_weight = attn_utils.get_sample_weights(train_dataset, labs_keys,
                                                       labs_values, train_lst)
        with open(weights_path, "wb") as fp:
            pickle.dump(samples_weight, fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # The original passed `worker_init_fn=np.random.seed(12)`, which seeds
    # NumPy immediately in this process and hands None to the DataLoader.
    # Keep that effective behaviour, but state it explicitly.
    np.random.seed(12)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                              sampler=sampler)
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False)
    # The validation loader is registered under the "test" key.
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    # Alternative (disabled):
    # model = convrnn.ConvGRU(input_size=3, hidden_size=20, kernel_size=3, num_layers=1)
    model = convrnn.ConvLSTM(input_channels=3,
                             hidden_channels=[128, 64, 64, 32, 32],
                             kernel_size=3, num_classes=num_classes,
                             multiple=multiple, step=5, effective_step=[4])
    model = model.to(device)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Collect (and list) every parameter that will be optimized.
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t {}".format(name))

    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)
    # Decay LR by a factor of 0.1 every 10 epochs
    lr_scheduler = StepLR(optimizer_ft, step_size=10, gamma=0.1)

    ###########################################################################
    # Training the model
    start = time.time()
    model = train_model(model, data_loaders, criterion, optimizer_ft,
                        lr_scheduler, labs_keys, labs_values,
                        num_epochs=N_EPOCHS)
    end = time.time()

    # Persist the model returned by train_model, then reload it from disk so
    # that evaluation runs on exactly what was checkpointed.
    save_model_checkpoint(base_name, model, N_EPOCHS, "Adam")
    model = load_weights(base_name, model, N_EPOCHS, "Adam")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    ###########################################################################
    print("Predicting ...")
    predict(model, data_loaders, labs_keys, labs_values, phase='test')

    print("#Parameters : {} ".format(autoenc_utils.count_parameters(model)))

    return model
def extract_3DCNN_feats(DATASET, LABELS, BATCH_SIZE, SEQ_SIZE=16, STEP=16,
                        foldno=1, train=True, nclasses=51, model_path=None,
                        nstrokes=-1):
    '''
    Extract per-clip features with a Clip2Vec extractor and group them per video.

    Clips are read in order (shuffle=False) from an hmdb.HMDB51 dataset.
    Consecutive clips that carry the same video name are collected into one
    trajectory; a change of name closes the current trajectory.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing the labels / split files given to hmdb.HMDB51
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    foldno : int
        fold passed to hmdb.HMDB51
    train : bool
        train flag passed to hmdb.HMDB51
    nclasses : int
        no. of classes passed to Clip2Vec
    model_path : str or None
        weights path passed to Clip2Vec
    nstrokes : int
        partial extraction of features: stop once this many trajectories have
        been completed. A negative value (default) processes the whole dataset.

    Returns:
    --------
    traj_dict : dict
        maps video name -> np.array of that video's clip features
    stroke_names : list
        video names in the order their trajectories were completed
    '''
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([
        T.CenterCrop(224),
        T.ToPILClip(),
        T.Resize((112, 112)),
        T.ToHMDBTensor(),
    ])
    part_dataset = hmdb.HMDB51(DATASET, LABELS, SEQ_SIZE,
                               step_between_clips=STEP, fold=foldno,
                               train=train, transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE,
                             shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Clip2Vec(model_path, nclasses)
    prev_stroke = None
    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        len(data_loader), BATCH_SIZE))

    for bno, (inputs, vid_path, start_pts, end_pts, _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using 3D ResNet (For SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)

        # split the batch of features into per-video tensors and their names
        inputs_lst, batch_stroke_names = separate_video_tensors(inputs, vid_path)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            stroke_name = batch_stroke_names[enc_idx]
            # A new video name closes the trajectory collected so far.
            if prev_stroke != stroke_name:
                if stroke_traj:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                stroke_traj = []

            enc_output = enc_input.squeeze(axis=1).cpu().data.numpy()
            # one feature row per clip: [[clip1_feats], [clip2_feats], ...]
            stroke_traj.extend(enc_output)
            prev_stroke = stroke_name

        # Early exit for partial extraction (only when nstrokes is non-negative).
        if nstrokes > -1 and num_strokes == nstrokes:
            break

    # Flush the trajectory still being collected, but only for a full run.
    # prev_stroke is by construction the name that belongs to stroke_traj.
    if stroke_traj and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(prev_stroke)

    # group_strokewise not needed here as videos are trimmed. Was needed for strokes
    # to generate a lists of stroke feature lists for an untrimmed video.
    traj_dict = {vid: np.array(traj)
                 for vid, traj in zip(stroke_names, trajectories)}

    return traj_dict, stroke_names
def extract_I3D_feats(DATASET, LABELS, BATCH_SIZE, SEQ_SIZE=16, STEP=16,
                      foldno=1, train=True, nclasses=51, model_path=None,
                      nstrokes=-1):
    '''
    Extract per-clip I3D features and group them per video.

    Clips are read in order (shuffle=False) from an hmdb.HMDB51 dataset and
    passed through InceptionI3d.extract_features on the GPU. Consecutive clips
    that carry the same video name are collected into one trajectory; a change
    of name closes the current trajectory.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing the labels / split files given to hmdb.HMDB51
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    foldno : int
        fold passed to hmdb.HMDB51
    train : bool
        train flag passed to hmdb.HMDB51
    nclasses : int
        no. of output classes used to build InceptionI3d; must match the
        checkpoint in model_path when one is given
    model_path : str or None
        state-dict file loaded into the network; None keeps the freshly
        constructed weights
    nstrokes : int
        partial extraction of features: stop once this many trajectories have
        been completed. A negative value (default) processes the whole dataset.

    Returns:
    --------
    traj_dict : dict
        maps video name -> np.array of that video's clip features
    stroke_names : list
        video names in the order their trajectories were completed
    '''
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"

    # Only a center crop is applied; clips stay as raw (channels-last) crops.
    clip_transform = transforms.Compose([
        T.CenterCrop(224),
    ])
    part_dataset = hmdb.HMDB51(DATASET, LABELS, SEQ_SIZE,
                               step_between_clips=STEP, fold=foldno,
                               train=train, transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE,
                             shuffle=False)

    # Honour the nclasses argument (previously hard-coded to 51, its default).
    i3d = InceptionI3d(nclasses, in_channels=3)
    if model_path is not None:
        i3d.load_state_dict(torch.load(model_path))
    i3d = i3d.cuda()
    # Inference mode: without this BatchNorm would normalise with per-batch
    # statistics (and update its running stats), making the extracted
    # features depend on batch composition.
    i3d.eval()

    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None
    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        len(data_loader), BATCH_SIZE))

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        for bno, (inputs, vid_path, start_pts, end_pts, _) in enumerate(data_loader):
            print("Batch No : {}".format(bno))
            # Extract spatio-temporal features from clip using I3D (For SL >= 16)
            # Move the last axis to position 1 (raw crops are channels-last).
            inputs = inputs.permute(0, 4, 1, 2, 3).float().cuda()
            inputs = i3d.extract_features(inputs)  # returned (B, 1024, 1, 1, 1) tensor

            # split the batch of features into per-video tensors and their names
            inputs_lst, batch_stroke_names = separate_video_tensors(
                inputs.cpu(), vid_path)

            if bno == 0:
                prev_stroke = batch_stroke_names[0]

            for enc_idx, enc_input in enumerate(inputs_lst):
                stroke_name = batch_stroke_names[enc_idx]
                # A new video name closes the trajectory collected so far.
                if prev_stroke != stroke_name:
                    if stroke_traj:
                        num_strokes += 1
                        trajectories.append(stroke_traj)
                        stroke_names.append(prev_stroke)
                    stroke_traj = []

                # drop the three trailing singleton axes -> one row per clip
                enc_output = enc_input.squeeze(4).squeeze(3).squeeze(
                    2).cpu().data.numpy()
                stroke_traj.extend(enc_output)
                prev_stroke = stroke_name

            # Early exit for partial extraction (only when nstrokes is non-negative).
            if nstrokes > -1 and num_strokes == nstrokes:
                break

    # Flush the trajectory still being collected, but only for a full run.
    # prev_stroke is by construction the name that belongs to stroke_traj.
    if stroke_traj and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(prev_stroke)

    # group_strokewise not needed here as videos are trimmed. Was needed for strokes
    # to generate a lists of stroke feature lists for an untrimmed video.
    traj_dict = {vid: np.array(traj)
                 for vid, traj in zip(stroke_names, trajectories)}

    return traj_dict, stroke_names
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='', batch_size=32,
        save_model='i3dIter1k_'):
    '''
    Build an I3D model for HMDB51, load a saved checkpoint and run prediction.

    The training loop is currently disabled (kept below as a commented block),
    so this function only: builds the train / test HMDB51 loaders, constructs
    InceptionI3d from the ImageNet-pretrained weights, replaces the logits
    layer with a 51-way one, loads the checkpoint
    `<log_path>/<save_model><num_epochs zero-padded to 6>.pt` and calls
    predict() on the 'test' loader.

    Relies on the module-level names DATASET, LABELS, log_path and device.

    Parameters:
    -----------
    init_lr : float
        learning rate for the SGD optimizer
    max_steps : float
        not used by the current code; kept for interface compatibility
    mode : str
        'flow' builds a 2-channel network from the flow weights, anything
        else builds a 3-channel network from the RGB weights
    root : str
        not used by the current code; kept for interface compatibility
    batch_size : int
        batch size of both data loaders
    save_model : str
        filename prefix of the checkpoint inside log_path
    '''
    num_epochs = 30
    frames_per_clip = 16
    seed_everything()

    os.makedirs(log_path, exist_ok=True)

    # setup dataset
    train_transforms = transforms.Compose([
        T.RandomCrop(224),
        T.ToPILClip(),
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(),
    ])
    test_transforms = transforms.Compose([
        T.CenterCrop(224),
        T.ToPILClip(),
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(),
    ])

    dataset = HMDB51(DATASET, LABELS, frames_per_clip, step_between_clips=1,
                     fold=1, train=True, transform=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True)

    val_dataset = HMDB51(DATASET, LABELS, frames_per_clip, step_between_clips=1,
                         fold=1, train=False, transform=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}

    # setup the model
    # map_location='cpu': the network is still on the CPU at this point, and
    # this also lets CUDA-saved weights load on a CPU-only host.
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load(
            '/home/arpan/VisionWorkspace/pytorch-i3d/models/flow_imagenet.pt',
            map_location='cpu'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load(
            '/home/arpan/VisionWorkspace/pytorch-i3d/models/rgb_imagenet.pt',
            map_location='cpu'))
    i3d.replace_logits(51)
    i3d = i3d.to(device)

    # Optimizer and scheduler are only consumed by the disabled training loop.
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    # Decay LR by a factor of 0.1 every 10 epochs
    lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    start = time.time()

    # ------------------------------------------------------------------
    # Training loop (disabled). Re-enable to fine-tune before predicting.
    # ------------------------------------------------------------------
    # print("No. of Iters / Epoch ; {}".format(len(dataloaders['train'])))
    # for epoch in range(num_epochs):
    #     print('Epoch {}/{}'.format(epoch+1, num_epochs))
    #     print('-' * 10)
    #
    #     # Each epoch has a training and validation phase
    #     for phase in ['train', 'test']:
    #         if phase == 'train':
    #             i3d.train(True)
    #         else:
    #             i3d.train(False)  # Set model to evaluate mode
    #
    #         tot_loss = 0.0
    #         running_corrects = 0
    #         count = [0.] * 51
    #
    #         # Iterate over data.
    #         for bno, (inputs, vid_path, start_pts, end_pts, labels) in enumerate(dataloaders[phase]):
    #             inputs = inputs.permute(0, 2, 1, 3, 4).float()  # for PIL and ToTensor
    #             # inputs = inputs.permute(0, 4, 1, 2, 3).float()  # for Raw Crops
    #             inputs = inputs.to(device)
    #             labels = labels.to(device)
    #
    #             iter_counts = Counter(labels.tolist())
    #             for k, v in iter_counts.items():
    #                 count[k] += v
    #
    #             optimizer.zero_grad()
    #
    #             per_frame_logits = i3d(inputs)  # get B x N_CLASSES X 1
    #             per_frame_logits = per_frame_logits.squeeze(2)
    #             cls_loss = F.cross_entropy(per_frame_logits, labels)
    #             loss = cls_loss
    #             tot_loss += loss.item()
    #
    #             # backward + optimize only if in training phase
    #             if phase == 'train':
    #                 loss.backward()
    #                 optimizer.step()
    #
    #             running_corrects += torch.sum(torch.max(per_frame_logits, 1)[1] == labels.data)
    #             if bno == 1000:
    #                 break
    #
    #         if phase == 'train':
    #             lr_sched.step()
    #             print("Category Weights : {}".format(count))
    #         epoch_loss = tot_loss / (16*(bno+1))
    #         epoch_acc = running_corrects.double() / (16*(bno+1))
    #         print('{} Loss: {:.6f} Acc: {:.6f} LR: {}'.format(phase, epoch_loss, epoch_acc,
    #                                                           lr_sched.get_last_lr()[0]))
    #
    #     if (epoch+1) % 10 == 0:
    #         torch.save(i3d.state_dict(), os.path.join(log_path, save_model+str(epoch+1).zfill(6)+'.pt'))

    # Load the checkpoint written after the final epoch. map_location keeps
    # this working when the checkpoint was saved from a different device.
    checkpoint_path = os.path.join(
        log_path, save_model + str(num_epochs).zfill(6) + '.pt')
    i3d.load_state_dict(torch.load(checkpoint_path, map_location=device))

    end = time.time()
    # With training disabled this interval only covers the checkpoint load.
    print("Total Execution time for {} epoch : {}".format(num_epochs, (end-start)))

    ###########################################################################
    # Predictions
    predict(i3d, dataloaders, frames_per_clip, 'test')