def extract_trans_feats(model, DATASET, LABELS, CLASS_IDS, BATCH_SIZE,
                        SEQ_SIZE=16, STEP=16, partition='train', nstrokes=-1,
                        base_name=""):
    '''
    Extract sequence features from the Transformer model.

    Parameters:
    -----------
    model : tt.TransformerModel
        TransformerModel object
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for the next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23), ...
    partition : str
        'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)
    base_name : str
        path containing the pickled feature dumps

    Returns:
    --------
    features_dictionary, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################
    # NOTE: cluster_size is assumed to be defined at module scope.
    if partition == 'train':
        partition_lst = train_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_train.pkl")
    elif partition == 'val':
        partition_lst = val_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_val.pkl")
    elif partition == 'test':
        partition_lst = test_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_test.pkl")
    else:
        print("Partition should be : train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    part_dataset = StrokeFeatureSequenceDataset(ft_path, partition_lst, DATASET, LABELS,
                                                CLASS_IDS, frames_per_clip=SEQ_SIZE,
                                                extracted_frames_per_clip=2,
                                                step_between_clips=STEP, train=True)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    model.eval()
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    ###########################################################################
    for bno, (inputs, vid_path, stroke, labels) in enumerate(data_loader):
        # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
        inputs = inputs.float()
        # inp_emb = attn_utils.get_long_tensor(inputs)    # comment out for SA
        # inputs = inp_emb.t().contiguous().to(device)    # comment out for SA
        inputs = inputs.permute(1, 0, 2).contiguous().to(device)
        # forward (no gradient tracking during feature extraction)
        with torch.set_grad_enabled(False):
            outputs = model.get_vec(inputs)     # output size (BATCH, SEQ_SIZE, NCLUSTERS)
            outputs = outputs.transpose(0, 1).contiguous()

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        # outputs are the reconstructed features. Use compressed enc_out values (maybe wtd.).
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(outputs,
                                                                               vid_path, stroke)

        # segregate the extracted feature sequences of the batch, stroke-wise
        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            # enc_output = model.encoder(enc_input.to(device))
            # enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            enc_output = enc_input.cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, j, :] for i in range(enc_output.shape[0])
                                for j in range(enc_output.shape[1])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    # convert to a dictionary of features keyed by stroke names (with ext.)
    features = {}
    for i, t in enumerate(trajectories):
        features[stroke_names[i]] = np.array(t)

    # trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    return features, stroke_names
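###############################################################################
# Usage sketch (illustrative only): the trained `model`, the path constants and
# the batch/sequence sizes below are hypothetical placeholders, not values
# defined in this module.
#
#   model = tt.TransformerModel(...)                 # trained Transformer
#   model = model.to(device)
#   feats, names = extract_trans_feats(model, DATASET, LABELS, CLASS_IDS,
#                                      BATCH_SIZE=32, SEQ_SIZE=16, STEP=16,
#                                      partition='val', nstrokes=-1,
#                                      base_name="logs/transformer")
#   # feats : {stroke_name : np.ndarray of per-timestep feature vectors}
###############################################################################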
def extract_attn_feats(model, DATASET, LABELS, CLASS_IDS, BATCH_SIZE,
                       SEQ_SIZE=16, STEP=16, partition='train', nstrokes=-1,
                       base_name=""):
    '''
    Extract sequence features from the attention-based autoencoder.

    Parameters:
    -----------
    model : nn.Module
        trained attention-based autoencoder; its forward pass returns
        (recon_x, mu, logvar)
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for the next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23), ...
    partition : str
        'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)
    base_name : str
        path containing the pickled feature dumps

    Returns:
    --------
    features_dictionary, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################
    if partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
    else:
        print("Partition should be : train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with the framewiseTransform flag turned off.
    clip_transform = transforms.Compose([videotransforms.CenterCrop(224),
                                         videotransforms.ToPILClip(),
                                         videotransforms.Resize((112, 112)),
                                         # videotransforms.RandomCrop(112),
                                         videotransforms.ToTensor(),
                                         # videotransforms.Normalize(),
                                         # videotransforms.RandomHorizontalFlip(),
                                        ])
    part_dataset = CricketStrokesDataset(partition_lst, DATASET, LABELS, CLASS_IDS,
                                         frames_per_clip=SEQ_SIZE,
                                         step_between_clips=STEP, train=False,
                                         framewiseTransform=False,
                                         transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    model.eval()
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    ###########################################################################
    for bno, (inputs, vid_path, stroke, labels) in enumerate(data_loader):
        # inputs of shape (B, SEQ_LEN, C, H, W) -> (B, C, SEQ_LEN, H, W)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = inputs.to(device)
        # print("Batch No : {} / {}".format(bno, len(data_loader)))
        # forward (no gradient tracking during feature extraction)
        with torch.set_grad_enabled(False):
            recon_x, mu, logvar = model(inputs)
            # dec_out_lst = []
            # dec_out_lst.append(out_mu)
            # outputs = torch.stack(dec_out_lst, dim=1)

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        # recon_x holds the reconstructed features; use the compressed mu values (maybe wtd.).
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(mu,
                                                                               vid_path, stroke)

        # segregate the extracted feature sequences of the batch, stroke-wise
        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            # enc_output = model.encoder(enc_input.to(device))
            # enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            enc_output = enc_input.cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    # convert to a dictionary of features keyed by stroke names (with ext.)
    features = {}
    for i, t in enumerate(trajectories):
        features[stroke_names[i]] = np.array(t)

    # trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    return features, stroke_names
def extract_sequence_feats(model_path, DATASET, LABELS, CLASS_IDS, BATCH_SIZE,
                           INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, SEQ_SIZE=16,
                           STEP=16, partition='all', nstrokes=-1):
    '''
    Extract sequence features from the GRU AutoEncoder.

    Parameters:
    -----------
    model_path : str
        relative path to the checkpoint file for the Autoencoder
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    INPUT_SIZE : int
        size of the extracted feature vector (output of ResNet); input size of the Autoencoder
    HIDDEN_SIZE : int
        hidden size of the Autoencoder; the no. of Autoencoder parameters depends on it
    NUM_LAYERS : int
        no. of GRU layers in the Autoencoder
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for the next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23), ...
    partition : str
        'all' / 'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)

    Returns:
    --------
    trajectories, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################
    if partition == 'all':
        partition_lst = train_lst
        partition_lst.extend(val_lst)
        partition_lst.extend(test_lst)
    elif partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
    else:
        print("Partition should be : all / train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    # Frame-wise transform
    clip_transform = transforms.Compose([transforms.ToPILImage(),
                                         transforms.Resize((224, 224)),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                              std=[0.229, 0.224, 0.225]),
                                        ])
    # Clip level transform. Use this with the framewiseTransform flag turned off.
#    clip_transform = transforms.Compose([videotransforms.ToPILClip(),
#                                         videotransforms.Resize((112, 112)),
#                                         # videotransforms.RandomCrop(112),
#                                         videotransforms.ToTensor(),
#                                         videotransforms.Normalize(),
#                                         # videotransforms.RandomHorizontalFlip(),
#                                        ])
    part_dataset = CricketStrokesDataset(partition_lst, DATASET, LABELS, CLASS_IDS,
                                         frames_per_clip=SEQ_SIZE,
                                         step_between_clips=STEP, train=True,
                                         framewiseTransform=True,
                                         transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Create a model and load the weights of the AutoEncoder
    model = autoenc.AutoEncoderRNN(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS)
    if model_path is not None:
        print("Loading model ...")
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)

    ###########################################################################
    # Validate / Evaluate
    model.eval()
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Img2Vec()
    # extractor = Clip2Vec()
    # INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    # assert BATCH_SIZE % SEQ_SIZE == 0, "BATCH_SIZE should be a multiple of SEQ_SIZE"
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        if isinstance(extractor, Img2Vec):
            # Extract spatial features using 2D ResNet
            inputs = torch.stack([extractor.get_vec(x) for x in inputs])
        else:
            # Extract spatio-temporal features from the clip using 3D ResNet (SEQ_LEN >= 16)
            inputs = inputs.permute(0, 2, 1, 3, 4).float()
            inputs = extractor.get_vec(inputs)

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs,
                                                                               vid_path, stroke)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            enc_output = model.encoder(enc_input.to(device))
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    # stroke_vecs, stroke_names = aggregate_outputs(sequence_outputs, seq_stroke_names)
    # stroke_vecs = [stroke.cpu().data.numpy() for stroke in stroke_vecs]

    # save to disk
#    np.save("trajectories.npy", trajectories)
#    with open('stroke_names_val.pkl', 'wb') as fp:
#        pickle.dump(stroke_names, fp)

    # read the files from disk
#    trajectories = np.load("trajectories.npy")
#    with open('stroke_names_val.pkl', 'rb') as fp:
#        stroke_names = pickle.load(fp)

    # print("#Parameters : {}".format(autoenc_utils.count_parameters(model)))
    return trajectories, stroke_names
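###############################################################################
# A minimal persistence sketch (an assumption, not part of the original
# pipeline): it mirrors the commented-out np.save / pickle snippets inside
# extract_sequence_feats() so that extraction need not be re-run. The helper
# name and default path are hypothetical.
def save_stroke_feats(trajectories, stroke_names, out_path="stroke_feats.pkl"):
    '''Dump extracted trajectories and their stroke names to a single pickle file.'''
    import pickle
    with open(out_path, 'wb') as fp:
        pickle.dump({"trajectories": trajectories, "stroke_names": stroke_names}, fp)
###############################################################################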
def extract_2DCNN_feats(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, partition='all',
                        nstrokes=-1):
    '''
    Extract frame-wise features using a pretrained 2D CNN (ResNet).

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    partition : str
        'all' / 'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)

    Returns:
    --------
    trajectories, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################
    if partition == 'all':
        partition_lst = train_lst
        partition_lst.extend(val_lst)
        partition_lst.extend(test_lst)
    elif partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
    else:
        print("Partition should be : all / train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    # Frame-wise transform
    clip_transform = transforms.Compose([transforms.ToPILImage(),
                                         transforms.Resize((224, 224)),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                              std=[0.229, 0.224, 0.225]),
                                        ])
    # For a frame level transform, keep the framewiseTransform flag turned on
    part_dataset = CricketStrokesDataset(partition_lst, DATASET, LABELS, CLASS_IDS,
                                         frames_per_clip=1, train=True,
                                         framewiseTransform=True,
                                         transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Extract using the data_loader
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Img2Vec()
    # INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatial features using 2D ResNet
        inputs = torch.stack([extractor.get_vec(x) for x in inputs])

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs,
                                                                               vid_path, stroke)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            # enc_input is the same as enc_output when only extracting features
            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    # stroke_vecs, stroke_names = aggregate_outputs(sequence_outputs, seq_stroke_names)

    return trajectories, stroke_names
def extract_3DCNN_feats(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, SEQ_SIZE=16, STEP=16,
                        model_path=None, nclasses=5, partition='all', nstrokes=-1):
    '''
    Extract clip-level features using a 3D CNN (3D ResNet).

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for the next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23), ...
    model_path : str or None
        optional checkpoint path for the 3D CNN, passed to Clip2Vec
    nclasses : int
        no. of classes of the 3D CNN head, passed to Clip2Vec
    partition : str
        'all' / 'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)

    Returns:
    --------
    trajectories, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################
    if partition == 'all':
        partition_lst = train_lst
        partition_lst.extend(val_lst)
        partition_lst.extend(test_lst)
    elif partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
    else:
        print("Partition should be : all / train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with the framewiseTransform flag turned off.
    clip_transform = transforms.Compose([videotransforms.CenterCrop(224),
                                         videotransforms.ToPILClip(),
                                         videotransforms.Resize((112, 112)),
                                         # videotransforms.RandomCrop(112),
                                         videotransforms.ToTensor(),
                                         videotransforms.Normalize(),
                                         # videotransforms.RandomHorizontalFlip(),
                                        ])
    part_dataset = CricketStrokesDataset(partition_lst, DATASET, LABELS, CLASS_IDS,
                                         frames_per_clip=SEQ_SIZE,
                                         step_between_clips=STEP, train=True,
                                         framewiseTransform=False,
                                         transform=clip_transform)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Clip2Vec(model_path, nclasses)
    # INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from the clip using 3D ResNet (SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs,
                                                                               vid_path, stroke)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            # enc_input is the same as enc_output when only extracting features
            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    # stroke_vecs, stroke_names = aggregate_outputs(sequence_outputs, seq_stroke_names)

    return trajectories, stroke_names
def finetune_3DCNN(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, ANNOTATION_FILE,
                   SEQ_SIZE=16, STEP=16, nstrokes=-1, N_EPOCHS=25, base_name=""):
    '''
    Finetune a 3D CNN (3D ResNet-18) classifier head on the stroke labels and
    extract clip-level features on the validation partition.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    ANNOTATION_FILE : str
        path to the annotation file used by get_cluster_labels to derive class labels
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for the next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23), ...
    nstrokes : int
        partial extraction of features (set to -1 to execute for the entire dataset)
    N_EPOCHS : int
        no. of finetuning epochs (also used in the checkpoint filename)
    base_name : str
        path containing the model checkpoints

    Returns:
    --------
    trajectories, stroke_names
    '''
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with the framewiseTransform flag turned off.
    clip_transform = transforms.Compose([videotransforms.CenterCrop(224),
                                         videotransforms.ToPILClip(),
                                         videotransforms.Resize((112, 112)),
                                         # videotransforms.RandomCrop(112),
                                         videotransforms.ToTensor(),
                                         videotransforms.Normalize(),
                                         # videotransforms.RandomHorizontalFlip(),
                                        ])
    train_dataset = CricketStrokesDataset(train_lst, DATASET, LABELS, CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP, train=True,
                                          framewiseTransform=False,
                                          transform=clip_transform)
    val_dataset = CricketStrokesDataset(val_lst, DATASET, LABELS, CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP, train=False,
                                        framewiseTransform=False,
                                        transform=clip_transform)

    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    labs_keys, labs_values = get_cluster_labels(ANNOTATION_FILE)
    num_classes = len(list(set(labs_values)))

    ###########################################################################
    # load model and set loss function
    model = torchvision.models.video.r3d_18(pretrained=True, progress=True)
    for ft in model.parameters():
        ft.requires_grad = False
    inp_feat_size = model.fc.in_features
    model.fc = nn.Linear(inp_feat_size, num_classes)
    model = model.to(device)

    # load checkpoint, if it exists
    if os.path.isfile(os.path.join(base_name, "3dresnet18_ep" + str(N_EPOCHS) + "_Adam.pt")):
        model.load_state_dict(torch.load(os.path.join(base_name,
                                                      "3dresnet18_ep" + str(N_EPOCHS) + "_Adam.pt")))

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Layers to finetune. The last layer should be displayed.
    params_to_update = model.parameters()
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)

    # Observe that only the selected parameters are being optimized
    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)

    # Decay LR by a factor of 0.1 every 8 epochs
    exp_lr_scheduler = StepLR(optimizer_ft, step_size=8, gamma=0.1)

#    # Observe that all parameters are being optimized
#    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

#    ###########################################################################
#    # Training the model
#    start = time.time()
#
#    model_ft = train_model(model, data_loaders, criterion, optimizer_ft,
#                           exp_lr_scheduler, labs_keys, labs_values,
#                           num_epochs=N_EPOCHS)
#
#    end = time.time()
#
#    # save the best performing model
#    save_model_checkpoint(base_name, model_ft, N_EPOCHS, "Adam")
#
#    print("Total Execution time for {} epoch : {}".format(N_EPOCHS, (end - start)))

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    model = model.eval()
    # NOTE: an extractor (e.g., Clip2Vec) must be instantiated here for the
    # extraction loop below; it is left commented out in the original code.
    # extractor = Clip2Vec()
    # INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loaders['test']), BATCH_SIZE))
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loaders['test']):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from the clip using 3D ResNet (SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)

        # convert the start and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs,
                                                                               vid_path, stroke)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1 (size 32 each) ...], [], ...], [[], ...]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    # stroke_vecs, stroke_names = aggregate_outputs(sequence_outputs, seq_stroke_names)

    return trajectories, stroke_names
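###############################################################################
# Usage sketch (illustrative only): the path constants below are hypothetical
# placeholders for the dataset / label / class-id / annotation locations
# expected by this module.
#
#   # Features from the pretrained 3D ResNet (optionally with a finetuned checkpoint):
#   trajectories, names = extract_3DCNN_feats(DATASET, LABELS, CLASS_IDS,
#                                             BATCH_SIZE=16, SEQ_SIZE=16, STEP=16,
#                                             partition='all', nstrokes=-1)
#
#   # Finetune the 3D ResNet-18 head on stroke labels and extract on the val split
#   # (note: the training loop inside finetune_3DCNN is currently commented out):
#   trajectories, names = finetune_3DCNN(DATASET, LABELS, CLASS_IDS, BATCH_SIZE=16,
#                                        ANNOTATION_FILE=ANNOTATION_FILE,
#                                        SEQ_SIZE=16, STEP=16, N_EPOCHS=25,
#                                        base_name="checkpoints")
###############################################################################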