def get(cls, args):
    """Build the train / val / val_video datasets described by ``args``.

    Reads ``train_file``, ``val_file``, ``data``, ``cache`` and ``input_size``
    from ``args``.  The training split gets a random crop of
    ``args.input_size`` plus a random horizontal flip; both validation splits
    use a fixed 256 centre crop.

    NOTE(review): the ``cls`` first argument suggests this is a
    ``@classmethod``; the decorator is not visible here -- confirm.

    Returns:
        tuple: ``(train_dataset, val_dataset, valvideo_dataset)``.
    """
    train_file = args.train_file
    val_file = args.val_file

    def _build(split, list_file, clip_ops):
        # Every split shares the same constructor arguments except the split
        # name, the list file and the clip-level transform pipeline.
        return cls(
            args,
            args.data,
            split,
            list_file,
            args.cache,
            transform=transforms.Compose(clip_ops),
            input_size=args.input_size,
        )

    train_dataset = _build(
        'train',
        train_file,
        [
            videotransforms.RandomCrop(args.input_size),
            videotransforms.RandomHorizontalFlip(),
        ],
    )
    val_dataset = _build('val', val_file, [videotransforms.CenterCrop(256)])
    valvideo_dataset = _build(
        'val_video', val_file, [videotransforms.CenterCrop(256)])
    return train_dataset, val_dataset, valvideo_dataset
def _seed_numpy_worker(worker_id, base_seed=12):
    """Seed NumPy inside a DataLoader worker; each worker gets its own seed."""
    np.random.seed(base_seed + worker_id)


def main(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, ANNOTATION_FILE, SEQ_SIZE=16,
         STEP=16, nstrokes=-1, N_EPOCHS=25, base_name=""):
    '''
    Train the Conv3D encoder/decoder stroke classifier, checkpoint it and
    write the predictions for the validation loader to ``pred_dict.pkl``.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    ANNOTATION_FILE : str
        passed to ``attn_utils.get_cluster_labels`` to obtain the label
        keys / values used for class weighting, training and prediction
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15),
        (8, 23) ...
    nstrokes : int
        currently unused by this function (kept for caller compatibility)
    N_EPOCHS : int
        number of training epochs; also used to name the checkpoint
    base_name : str
        output directory for the sample-weight cache, checkpoints and the
        prediction dictionary. An empty string means the current directory.

    Returns:
    --------
    encoder, decoder : the trained models, reloaded from the saved checkpoint
    '''
    ###########################################################################
    # seed everything
    attn_utils.seed_everything(1234)

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when one was actually requested; exist_ok avoids the isdir/makedirs race.
    if base_name:
        os.makedirs(base_name, exist_ok=True)

    # Divide the highlight dataset files into training, validation and test
    # sets (the test split is not used here).
    train_lst, val_lst, _ = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off.
    # Random horizontal flipping is intentionally not part of either pipeline.
    train_transform = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    test_transform = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    train_dataset = CricketStrokesDataset(
        train_lst, DATASET, LABELS, CLASS_IDS, frames_per_clip=SEQ_SIZE,
        step_between_clips=STEP, train=True, framewiseTransform=False,
        transform=train_transform)
    val_dataset = CricketStrokesDataset(
        val_lst, DATASET, LABELS, CLASS_IDS, frames_per_clip=SEQ_SIZE,
        step_between_clips=STEP, train=False, framewiseTransform=False,
        transform=test_transform)

    ###########################################################################
    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)
    num_classes = len(set(labs_values))

    # Weighted sampler for class imbalance. The per-sample weights are cached
    # on disk, keyed by the class count and the training-set size.
    weights_path = os.path.join(
        base_name,
        "weights_c" + str(num_classes) + "_" + str(len(train_dataset))
        + ".pkl")
    if not os.path.isfile(weights_path):
        samples_weight = attn_utils.get_sample_weights(
            train_dataset, labs_keys, labs_values, train_lst)
        with open(weights_path, "wb") as fp:
            pickle.dump(samples_weight, fp)
    # NOTE: pickle is only safe because this cache is produced locally by the
    # branch above; never point base_name at an untrusted directory.
    with open(weights_path, "rb") as fp:
        samples_weight = pickle.load(fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # The original code passed `worker_init_fn=np.random.seed(12)`, which
    # seeds NumPy immediately in this process and hands None to the loader.
    # Keep the main-process seeding explicit and give the loader a real
    # callable so worker processes (if num_workers > 0) are seeded as well.
    np.random.seed(12)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                              sampler=sampler,
                              worker_init_fn=_seed_numpy_worker)
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False)
    # The validation loader is exposed under the "test" key.
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    encoder = conv_attn_model.Conv3DEncoder(HIDDEN_SIZE, 1, bidirectional)
    decoder = conv_attn_model.Conv3DDecoder(HIDDEN_SIZE, num_classes, 1, 1,
                                            bidirectional)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Display the layers that will be trained.
    print("Params to learn:")
    for name, param in encoder.named_parameters():
        if param.requires_grad:
            print("Encoder : {}".format(name))
    for name, param in decoder.named_parameters():
        if param.requires_grad:
            print("Decoder : {}".format(name))

    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.01,
                                        momentum=0.9)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=0.01,
                                        momentum=0.9)

    # Decay LR by a factor of 0.1 every 10 epochs.
    # NOTE(review): only the encoder optimizer is attached to the scheduler;
    # confirm whether the decoder learning rate is meant to stay constant.
    lr_scheduler = StepLR(encoder_optimizer, step_size=10, gamma=0.1)

    ###########################################################################
    # Training the model
    start = time.time()
    encoder, decoder = train_model(
        encoder, decoder, data_loaders, criterion, encoder_optimizer,
        decoder_optimizer, lr_scheduler, labs_keys, labs_values,
        num_epochs=N_EPOCHS)
    end = time.time()

    # save the best performing model
    attn_utils.save_attn_model_checkpoint(base_name, (encoder, decoder),
                                          N_EPOCHS, "SGD")
    # Load model checkpoints
    encoder, decoder = attn_utils.load_attn_model_checkpoint(
        base_name, encoder, decoder, N_EPOCHS, "SGD")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    ###########################################################################
    print("Writing prediction dictionary....")
    # predict() also returns an accuracy value, which is not used here.
    pred_out_dict, _ = predict(encoder, decoder, data_loaders, criterion,
                               labs_keys, labs_values, phase='test')
    with open(os.path.join(base_name, "pred_dict.pkl"), "wb") as fp:
        pickle.dump(pred_out_dict, fp)

    print("#Parameters Encoder : {} ".format(
        autoenc_utils.count_parameters(encoder)))
    print("#Parameters Decoder : {} ".format(
        autoenc_utils.count_parameters(decoder)))
    return encoder, decoder
def _seed_numpy_worker(worker_id, base_seed=12):
    """Seed NumPy inside a DataLoader worker; each worker gets its own seed."""
    np.random.seed(base_seed + worker_id)


def main(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, ANNOTATION_FILE, SEQ_SIZE=16,
         STEP=16, nstrokes=-1, N_EPOCHS=25, base_name=""):
    '''
    Train the C3D + GRU stroke classifier (initialised from pretrained C3D
    weights), checkpoint it and run prediction on the validation loader.

    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    ANNOTATION_FILE : str
        passed to ``attn_utils.get_cluster_labels`` to obtain the label
        keys / values used for class weighting, training and prediction
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15),
        (8, 23) ...
    nstrokes : int
        currently unused by this function (kept for caller compatibility)
    N_EPOCHS : int
        number of training epochs; also used to name the checkpoint
    base_name : str
        output directory for the sample-weight cache and checkpoints. An
        empty string means the current directory.

    Returns:
    --------
    model : the trained model, reloaded from the saved checkpoint
    '''
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when one was actually requested; exist_ok avoids the isdir/makedirs race.
    if base_name:
        os.makedirs(base_name, exist_ok=True)

    attn_utils.seed_everything(1234)

    ###########################################################################
    # Divide the highlight dataset files into training, validation and test
    # sets (the test split is not used here).
    train_lst, val_lst, _ = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off.
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
    ])
    train_dataset = CricketStrokesDataset(
        train_lst, DATASET, LABELS, CLASS_IDS, frames_per_clip=SEQ_SIZE,
        step_between_clips=STEP, train=True, framewiseTransform=False,
        transform=train_transforms)
    val_dataset = CricketStrokesDataset(
        val_lst, DATASET, LABELS, CLASS_IDS, frames_per_clip=SEQ_SIZE,
        step_between_clips=STEP, train=False, framewiseTransform=False,
        transform=test_transforms)

    ###########################################################################
    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)
    num_classes = len(set(labs_values))

    # Weighted sampler for class imbalance. The per-sample weights are cached
    # on disk, keyed by the class count and the training-set size.
    weights_path = os.path.join(
        base_name,
        "weights_c" + str(num_classes) + "_" + str(len(train_dataset))
        + ".pkl")
    if not os.path.isfile(weights_path):
        samples_weight = attn_utils.get_sample_weights(
            train_dataset, labs_keys, labs_values, train_lst)
        with open(weights_path, "wb") as fp:
            pickle.dump(samples_weight, fp)
    # NOTE: pickle is only safe because this cache is produced locally by the
    # branch above; never point base_name at an untrusted directory.
    with open(weights_path, "rb") as fp:
        samples_weight = pickle.load(fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # The original code passed `worker_init_fn=np.random.seed(12)`, which
    # seeds NumPy immediately in this process and hands None to the loader.
    # Keep the main-process seeding explicit and give the loader a real
    # callable so worker processes (if num_workers > 0) are seeded as well.
    np.random.seed(12)
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                              sampler=sampler,
                              worker_init_fn=_seed_numpy_worker)
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False)
    # The validation loader is exposed under the "test" key.
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    model = conv_attn_model.C3DGRUv2Orig(HIDDEN_SIZE, 1, num_classes,
                                         bidirectional)
    # Initialise from pretrained C3D weights before moving to the device.
    model_pretrained = c3d.C3D()
    model_pretrained.load_state_dict(
        torch.load("../localization_rnn/" + wts_path))
    copy_pretrained_weights(model_pretrained, model)
    model = model.to(device)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Collect (and display) every parameter that will be optimised.
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t {}".format(name))

    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    # Decay LR by a factor of 0.1 every 30 epochs.
    lr_scheduler = StepLR(optimizer_ft, step_size=30, gamma=0.1)

    ###########################################################################
    # Training the model
    start = time.time()
    model = train_model(model, data_loaders, criterion, optimizer_ft,
                        lr_scheduler, labs_keys, labs_values,
                        num_epochs=N_EPOCHS)
    end = time.time()

    # save the best performing model
    attn_utils.save_model_checkpoint(base_name, model, N_EPOCHS,
                                     "SGD_c8_c3dgruEp60Step30")
    # Load model checkpoints
    model = attn_utils.load_weights(base_name, model, N_EPOCHS,
                                    "SGD_c8_c3dgruEp60Step30")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    ###########################################################################
    print("Predicting ...")
    # The value returned by predict() is not used here.
    predict(model, data_loaders, labs_keys, labs_values, phase='test')

    print("#Parameters : {} ".format(autoenc_utils.count_parameters(model)))
    return model
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='', batch_size=32,
        save_model='i3dIter1k_'):
    """Load a fine-tuned I3D checkpoint and run prediction on HMDB51.

    The function builds the HMDB51 train/test loaders, constructs an
    ``InceptionI3d`` initialised from the ImageNet weights for ``mode``
    (``'flow'`` -> 2 input channels, anything else -> 3), replaces the logits
    with a 51-way layer, then loads the checkpoint
    ``<log_path>/<save_model><num_epochs zero-padded to 6>.pt`` and calls
    ``predict`` on the ``'test'`` loader.

    The training loop is currently disabled (kept below as a commented
    reference), so the reported "execution time" only covers loading the
    checkpoint.  ``max_steps`` and ``root`` are not used by the active code.
    """
    num_epochs = 30
    seed_everything()

    if not os.path.isdir(log_path):
        os.makedirs(log_path)

    # setup dataset -- both pipelines differ only in the cropping op
    def _clip_pipeline(crop_op):
        return transforms.Compose([
            crop_op,
            T.ToPILClip(),
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(),
        ])

    train_transforms = _clip_pipeline(T.RandomCrop(224))
    test_transforms = _clip_pipeline(T.CenterCrop(224))

    dataset = HMDB51(DATASET, LABELS, 16, step_between_clips=1, fold=1,
                     train=True, transform=train_transforms)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True)

    val_dataset = HMDB51(DATASET, LABELS, 16, step_between_clips=1, fold=1,
                         train=False, transform=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}

    # setup the model: pick input channels and pretrained weights by modality
    if mode == 'flow':
        in_channels = 2
        pretrained_path = '/home/arpan/VisionWorkspace/pytorch-i3d/models/flow_imagenet.pt'
    else:
        in_channels = 3
        pretrained_path = '/home/arpan/VisionWorkspace/pytorch-i3d/models/rgb_imagenet.pt'
    i3d = InceptionI3d(400, in_channels=in_channels)
    i3d.load_state_dict(torch.load(pretrained_path))
    i3d.replace_logits(51)
    i3d = i3d.to(device)

    # Optimiser / scheduler are only consumed by the disabled training loop.
    optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9,
                          weight_decay=0.0000001)
    # Decay LR by a factor of 0.1 every 10 epochs
    lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    start = time.time()

    # ------------------------------------------------------------------
    # Disabled training loop (condensed reference; indentation was
    # reconstructed -- NOTE(review): verify before re-enabling, it also
    # relies on `Counter` and `F` being imported):
    #
    # for epoch in range(num_epochs):
    #     print('Epoch {}/{}'.format(epoch+1, num_epochs))
    #     print('-' * 10)
    #     for phase in ['train', 'test']:
    #         i3d.train(phase == 'train')
    #         tot_loss = 0.0
    #         running_corrects = 0
    #         count = [0.] * 51
    #         for bno, (inputs, vid_path, start_pts, end_pts, labels) in \
    #                 enumerate(dataloaders[phase]):
    #             inputs = inputs.permute(0, 2, 1, 3, 4).float()  # PIL + ToTensor
    #             inputs = inputs.to(device)
    #             labels = labels.to(device)
    #             for k, v in Counter(labels.tolist()).items():
    #                 count[k] += v
    #             optimizer.zero_grad()
    #             per_frame_logits = i3d(inputs).squeeze(2)
    #             loss = F.cross_entropy(per_frame_logits, labels)
    #             tot_loss += loss.item()
    #             if phase == 'train':
    #                 loss.backward()
    #                 optimizer.step()
    #             running_corrects += torch.sum(
    #                 torch.max(per_frame_logits, 1)[1] == labels.data)
    #             if bno == 1000:
    #                 break
    #         if phase == 'train':
    #             lr_sched.step()
    #         print("Category Weights : {}".format(count))
    #         epoch_loss = tot_loss / (16*(bno+1))
    #         epoch_acc = running_corrects.double() / (16*(bno+1))
    #         print('{} Loss: {:.6f} Acc: {:.6f} LR: {}'.format(
    #             phase, epoch_loss, epoch_acc, lr_sched.get_last_lr()[0]))
    #     if (epoch+1) % 10 == 0:
    #         torch.save(i3d.state_dict(), os.path.join(
    #             log_path, save_model+str(epoch+1).zfill(6)+'.pt'))
    # ------------------------------------------------------------------

    # Restore the weights saved at the final epoch.
    checkpoint_file = save_model + str(num_epochs).zfill(6) + '.pt'
    i3d.load_state_dict(torch.load(os.path.join(log_path, checkpoint_file)))

    end = time.time()
    print("Total Execution time for {} epoch : {}".format(
        num_epochs, (end-start)))

    ###########################################################################
    # Predictions
    predict(i3d, dataloaders, 16, 'test')