def __init__(self, root, mode, test_dir, train_dir, save_model_withname=None,
             save_error_withname=None, checkpoint=None):
    """Store run configuration, create the summary writer, build the
    train/validation dataloaders and load the model (optionally from a
    checkpoint).
    """
    # run configuration
    self.root = root
    self.mode = mode
    self.test_dir = test_dir
    self.train_dir = train_dir
    self.save_model_withname = save_model_withname
    self.save_error_withname = save_error_withname
    self.checkpoint = checkpoint

    # fixed hyper-parameters
    self.batch_size = 50
    self.learning_rate = 0.0001
    self.validation_loop = 0

    # one SummaryWriter, tagged by run mode
    run_tag = "train" if self.mode == 'train' else "test"
    self.writer = tensorboardX.SummaryWriter(comment=run_tag)

    # setup dataset: augmented crops for training, center crop for eval
    self.train_transforms = transforms.Compose([
        videotransforms.RandomCrop(112),
        videotransforms.RandomHorizontalFlip(),
    ])
    self.test_transforms = transforms.Compose([videotransforms.CenterCrop(112)])

    self.dataset = VisualTactile(self.root, self.train_dir, self.train_transforms)
    self.dataloader = torch.utils.data.DataLoader(
        self.dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=1,
        pin_memory=True)

    self.val_dataset = VisualTactile(self.root, self.test_dir, self.test_transforms)
    self.val_dataloader = torch.utils.data.DataLoader(
        self.val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True)

    # model / optimizer / scheduler come from the class's own loader
    self.model, self.optimizer, self.scheduler = self.load_model(self.checkpoint)
def extract_feature(args):
    """Extract layer-3 activations of an inflated ResNet (I3ResNet) for every
    clip in the dataset and save one ``.npy`` file per video.

    Uses a forward hook on ``layer3`` to capture the intermediate feature map.

    Raises:
        ValueError: if ``args.resnet_nb`` is not one of 50, 101, 152.
    """
    transform = transforms.Compose([videotransforms.RandomCrop(224)])
    dataset = Dataset(args.train_split, 'val', args.root, args.frame_nb,
                      args.interval, transform)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=24,  # 24 on jobs
        pin_memory=True)

    # backbone selection
    if args.resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
        print('load resnet50 pretrained model...')
    elif args.resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
        print('load resnet101 pretrained model...')
    elif args.resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
        print('load resnet152 pretrained model...')
    else:
        # BUG FIX: `.format()` was previously called on the ValueError
        # instance (ValueError(...).format(...)), which raised AttributeError
        # instead of the intended ValueError; format the message string.
        raise ValueError(
            'resnet_nb should be in [50|101|152] but got {}'.format(
                args.resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet), args.frame_nb, args.class_nb,
                        side_task=False, conv_class=True)
    # print(i3resnet.layer3[0].downsample[1])

    # strip the 'module.' prefix that nn.DataParallel adds to saved keys
    state_dict = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k[7:]] = v  # remove 'module.'
    i3resnet.load_state_dict(new_state_dict)
    print('loaded saved state_dict...')

    i3resnet.eval()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)

    with torch.no_grad():
        hook = Hook(i3resnet.layer3)  # captures layer-3 output on forward
        print('registered Hook...')
        for i, data in enumerate(dataloader):
            vid, img_cpu, action_cpu, reson_cpu = data
            img = Variable(img_cpu.to(device))
            action = Variable(action_cpu.to(device))
            pred = i3resnet(img)  # forward pass populates hook.output
            feature = hook.output.cpu().data.numpy()
            feature = np.squeeze(feature)
            # one feature file per video id
            np.save((args.save_path + '{}.npy'.format(vid[0])), feature)
            print('Saved feature numbers:', i + 1)
    print('finished extracting features')
def load_data(dataset_path, batch_size=5, num_workers=10):
    """Build Vidor train/validation datasets and dataloaders.

    Returns:
        (datasets, dataloaders, weights) where ``datasets`` and
        ``dataloaders`` are dicts keyed by 'train'/'val', and ``weights``
        is ``1 - class_weights`` as a float32 array.
    """
    # augmentation for training, deterministic center crop for validation
    aug_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    center_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    train_set = VidorDataset(dataset_path, 'training', aug_transforms)
    cls_weights = train_set.get_weights()
    val_set = VidorDataset(dataset_path, 'validation', center_transforms)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             pin_memory=True)

    return ({'train': train_set, 'val': val_set},
            {'train': train_loader, 'val': val_loader},
            np.asarray(1 - cls_weights, dtype=np.float32))
def train(num_epoch=100, root='/home/selfdriving/mrcnn/bdd12k/',
          train_split='/home/selfdriving/I3D/data/bdd12k.json',
          batch_size=4, save_model='models/',
          frame_nb=64, class_nb=7, resnet_nb=50):
    """Train an inflated ResNet (I3ResNet) on BDD multi-label clips.

    Checkpoints are written to ``save_model`` every 5 epochs and at the end.
    NOTE(review): the body reads ``args.resnet_nb`` / ``args.frame_nb`` /
    ``args.class_nb`` / ``args.val`` from the module-level ``args``, shadowing
    the keyword parameters of the same names; kept for backward compatibility.

    Raises:
        ValueError: if ``args.resnet_nb`` is not one of 50, 101, 152.
    """
    # setup dataset
    transform = transforms.Compose([videotransforms.RandomCrop(224)])
    dataset = Dataset(train_split, 'train', root, transform)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=16,
                                             pin_memory=True)

    if args.val:
        # BUG FIX: the `transforms` *module* was passed as the transform
        # argument; pass the composed `transform` object instead.
        val_dataset = Dataset(train_split, 'val', root, transform)
        val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=16,
                                                     pin_memory=True)

    # setup the model
    if args.resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
    elif args.resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
    elif args.resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
    else:
        # BUG FIX: `.format()` belongs on the message string, not on the
        # ValueError instance (the old form raised AttributeError).
        raise ValueError('resnet_nb should be in [50|101|152] but got {}'
                         .format(args.resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet), args.frame_nb, args.class_nb,
                        conv_class=True)

    # set CPU/GPU devices
    i3resnet.train()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)
    i3resnet = nn.DataParallel(i3resnet)  # multiple GPUs

    # class-weighted multi-label loss (7 classes)
    class_weights = [0.4, 2, 2, 2, 2, 2, 1]
    w = torch.FloatTensor(class_weights).cuda()
    criterion = nn.BCEWithLogitsLoss(pos_weight=w).cuda()
    optimizer = optim.Adam(i3resnet.parameters(), lr=0.0001, weight_decay=0.001)

    # train it
    for epoch in range(0, num_epoch):
        print('Epoch {}/{}'.format(epoch, num_epoch))
        print('-' * 10)
        lossArr = []
        AccuracyArr = []

        # Iterate over data.
        for i, data in enumerate(dataloader):
            tic = time.time()
            inputs, labels = data
            inputs = Variable(inputs.to(device))  # 4x3x64x224x224
            labels = Variable(labels.to(device))  # 4x7

            optimizer.zero_grad()
            pred = i3resnet(inputs)  # 4x7
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()

            loss_cpu = np.array(loss.cpu().data.item())
            lossArr.append(loss_cpu)
            meanLoss = np.mean(np.array(lossArr))

            # multi-label accuracy: threshold sigmoid at 0.5, per-sample F1
            predict = torch.sigmoid(pred) >= 0.5
            f1 = f1_score(labels.cpu().data.numpy(),
                          predict.cpu().data.numpy(),
                          average='samples')
            AccuracyArr.append(f1)

            if i % 10 == 0:
                toc = time.time()
                print('time elapsed', toc - tic)
                print('prediction logits:{}'.format(predict))
                print('ground truth:{}'.format(labels.cpu().data.numpy()))
                print('Epoch %d Iteration %d: Loss %.5f Accumulated Loss %.5f' % (
                    epoch, i, lossArr[-1], meanLoss))
                print('Epoch %d Iteration %d: F1 %.5f Accumulated F1 %.5f' % (
                    epoch, i, AccuracyArr[-1], np.mean(np.array(AccuracyArr))))

        # periodic checkpoint every 5 epochs
        if (epoch + 1) % 5 == 0:
            torch.save(i3resnet.state_dict(),
                       (save_model + 'net_%d.pth' % (epoch + 1)))

        # validate after every epoch when requested
        if args.val and (epoch + 1) % 1 == 0:
            print("Validation...")
            run_test(val_dataloader, i3resnet, device)

    torch.save(i3resnet.state_dict(), (save_model + 'net_Final.pth'))
PIN_MEMORY = True

# Echo the run configuration (LR, BATCH_SIZE, CLIP_SIZE, EPOCHS, SAVE_DIR are
# globals defined earlier in this script — not visible in this chunk).
print('LR =', LR)
print('BATCH_SIZE =', BATCH_SIZE)
print('CLIP_SIZE =', CLIP_SIZE)
print('EPOCHS =', EPOCHS)
print('SAVE_DIR =', SAVE_DIR)

# Book-keeping: ensure the checkpoint directory exists and record the
# hyper-parameters next to the saved models.
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)
with open(SAVE_DIR + 'info.txt', 'w+') as f:
    f.write('LR = {}\nBATCH_SIZE = {}\nEPOCHS = {}\n'.format(LR, BATCH_SIZE,
                                                             EPOCHS))

# Transforms: random crop + horizontal flip for training, center crop for eval
train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                       videotransforms.RandomHorizontalFlip(),
                                       ])
test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# Datasets and Dataloaders
train_dataset = Dataset(train_split, 'training', root, mode, train_transforms)
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=36,
                                               pin_memory=True)

val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=36,
                                             pin_memory=True)

dataloaders = {'train': train_dataloader, 'val': val_dataloader}

# Load pre-trained I3D model
i3d = InceptionI3d(400, in_channels=3)  # pre-trained model has 400 output classes
def run(init_lr=0.1, max_steps=1e8, mode='rgb', dataset='thumos',
        root_train='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/',
        root_eval='/mnt/data_a/alex/PyTorch_I3D/thumos/test/',
        train_split='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/validation_thumos.json',
        eval_split='/mnt/data_a/alex/PyTorch_I3D/thumos/test/test_thumos.json',
        batch_size=4, batch_size_eval=4, save_model='', snippets=64,
        saving_steps=5000, num_steps_per_update=1, num_classes=65, crf=False,
        num_updates_crf=1, reg_crf=-1, use_cls=False, pairwise_cond_crf=False,
        reg_type='l2'):
    """Train/evaluate an I3D (optionally with a CRF head) for temporal action
    localization, resuming from the latest checkpoint when one exists.

    Alternates a training and a validation phase each epoch; gradients are
    accumulated over ``num_steps_per_update`` batches per optimizer step.
    Metrics are printed and sent to tensorboard_logger via ``log_value``.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    # NOTE(review): this assignment shadows the `dataset` *parameter*
    # ('thumos'/...), so the `if dataset == 'thumos'` check further down
    # compares a Dataset instance to a string and always takes the else
    # branch — confirm intended epoch arithmetic.
    dataset = Dataset(train_split, 'training', root_train, mode, snippets,
                      train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True,
                                             drop_last=True)

    val_dataset = Dataset(eval_split, 'testing', root_eval, mode, snippets,
                          test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size_eval,
                                                 shuffle=True,
                                                 num_workers=8,
                                                 pin_memory=True,
                                                 drop_last=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup model
    steps = 0
    epoch = 0
    if not os.path.exists(args.save_model):
        subprocess.call('mkdir ' + args.save_model, shell=True)
    configure(args.save_model + "tensorboard_logger", flush_secs=5)

    # resume the training or load the pre-trained I3D
    checkpoint = -1
    try:
        checkpoint = last_checkpoint(args.save_model)
    except:
        # no checkpoint found -> start from the ImageNet/Kinetics weights
        print("Loading the pre-trained I3D")
        if mode == 'flow':
            i3d = InceptionI3d(400, in_channels=2, use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            # merge the partial pre-trained weights into the full state dict
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/flow_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)
        else:
            i3d = InceptionI3d(400, in_channels=3, use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/rgb_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)
        i3d.replace_logits(num_classes)

    if (checkpoint != -1):
        # rebuild the model with the task's class count and load the snapshot
        if mode == 'flow':
            i3d = InceptionI3d(num_classes, in_channels=2, use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
        else:
            i3d = InceptionI3d(num_classes, in_channels=3, use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
        i3d.load_state_dict(torch.load(args.save_model + checkpoint))
        # checkpoint filenames are '<steps>.pt' -> recover the step counter
        steps = int(checkpoint[:-3])
        # recover the epoch counter from examples seen (dataset-size constants)
        if dataset == 'thumos':
            epoch = int(steps * snippets * batch_size * num_steps_per_update
                        / 1214016)
        else:
            epoch = int(steps * snippets * batch_size * num_steps_per_update
                        / 5482688)

    # push the pipeline on multiple gpus if possible
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    # setup optimizer
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                              milestones=[1000], gamma=0.1)
    # fast-forward the scheduler to the resumed step count
    if steps > 0:
        for i in range(steps):
            lr_sched.step()

    # train the model
    while steps < max_steps:
        epoch += 1
        print('-' * 10)
        print('Epoch {}'.format(epoch))
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print('Entering training loop...')
                i3d.train()
            else:
                print('Entering validation loop...')
                i3d.eval()
                time_init_eval = time.time()

            # running accumulators for predictions/labels and the losses
            cumul_pred = Cumulator(num_classes)
            cumul_labels = Cumulator(num_classes)
            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            tot_loss_updt = 0.0
            tot_loc_loss_updt = 0.0
            tot_cls_loss_updt = 0.0
            tot_reg_loss_updt = 0.0
            num_iter = 0
            optimizer.zero_grad()
            count_batch = 0
            gap_train = 0
            print("Losses initialized to 0")

            # Iterate over data.
            for data in dataloaders[phase]:
                time_init_batch = time.time()
                count_batch += 1
                num_iter += 1

                # get the inputs
                inputs, labels = data

                # wrap them in Variable (legacy pre-0.4 PyTorch style)
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)  # temporal length of the clip
                labels = Variable(labels.cuda())

                # forward — the CRF head also returns the pre-CRF logits
                if crf:
                    per_frame_logits_ante_crf, per_frame_logits = i3d(inputs)
                else:
                    per_frame_logits = i3d(inputs)

                # upsample to input size (per-frame predictions)
                per_frame_logits = F.upsample(per_frame_logits, t,
                                              mode='linear')
                if crf:
                    per_frame_logits_ante_crf = F.upsample(
                        per_frame_logits_ante_crf, t, mode='linear')

                # accumulate predictions and ground truths
                pred_np = pt_var_to_numpy(nn.Sigmoid()(per_frame_logits))
                cumul_pred.append(pred_np)
                labels_np = pt_var_to_numpy(labels)
                cumul_labels.append(labels_np)

                # compute localization loss (both heads when CRF is on)
                if crf:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits,
                        labels) + F.binary_cross_entropy_with_logits(
                            per_frame_logits_ante_crf, labels)
                else:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits, labels)
                # NOTE(review): `.data[0]` is pre-0.4 PyTorch; on modern
                # versions this raises on 0-dim tensors (`.item()` replaces it)
                tot_loc_loss += loc_loss.data[0]
                tot_loc_loss_updt += loc_loss.data[0]

                # compute classification loss (with max-pooling along time B x C x T)
                if crf:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(
                            labels,
                            dim=2)[0]) + F.binary_cross_entropy_with_logits(
                                torch.max(per_frame_logits_ante_crf, dim=2)[0],
                                torch.max(labels, dim=2)[0])
                else:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.data[0]
                tot_cls_loss_updt += cls_loss.data[0]

                # compute regularization loss for the crf module
                if crf and (reg_crf > 0 and not pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'crf', reg_type)
                    tot_reg_loss_updt += reg_loss.data[0]
                elif crf and (reg_crf > 0 and pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'psi_0',
                                            reg_type) + get_reg_loss(
                                                i3d, 'psi_1', reg_type)
                    tot_reg_loss_updt += reg_crf * reg_loss.data[0]
                else:
                    reg_loss = 0

                # put all the losses together (scaled for gradient accumulation)
                if use_cls:
                    loss = (0.5 * loc_loss + 0.5 * cls_loss +
                            reg_crf * reg_loss) / num_steps_per_update
                else:
                    loss = (loc_loss + reg_crf * reg_loss) / num_steps_per_update
                tot_loss += loss.data[0]
                tot_loss_updt += loss.data[0]
                loss.backward()

                # optimizer step once enough gradients are accumulated
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    examples_processed_updt = num_steps_per_update * batch_size * snippets
                    examples_processed_tot = count_batch * batch_size * snippets
                    # [1:] skips the Cumulator's initial placeholder row
                    map_train = map_calculator(cumul_pred.accumuled[1:],
                                               cumul_labels.accumuled[1:])
                    gap_train = ap_calculator(
                        cumul_pred.accumuled[1:].flatten(),
                        cumul_labels.accumuled[1:].flatten())
                    print(
                        'TRAINING - Epoch: {} Step: {} Examples processed {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} Reg Loss: {:.6f} mAP: {:.6f} GAP: {:.6f}'
                        .format(
                            epoch, steps, examples_processed_tot,
                            tot_loc_loss_updt / examples_processed_updt,
                            tot_cls_loss_updt / examples_processed_updt,
                            tot_loss_updt / (batch_size * snippets),
                            reg_crf * tot_reg_loss_updt / examples_processed_updt,
                            map_train, gap_train))
                    log_value('Training_loc_loss',
                              tot_loc_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_cls_loss',
                              tot_cls_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_reg_loss',
                              tot_reg_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_tot_loss',
                              tot_loss_updt / (batch_size * snippets), steps)
                    log_value('Training_mAP', map_train, steps)
                    log_value('Training_GAP', gap_train, steps)
                    # reset the per-update accumulators and prediction buffers
                    tot_loss_updt, tot_loc_loss_updt, tot_cls_loss_updt, tot_reg_loss_updt = 0.0, 0.0, 0.0, 0.0
                    cumul_pred.clear()
                    cumul_labels.clear()
                    cumul_pred = Cumulator(num_classes)
                    cumul_labels = Cumulator(num_classes)

                if ((steps % saving_steps) == 0) & (phase == 'train') & (num_iter == 0):
                    # save model
                    print("EPOCH: {} Step: {} - Saving model...".format(
                        epoch, steps))
                    torch.save(i3d.module.state_dict(),
                               save_model + str(steps).zfill(6) + '.pt')
                    tot_loss = tot_loc_loss = tot_cls_loss = 0.

                if phase == 'val':
                    # per-batch eval timing
                    time_end_batch = time.time()
                    examples_processed_tot = count_batch * batch_size_eval * snippets
                    print(
                        'EVAL - Epoch: {} Step: {} Examples processed {} - Time for batch: {}'
                        .format(epoch, steps, examples_processed_tot,
                                time_end_batch - time_init_batch))
                    log_value('Evaluation time',
                              time_end_batch - time_init_batch,
                              examples_processed_tot)

            if phase == 'val':
                # end-of-phase validation summary over the whole val set
                examples_processed_tot = count_batch * batch_size_eval * snippets
                map_val = map_calculator(cumul_pred.accumuled[1:],
                                         cumul_labels.accumuled[1:])
                gap_val = ap_calculator(cumul_pred.accumuled[1:].flatten(),
                                        cumul_labels.accumuled[1:].flatten())
                time_end_eval = time.time()
                print(
                    'EVAL - Epoch: {} Step: {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} mAP: {:.4f} GAP: {:.4f} Total time: {}'
                    .format(
                        epoch, steps, tot_loc_loss / examples_processed_tot,
                        tot_cls_loss / examples_processed_tot,
                        tot_loss_updt * num_steps_per_update / examples_processed_tot,
                        map_val, gap_val, time_end_eval - time_init_eval))
                log_value('Validation_subset_loc_loss',
                          tot_loc_loss / examples_processed_tot, steps)
                log_value('Validation_subset_cls_loss',
                          tot_cls_loss / examples_processed_tot, steps)
                # NOTE(review): this call omits the `steps` argument the
                # sibling calls pass — confirm against tensorboard_logger API
                log_value(
                    'Validation_subset_tot_loss',
                    tot_loss_updt * num_steps_per_update / examples_processed_tot)
                log_value('Validation_subset_mAP', map_val, steps)
                log_value('Validation_subset_GAP', gap_val, steps)
                cumul_pred.clear()
                cumul_labels.clear()
def train():
    """Train TAL_Net on paired cabin/face video clips (IVBSS dataset).

    Reads all hyper-parameters from the argument parser, optionally warm-starts
    both I3D branches from a pre-trained snapshot and/or resumes from a
    checkpoint, logs to the module-level tensorboard ``writer``, validates
    after every epoch, and saves a checkpoint per epoch (tracking best
    accuracy).
    """
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    train_data_path = args.train_data_path
    val_data_path = args.val_data_path
    train_batch_size = args.train_batch_size
    val_batch_size = args.val_batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    weight_decay = args.weight_decay
    display_steps = args.display_steps
    ckp_dir = args.ckp_dir
    save_path = args.save_path
    num_classes = args.num_classes
    weight = args.weight

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    if not os.path.exists(ckp_dir):
        os.makedirs(ckp_dir)

    print('Start to load data')
    # augmented crops for training, deterministic center crop for validation
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    val_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    train_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                 train_data_path, train_transforms)
    val_dataset = IVBSSDataset(cabin_video_dir, face_video_dir, val_data_path,
                               val_transforms)
    # sampling with replacement; drop_last keeps batch shapes fixed
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=RandomSampler(train_dataset,
                                                        replacement=True),
                                  collate_fn=collate_fn,
                                  drop_last=True)
    total_steps = num_epochs * len(train_dataloader)
    print('Total number of training samples is {0}'.format(len(train_dataset)))
    print('Total number of validation samples is {0}'.format(len(val_dataset)))
    print('Total number of training steps is {0}'.format(total_steps))

    model = TAL_Net(num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                momentum=0.9, weight_decay=weight_decay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10,
                                                gamma=0.1)

    start_epoch = 0
    if args.pretrained_I3D_model is not None:
        # warm-start both I3D branches from the same pre-trained weights
        print('Load pretrained I3D model')
        pretrained_I3D_model = torch.load(args.pretrained_I3D_model)
        model.I3D_1.load_state_dict(pretrained_I3D_model)
        model.I3D_2.load_state_dict(pretrained_I3D_model)
    if args.ckp_path is not None:
        # resume training state (epoch/model/optimizer/scheduler)
        print('Load checkpoint')
        start_epoch, model, optimizer, scheduler = load_ckp(
            args.ckp_path, model, optimizer, scheduler)

    model.to(device)
    model.train()

    print('Start to train')
    num_step = 0
    best_acc = 0.0
    for epoch in range(start_epoch, num_epochs):
        running_loss = 0.0
        class_running_loss = 0.0
        chunk_inclusion_running_loss = 0.0
        for i, (cabin_imgs, face_imgs, labels, start_labels,
                end_labels) in enumerate(train_dataloader):
            cabin_imgs = cabin_imgs.to(device)
            face_imgs = face_imgs.to(device)
            labels = labels.to(device)
            start_labels = start_labels.to(device)
            end_labels = end_labels.to(device)

            optimizer.zero_grad()
            # the model returns (total, classification, chunk-inclusion)
            # losses first; extra return values are ignored
            loss, class_loss, chunk_inclusion_loss = model(
                cabin_imgs, face_imgs, labels, start_labels, end_labels,
                weight)[:3]
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            class_running_loss += class_loss.item()
            chunk_inclusion_running_loss += chunk_inclusion_loss.item()

            # periodic console report; running sums reset afterwards
            if (i + 1) % display_steps == 0:
                print(
                    'epoch:{0}/{1}, step:{2}/{3}, loss:{4:.4f}, class_loss:{5:.4f}, chunk_inclusion_loss:{6:.4f}'
                    .format(epoch + 1, num_epochs, i + 1,
                            len(train_dataloader),
                            running_loss / display_steps,
                            class_running_loss / display_steps,
                            chunk_inclusion_running_loss / display_steps))
                running_loss = 0.0
                class_running_loss = 0.0
                chunk_inclusion_running_loss = 0.0

            num_step += 1
            # per-step tensorboard logging (writer is a module-level global)
            writer.add_scalars(
                'Loss/train', {
                    'total_loss': loss,
                    'class_loss': class_loss,
                    'chunk_inclusion_loss': chunk_inclusion_loss
                }, num_step)

        scheduler.step()

        print('Start to validate')
        # eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(train_dataset, train_batch_size, model, weight, device)
        # NOTE(review): `eval` here is a project helper shadowing the builtin
        eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(
            val_dataset, val_batch_size, model, weight, device)
        writer.add_scalars(
            'Loss/valid', {
                'total_loss': eval_loss,
                'class_loss': eval_class_loss,
                'chunk_inclusion_loss': eval_chunk_inclusion_loss
            }, epoch)
        writer.add_scalar('Accuracy/valid', class_accuracy, epoch)
        print(
            'Toal loss on validation dataset: {0:.4f}, class loss on validation dataset: {1:.4f}, chunk inclusion loss on validation dataset: {2:.4f}, class accuracy on validation dataset: {3:.4f}'
            .format(eval_loss, eval_class_loss, eval_chunk_inclusion_loss,
                    class_accuracy))

        # checkpoint every epoch; flag the best-accuracy snapshot
        is_best = class_accuracy > best_acc
        best_acc = max(class_accuracy, best_acc)
        checkpoint = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }
        ckp_name = 'epoch_' + str(epoch + 1) + '.pt'
        save_ckp(checkpoint, ckp_dir, ckp_name, is_best, save_path)
        print('Save the checkpoint after {} epochs'.format(epoch + 1))
    writer.close()
def run(init_lr=0.1, max_steps=64e3, mode='rgb',
        root='../../SSBD/ssbd_clip_segment/data/',
        train_split='../../SSBD/Annotations/annotations_charades.json',
        batch_size=1, save_model=''):
    """Fine-tune an XDC R(2+1)D video encoder (3 classes) on SSBD clips.

    Only the final fc layer and block '4.1' are trained; everything else is
    frozen. Gradients are accumulated over 4 batches per optimizer step, and
    the model is saved whenever validation accuracy improves.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}
    # dataloaders = {'train': dataloader}
    # datasets = {'train': dataset}

    # setup the model: pre-trained XDC encoder with a fresh 3-class head
    xdc = torch.hub.load('HumamAlwassel/XDC', 'xdc_video_encoder',
                         pretraining='r2plus1d_18_xdc_ig65m_kinetics',
                         num_classes=3)
    # if mode == 'flow':
    #     i3d = InceptionI3d(400, in_channels=2)
    #     i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    # else:
    #     i3d = InceptionI3d(400, in_channels=3)
    #     i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    # i3d.replace_logits(8)
    # #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    # i3d.cuda()
    # i3d = nn.DataParallel(i3d)
    xdc.cuda()
    xdc = nn.DataParallel(xdc).cuda()

    # freeze everything except the classifier head and block 4.1
    for name, param in xdc.named_parameters():
        if 'fc' not in name and '4.1' not in name:
            param.requires_grad = False

    lr = init_lr
    optimizer = optim.SGD(xdc.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    best_val = 0
    # new_flag = 0

    # train it
    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # new_state_dict = OrderedDict()
        # state_dict = torch.load(save_model+'.pt')
        # for k, v in state_dict.items():
        #     name = "module."+k  # add module.
        #     new_state_dict[name] = v
        # xdc.load_state_dict(new_state_dict)
        # new_flag = 0

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                xdc.train(True)
            else:
                xdc.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            # tot_loc_loss = 0.0
            # tot_cls_loss = 0.0
            num_iter = 0
            total = 0  # correct predictions
            n = 0      # examples seen
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable (legacy style)
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = xdc(inputs)
                # print(per_frame_logits.shape)
                # print(labels.shape)

                # upsample to input size
                # per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                # loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # tot_loc_loss += loc_loss.data.item()

                # compute classification loss (with max-pooling along time B x C x T)
                # cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                # print(torch.max(per_frame_logits, dim=2)[0])
                # print(torch.max(labels, dim=2)[0])

                # clip-level accuracy: argmax of logits vs argmax of one-hot labels
                correct = per_frame_logits.argmax(1).eq(labels.argmax(1))
                total += correct.float().sum().item()
                n += batch_size
                # tot_cls_loss += cls_loss.data.item()

                # scaled for gradient accumulation
                loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels) / num_steps_per_update
                tot_loss += loss.data.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                            phase, tot_loss / 10, total / n))
                        # save model
                        # if(steps % 10000 == 0):
                        #     torch.save(xdc.module.state_dict(), save_model+str(steps).zfill(6)+'.pt')
                        #     tot_loss = tot_loc_loss = tot_cls_loss = 0.
                        # reset the running counters every 10 steps
                        tot_loss = 0
                        total = 0
                        n = 0

            if phase == 'val':
                print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                    phase, (tot_loss * num_steps_per_update) / num_iter,
                    total / n))
                # keep the best-accuracy snapshot only
                if (total / n > best_val):
                    best_val = total / n
                    torch.save(xdc.module.state_dict(), save_model + '.pt')
def run(
        dataset_path,
        annotation_path,
        init_lr,
        frames_per_clip,
        mode,
        logdir,
        frame_skip,
        batch_size,
        refine,
        refine_epoch,
        pretrained_model,
        max_steps,
):
    """Fine-tune I3D on a custom clip dataset with class-balanced sampling.

    Only the 'logits' and 'Mixed_5c' parameter groups are trained. Supports
    resuming ("refine") from a saved epoch checkpoint; interleaves test
    batches with training so that both progress at matching fractions.
    Checkpoints (model/optimizer/scheduler) are written to ``logdir`` every
    other step.
    """
    os.makedirs(logdir, exist_ok=True)

    # setup dataset
    train_transforms = transforms.Compose(
        [
            videotransforms.RandomCrop(224),
            videotransforms.RandomHorizontalFlip(),
        ]
    )
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    train_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=train_transforms,
        index_filename="train_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )
    print("Number of clips in the train dataset:{}".format(len(train_dataset)))
    test_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=test_transforms,
        index_filename="test_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )
    print("Number of clips in the test dataset:{}".format(len(test_dataset)))

    # class-balanced sampling for the training set
    weights = utils.make_weights_for_balanced_classes(
        train_dataset.clip_list, train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights,
                                                             len(weights))

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=sampler, num_workers=3,
        pin_memory=True
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, num_workers=3,
        pin_memory=True
    )

    # setup the model from the requested pre-trained weights
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_' + pretrained_model + '.pt'))
    else:
        i3d = InceptionI3d(157, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_' + pretrained_model + '.pt'))

    num_classes = len(train_dataset.action_name_list)
    i3d.replace_logits(num_classes)

    for name, param in i3d.named_parameters():  # freeze i3d parameters
        if 'logits' in name:
            param.requires_grad = True
        elif 'Mixed_5c' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

    if refine:
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        # load the epoch checkpoint the refinement resumes from
        refine_model_filename = os.path.join(
            logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        i3d.load_state_dict(checkpoint["model_state_dict"])

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [15, 30, 45, 60])

    if refine:
        # restore scheduler/optimizer state alongside the model weights
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    num_steps_per_update = 4 * 5  # accum gradient - try to have number of examples per update match original code 8*5*4
    # eval_steps = 5
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True

    while steps <= max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # when refining, fast-forward the scheduler/step counters up to the
        # refine epoch without touching the data
        if steps <= refine_epoch and refine and refine_flag:
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_list)
            continue
        else:
            refine_flag = False

        # Each epoch has a training and validation phase
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader)
        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()

        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):
            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data

            # wrap them in Variable (legacy style)
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())

            t = inputs.size(2)  # temporal length of the clip
            per_frame_logits = i3d(inputs)
            # upsample prediction to the input's temporal resolution
            per_frame_logits = F.interpolate(per_frame_logits, t,
                                             mode='linear',
                                             align_corners=True)

            # compute localization loss
            loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits,
                                                          labels)
            tot_loc_loss += loc_loss.item()

            # compute classification loss (with max-pooling along time B x C x T)
            cls_loss = F.binary_cross_entropy_with_logits(
                torch.max(per_frame_logits, dim=2)[0],
                torch.max(labels, dim=2)[0])
            tot_cls_loss += cls_loss.item()

            # scaled for gradient accumulation
            loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
            tot_loss += loss.item()
            loss.backward()

            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1),
                                    torch.argmax(labels, dim=1))
            # acc = utils.accuracy(per_frame_logits, labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                steps, acc.item(), loss.item(), train_batchind,
                len(train_dataloader)))

            # optimizer step after enough accumulation (or at the last batch)
            if num_iter == num_steps_per_update or train_batchind == len(train_dataloader) - 1:
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('cls loss', tot_cls_loss / n_steps,
                                        n_examples)
                train_writer.add_scalar('loc loss', tot_loc_loss / n_steps,
                                        n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc),
                                        n_examples)
                train_writer.add_scalar('lr',
                                        optimizer.param_groups[0]['lr'],
                                        n_examples)
                num_iter = 0
                tot_loss = 0.

            # interleave a test batch whenever testing lags behind training
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                i3d.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda(), requires_grad=True)
                labels = Variable(labels.cuda())

                with torch.no_grad():
                    per_frame_logits = i3d(inputs)
                    # NOTE(review): reuses `t` from the last *train* batch —
                    # assumes train/test clips share the temporal length
                    per_frame_logits = F.interpolate(per_frame_logits, t,
                                                     mode='linear',
                                                     align_corners=True)

                    # compute localization loss
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits, labels)
                    # compute classification loss (with max-pooling along time B x C x T)
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(labels, dim=2)[0])
                    loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                    acc = utils.accuracy_v2(
                        torch.argmax(per_frame_logits, dim=1),
                        torch.argmax(labels, dim=1))

                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(
                    steps, acc.item(), loss.item(), test_batchind,
                    len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                # NOTE(review): tags look swapped here — 'cls loss' logs
                # loc_loss and 'loc loss' logs cls_loss; confirm intent
                test_writer.add_scalar('cls loss', loc_loss.item(), n_examples)
                test_writer.add_scalar('loc loss', cls_loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                i3d.train(True)

        if steps % 2 == 0:
            # save model (with optimizer/scheduler state so refine can resume)
            torch.save({"model_state_dict": i3d.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')
        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
def train(init_lr, max_steps, mode, root_folder, train_split, batch_size, load_model, save_model):
    """Fine-tune a pretrained I3D with per-frame multi-label BCE losses.

    Args:
        init_lr: initial SGD learning rate.
        max_steps: number of optimizer updates to perform.
        mode: 'rgb' or 'flow' — selects input channels and pretrained weights file.
        root_folder: dataset root directory passed to Dataset.
        train_split: split file used for both the training and testing Dataset.
        batch_size: batch size for both loaders.
        load_model: optional checkpoint path to resume from (after head replacement).
        save_model: filename prefix for periodic checkpoints.
    """
    # setup dataset
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                           videotransforms.RandomHorizontalFlip()])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root_folder, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, num_workers=36, pin_memory=True)
    val_dataset = Dataset(train_split, 'testing', root_folder, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                 shuffle=True, num_workers=36, pin_memory=True)
    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model: load pretrained weights, then swap the head to 157 classes
    i3d = InceptionI3d(400, in_channels=2 if mode == 'flow' else 3)
    i3d.load_state_dict(torch.load('models/{}_imagenet.pt'.format(mode)))
    i3d.replace_logits(157)
    if load_model:
        i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accumulate gradients over this many batches
    steps = 0
    while steps < max_steps:
        print('Step {:6d} / {}'.format(steps, max_steps))
        print('-' * 10)
        # each epoch has a training and validation phase
        for phase in ['train', 'val']:
            i3d.train(phase == 'train')  # eval only during validation phase
            num_iter, tot_loss, tot_loc_loss, tot_cls_loss = 0, 0.0, 0.0, 0.0
            optimizer.zero_grad()
            for data in dataloaders[phase]:  # iterate over data
                num_iter += 1
                inputs, labels = data  # get the inputs
                inputs = Variable(inputs.cuda())  # wrap them in Variable
                labels = Variable(labels.cuda())
                t = inputs.size(2)

                per_frame_logits = i3d(inputs)
                # FIX: F.upsample is deprecated; F.interpolate is the supported API.
                # Resize predictions back to the input temporal length.
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear')

                # localization loss: per-frame multi-label BCE
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # FIX: loss.data[0] raises IndexError on modern PyTorch (0-dim
                # tensors); .item() is the supported scalar accessor.
                tot_loc_loss += loc_loss.item()

                # classification loss (with max-pooling along time, B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                # scale so the accumulated gradient matches one large batch
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase,
                            tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update),
                            tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
def main():
    """Train and validate an I3D variant selected by module-level `args`.

    Relies on module-level helpers (`train`, `validate`, `adjust_learning_rate`,
    `save_checkpoint`, `weight_transform`, `Logger`, `Timer`, `DisturbLabel`)
    and the global `args` namespace. Writes a plain-text validation log, a
    CSV-style val logger, and TensorBoard scalars.
    """
    best_prec1 = 0
    # append a header describing this run's hyper-parameters to the validation log
    with open('logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
              '_validation.txt', 'a') as f:
        f.write("=============================================")
        f.write('\n')
        f.write("lr: ")
        f.write(str(args.lr))
        f.write(" lr_step: ")
        f.write(str(args.lr_steps))
        f.write(" dataset: ")
        f.write(str(args.dataset))
        f.write(" modality: ")
        f.write(str(args.mode))
        f.write(" dropout: ")
        f.write(str(args.dropout))
        f.write(" batch size: ")
        f.write(str(args.batch_size))
        f.write('\n')

    # dataset-specific class count, clip length and frame filename template
    if args.dataset == 'ucf101':
        num_class = 101
        data_length = 64
        image_tmpl = "frame{:06d}.jpg"
    elif args.dataset == 'hmdb51':
        num_class = 51
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    elif args.dataset == 'kinetics':
        num_class = 400
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    else:
        raise ValueError('Unknown dataset ' + args.dataset)

    val_logger = Logger(
        'logs/' + args.dataset + '/' + args.arch + '_' + args.mode + '_val.log',
        ['epoch', 'acc'])

    # ====================== data transforms =============
    # (removed unused local: an ImageNet `transforms.Normalize` was built but
    # never applied anywhere in this function)
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip()
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    # ======================= datasets ==============
    train_dataset = I3dDataSet("", args.train_list, num_segments=1,
                               new_length=data_length,
                               modality=args.mode,
                               dataset=args.dataset,
                               image_tmpl=image_tmpl if args.mode in ["rgb", "RGBDiff"]
                               else args.flow_prefix + "{}_{:05d}.jpg",
                               transform=train_transforms,
                               test_mode=False)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=8,
                                               pin_memory=True)
    val_dataset = I3dDataSet("", args.val_list, num_segments=1,
                             new_length=data_length,
                             modality=args.mode,
                             dataset=args.dataset,
                             image_tmpl=image_tmpl if args.mode in ["rgb", "RGBDiff"]
                             else args.flow_prefix + "{}_{:05d}.jpg",
                             random_shift=False,
                             transform=test_transforms,
                             test_mode=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=8,
                                             pin_memory=True)

    # ============================= set the model ==================
    if args.mode == 'flow':
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout)
        else:
            # BUG FIX: the original built Exception("not support now!") without
            # raising it, so an unknown arch fell through and crashed later with
            # NameError on `i3d`.
            raise ValueError("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_flow.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)
    else:
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout)
        else:
            # BUG FIX: same unraised-exception defect as the flow branch above.
            raise ValueError("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_rgb.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)

    i3d.cuda()

    # ============================ SGD, criterion and lr ==================
    optimizer = torch.optim.SGD(i3d.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                dampening=0,
                                nesterov=False)
    model = nn.DataParallel(i3d)
    criterion = torch.nn.NLLLoss().cuda()
    disturb = DisturbLabel(alpha=10, C=51)

    writer = SummaryWriter()  # create log folders for plotting
    timer = Timer()
    # FIX: initialize so the per-epoch log write below cannot raise NameError
    # on epochs before the first validation run.
    prec1 = 0.0
    for epoch in range(1, args.epochs):
        timer.tic()
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train_prec1, train_loss = train(train_loader, model, criterion,
                                        optimizer, epoch, disturb)
        writer.add_scalar('Train/Accu', train_prec1, epoch)
        writer.add_scalar('Train/Loss', train_loss, epoch)

        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, val_loss = validate(val_loader, model, criterion,
                                       (epoch + 1) * len(train_loader))
            writer.add_scalar('Val/Accu', prec1, epoch)
            writer.add_scalar('Val/Loss', val_loss, epoch)
            writer.add_scalars('data/Acc', {
                'train_prec1': train_prec1,
                'val_prec1': prec1
            }, epoch)
            writer.add_scalars('data/Loss', {
                'train_loss': train_loss,
                'val_loss': val_loss
            }, epoch)
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, best_prec1)
            val_logger.log({'epoch': epoch, 'acc': prec1})

        timer.toc()
        left_time = timer.average_time * (args.epochs - epoch)
        print("best_prec1 is: {}".format(best_prec1))
        print("left time is: {}".format(timer.format(left_time)))
        with open('logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
                  '_validation.txt', 'a') as f:
            f.write(str(epoch))
            f.write(" ")
            f.write(str(train_prec1))
            f.write(" ")
            f.write(str(prec1))
            f.write(" ")
            f.write(timer.format(timer.diff))
            f.write('\n')

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
import torch
from pytorch_i3d import InceptionI3d
from torchvision import transforms
import videotransforms
from mit_data import MITDataset, make_label_binarizer

# Paths to this experiment's class-index CSV and train/val split definition.
INDEX_FILE = "experiment/binary_class/binary_class.csv"
SPLIT_FILE = "experiment/binary_class/split.csv"

batch_size = 1

# NOTE(review): the train crop is 225 while the test crop below is 224 —
# this looks like a typo; confirm downstream models accept both sizes.
train_transforms = transforms.Compose([
    videotransforms.RandomCrop(225),
    videotransforms.RandomHorizontalFlip(),
])
test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# Training split with augmentation.
dataset = MITDataset(mode="train",
                     transforms=train_transforms,
                     index_file=INDEX_FILE,
                     split_file=SPLIT_FILE)
dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         num_workers=10,
                                         pin_memory=True)
# Validation split with deterministic center crop.
val_dataset = MITDataset(mode="val",
                         transforms=test_transforms,
                         index_file=INDEX_FILE,
                         split_file=SPLIT_FILE)
def eval(args):
    """Validate a trained I3ResNet checkpoint, reporting per-sample F1 scores.

    Args:
        args: namespace with train_split, root, frame_nb, interval, batch_size,
            resnet_nb (50/101/152), class_nb and model_path attributes.

    Raises:
        ValueError: if args.resnet_nb is not one of 50, 101 or 152.
    """
    transform = transforms.Compose([videotransforms.RandomCrop(224)])
    val_dataset = Dataset(args.train_split, 'val', args.root, args.frame_nb,
                          args.interval, transform)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=24,  # on jobs
        pin_memory=True)

    # pick the 2D ResNet backbone to inflate
    if args.resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
        print('load resnet50 pretrained model...')
    elif args.resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
        print('load resnet101 pretrained model...')
    elif args.resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
        print('load resnet152 pretrained model...')
    else:
        # BUG FIX: this was `raise ValueError('...').format(...)`, which calls
        # .format on the exception instance and raises AttributeError instead
        # of the intended message.
        raise ValueError(
            'resnet_nb should be in [50|101|152] but got {}'.format(
                args.resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet), args.frame_nb, args.class_nb,
                        conv_class=True)

    # strip the 'module.' prefix DataParallel added when the checkpoint was saved
    state_dict = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v
    i3resnet.load_state_dict(new_state_dict)
    print('loaded saved state_dict...')

    i3resnet.eval()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)
    # i3resnet = nn.DataParallel(i3resnet)

    AccuracyArr = []
    # NOTE(review): seeded with an all-zero row, which would skew any mean
    # taken over `accuracy`; only the commented-out print below would use it.
    accuracy = np.zeros((1, args.class_nb))
    with torch.no_grad():
        for i, data in enumerate(val_dataloader):
            tic = time.time()
            # read data
            img_cpu, label_cpu = data
            img = Variable(img_cpu.to(device))
            label = Variable(label_cpu.to(device))
            pred = i3resnet(img)

            # multi-label prediction: threshold sigmoid outputs at 0.5
            predict = torch.sigmoid(pred) > 0.5
            f1_sample = f1_score(label_cpu.data.numpy(),
                                 predict.cpu().data.numpy(),
                                 average='samples')
            f1 = f1_score(label_cpu.data.numpy(),
                          predict.cpu().data.numpy(),
                          average=None)
            AccuracyArr.append(f1_sample)
            accuracy = np.vstack((accuracy, f1))

            if i % 10 == 0:
                toc = time.time()
                print('validation dataset batch:', i)
                print('prediction logits:{}'.format(predict.cpu().data.numpy()))
                print('ground truth:{}'.format(label_cpu.data.numpy()))
                print('f1 score:', f1_sample, 'accumulated f1 score:',
                      np.mean(np.array(AccuracyArr)))
                # print('f1 average:', np.mean(accuracy, axis=0))
                print('Time elapsed:', toc - tic)
            torch.cuda.empty_cache()
    print("Finished Validation")
def run(init_lr=0.01, max_steps=200, mode='rgb',
        root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',
        train_split='train.txt', test_split='test.txt', batch_size=5,
        save_model=''):
    """Train FusionNet (frozen I3D backbone, single-logit head) on video clips.

    Logs per-batch loss to TensorBoard and `i3d_video.txt`, and checkpoints
    every 50 steps.

    BUG FIX vs. original: validation batches previously ran `backward()` and
    `optimizer.step()`, silently updating weights on the validation split;
    gradient computation and optimizer updates are now train-phase only.
    """
    writer = tensorboardX.SummaryWriter()

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=3,
                                             pin_memory=True)
    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=3,
                                                 pin_memory=True)
    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model: pretrained I3D, head replaced with a single logit,
    # backbone frozen, then wrapped by the fusion network
    sm = InceptionI3d(400, in_channels=3)
    sm.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    sm.replace_logits(1)
    sm = freeze_network_layer(sm)
    fusedNet = FusionNet(sm)
    if torch.cuda.is_available():
        fusedNet.cuda()
        fusedNet = nn.DataParallel(fusedNet)

    lr = init_lr
    optimizer = optim.SGD(fusedNet.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])

    steps = 0
    with open('i3d_video.txt', 'w') as file:
        file.write("train and validation loss file\n")

    # train it
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print('phase : {}'.format(phase))
            is_train = phase == 'train'
            fusedNet.train(is_train)

            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()
            # iterate over data
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs (only the front video stream and label are used)
                f_vid, l_vid, tactile, pos, labels = data
                if torch.cuda.is_available():
                    inputs = Variable(f_vid.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs = Variable(f_vid)
                    labels = Variable(labels)

                if is_train:
                    per_frame_logits = fusedNet(inputs.float())
                else:
                    # no gradients during validation
                    with torch.no_grad():
                        per_frame_logits = fusedNet(inputs.float())

                # classification loss on the single-logit output
                per_frame_logits = per_frame_logits.squeeze(1)
                cls_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits.double(), labels.double())
                tot_cls_loss += cls_loss.item()
                if is_train:
                    cls_loss.backward()

                print('{} Loss: {:.4f} and lr: {}'.format(
                    phase, tot_cls_loss / num_iter, init_lr))
                with open('i3d_video.txt', 'a') as file:
                    file.write("%f\n" % (tot_cls_loss / num_iter))

                if is_train:
                    optimizer.step()
                    optimizer.zero_grad()

                writer.add_scalar('error/' + phase, (tot_cls_loss / num_iter),
                                  num_iter)
                # periodic checkpoint (kept at original position: fires for every
                # batch of a step that is a multiple of 50)
                if (steps % 50 == 0):
                    torch.save(
                        fusedNet.module.state_dict(),
                        save_model + phase + str(steps).zfill(6) + '.pt')
                    save_checkpoint(fusedNet, optimizer, lr_sched, steps)

            # save error at every epoch
            writer.add_scalar('errorAtEpoch/' + phase,
                              (tot_cls_loss / num_iter), steps)
            tot_cls_loss = 0.
            steps += 1
            lr_sched.step()
def run(num_vids, init_lr=0.1, max_steps=64e3, mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        val_split='charades/charades.json',
        batch_size=8 * 5, save_model='', num_classes=2):
    """Run a single validation pass of a fine-tuned 2-class I3D checkpoint.

    `save_model` is the checkpoint path to load. The train-side arguments are
    kept for interface compatibility; only the validation loader is used.

    BUG FIXES vs. original: the `while steps < max_steps` loop never
    incremented `steps`, so validation re-ran forever — one full pass is
    performed instead; `loss.backward()` was called in eval mode with no
    optimizer step (pure waste) — the loop now runs under `torch.no_grad()`;
    deprecated `F.upsample` replaced by `F.interpolate`.
    """
    # setup dataset (validation only)
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    val_dataset = Dataset(val_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=0,
                                                 pin_memory=True)
    dataloaders = {'train': None, 'val': val_dataloader}

    # setup the model and load the fine-tuned 2-class checkpoint
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(2)
    i3d.load_state_dict(torch.load(save_model))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    num_steps_per_update = 4  # kept so reported losses match the training scale
    phase = 'val'
    print('Step {}/{}'.format(0, max_steps))
    print('-' * 10)
    i3d.train(False)  # set model to evaluate mode

    tot_acc = 0.0
    tot_loss = 0.0
    tot_loc_loss = 0.0
    tot_cls_loss = 0.0
    num_iter = 0
    with torch.no_grad():
        # iterate over validation data
        for data in dataloaders[phase]:
            num_iter += 1
            print(num_iter)
            inputs, labels = data  # get the inputs
            inputs = Variable(inputs.cuda())
            t = inputs.size(2)
            labels = Variable(labels.cuda())

            per_frame_logits = i3d(inputs)
            # upsample predictions back to the input temporal length
            per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear')

            logits, _ = torch.max(per_frame_logits, dim=2)
            labels, _ = torch.max(labels, dim=2)

            # classification loss (with max-pooling along time B x C x T)
            cls_loss = F.binary_cross_entropy_with_logits(logits, labels)
            tot_cls_loss += cls_loss.item()
            loss = (cls_loss) / num_steps_per_update
            tot_loss += loss.item()

            # accuracy: count softmax outputs >= 0.5 that match positive labels
            predictions = torch.nn.Softmax(dim=-1)(logits)
            bin_predictions = predictions >= 0.5
            acc = (bin_predictions * labels.byte()).float().sum() / batch_size
            tot_acc += acc
            if num_iter % 1 == 0:
                print("{}/1500".format(num_iter))

    print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
        phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
        (tot_loss * num_steps_per_update) / num_iter))
def train(args):
    """Train a Conv2dRNN sign-language classifier on WLASL with wandb logging.

    Handles checkpoint resume, gradient accumulation to emulate
    `args.batch_size`, per-epoch validation with top-1/5/10 accuracy,
    best-model saving, early stopping and ReduceLROnPlateau scheduling.
    """
    # Init wandb
    run = wandb.init(name=args.save_dir[len('../runs/'):],
                     config=args,
                     project='sign-language-recognition')
    # Create directory for model checkpoints and log
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # Save args
    with open(os.path.join(args.save_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=2)
    # Logger
    logger = create_logger(args.save_dir)
    # Set gpu
    if torch.cuda.is_available():
        i = get_free_gpu()
        device = get_device(gpu=i)
    else:
        device = 'cpu'
    logger.info('using device: {}'.format(device))
    # Prepare early stop
    stopped = False  # NOTE(review): never read afterwards
    best_epoch = 0
    best_loss = torch.Tensor([float('Inf')])  # replaced by a float once a better val loss is seen
    # Data
    if args.freeze_vgg:
        real_batch_size = 3
    else:
        real_batch_size = 2  # can't fit more into gpu memory
    json_file = os.path.join(args.data_path, 'WLASL_v0.3.json')
    videos_folder = os.path.join(args.data_path, 'videos')
    keypoints_folder = os.path.join(args.data_path, 'keypoints')
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224)])
    # validation uses the same (random-crop) transform as training
    val_transforms = train_transforms
    # Debug data: a small fixed sample reused for both train and val
    if args.debug_dataset:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               sampler=DebugSampler(
                                                   args.debug_dataset,
                                                   len(train_dataset)))
        val_dl = train_dl
    else:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               shuffle=True)
        val_dataset = WLASL(json_file=json_file,
                            videos_folder=videos_folder,
                            keypoints_folder=keypoints_folder,
                            transforms=val_transforms,
                            split='val',
                            subset=args.subset)
        val_dl = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=real_batch_size,
                                             shuffle=True)
    logger.info('data loaded')
    # Model, loss, optimizer
    m = Conv2dRNN(args).to(device)
    optimizer = torch.optim.Adam(m.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()
    # Resume train
    start_epoch = 0
    if args.resume_train:
        checkpoint = torch.load(os.path.join(args.save_dir,
                                             'checkpoint.pt.tar'),
                                map_location=torch.device('cpu'))
        best_epoch = checkpoint['epoch']
        m.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        m = m.to(device)
        best_loss = checkpoint['best_val_loss']
        start_epoch = best_epoch + 1
        # Change learning rate (override the checkpointed LR with args.lr)
        for g in optimizer.param_groups:
            g['lr'] = args.lr
        logger.info(
            'Resuming training from epoch {} with best loss {:.4f}'.format(
                start_epoch, best_loss))
    # learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_schedule_factor,
        patience=args.lr_schedule_patience,
        threshold=args.lr_schedule_threshold)
    # Watch model with wandb
    run.watch(m, log='all', log_freq=5)
    # Print args
    logger.info('using args: \n' +
                json.dumps(vars(args), sort_keys=True, indent=2))
    # Train loop
    for t in range(args.n_epochs):
        # shift epoch index when resuming from a checkpoint
        t += start_epoch
        # Train
        losses = AverageMeter()
        batch_time = AverageMeter()
        m.train()
        start_t = time.time()
        for i, batch in enumerate(train_dl):
            # Run the forward pass multiple times and accumulate gradient (to be able to use large batch size)
            X = batch['X'].to(device)
            label = batch['label'].to(device)
            # [per frame logits, mean of all frames logits]
            logits = m(X)
            # Create label for each logit
            label = torch.cat([l.repeat(logits.shape[1], 1) for l in label],
                              dim=0)
            # Squeeze time sequence and batch into one dimension
            logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                    logits.shape[2])
            loss = criterion(logits, label.squeeze())
            loss.backward()
            losses.update(loss.item())
            # step once enough mini-batches were accumulated to emulate args.batch_size
            if (i % (args.batch_size // real_batch_size)) == 0:
                # Optimize with accumulated gradient
                optimizer.step()
                optimizer.zero_grad()
            batch_time.update(time.time() - start_t)
            start_t = time.time()
        train_loss = losses.avg
        # Validate
        with torch.no_grad():
            top1 = AverageMeter()
            top5 = AverageMeter()
            top10 = AverageMeter()
            losses = AverageMeter()
            m.eval()
            for batch in val_dl:
                X = batch['X'].to(device)
                label = batch['label'].to(device)
                # [per frame logits, mean of all frames logits]
                logits = m(X)
                # Create label for each logit
                label = torch.cat(
                    [l.repeat(logits.shape[1], 1) for l in label], dim=0)
                # Squeeze time sequence and batch into one dimension
                logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                        logits.shape[2])
                losses.update(criterion(logits, label.squeeze()).item())
                # Update metrics
                acc1, acc5, acc10 = topk_accuracy(logits,
                                                  label,
                                                  topk=(1, 5, 10))
                top1.update(acc1.item())
                top5.update(acc5.item())
                top10.update(acc10.item())
            val_loss = losses.avg
        # Save best model
        if val_loss < best_loss:
            best_loss, best_epoch = val_loss, t
            save_best(args, t, m, optimizer, best_loss)
        # Check early stop (NOTE: breaks before the logging below on the stop epoch)
        if t >= best_epoch + args.early_stop:
            logger.info('EARLY STOP')
            break
        # Log info
        logger.info(
            'epoch: {} train loss: {:.4f} val loss: {:.4f} top1acc {:.4f} top5acc {:.4f} top10acc {:.4f} lr: {:.2e} time per batch {:.1f} s'
            .format(t + 1, train_loss, val_loss, top1.avg, top5.avg,
                    top10.avg, optimizer.param_groups[0]['lr'],
                    batch_time.avg))
        # Wandb log
        run.log({
            'train_loss': train_loss,
            'val_loss': val_loss,
            'top1_acc': top1.avg,
            'top5_acc': top5.avg,
            'top10_acc': top10.avg,
            'lr': optimizer.param_groups[0]['lr']
        })
        # Scheduler step
        if args.use_lr_scheduler:
            scheduler.step(val_loss)
def run(num_vids,
        sync_bn=True,
        init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        val_split='charades/charades.json',
        batch_size=8 * 5,
        save_model='',
        num_classes=2):
    """Distributed (DDP) training/validation of a 2-class I3D.

    Reads the module-level `args` namespace for ngpu, local_rank, phase and
    save_model. The phase that actually runs is `args.phase` only.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.Tile(64),
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    # shard the training data across ranks
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=args.ngpu,
        rank=args.local_rank,
    )

    def remove_bad_vids(batch):
        # drop samples the Dataset returned as None before collating
        batch = list(filter(lambda x: x is not None, batch))
        return torch.utils.data.dataloader.default_collate(batch)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=2,
                                             pin_memory=True,
                                             sampler=sampler,
                                             drop_last=True,
                                             collate_fn=remove_bad_vids)
    val_dataset = Dataset(val_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=True,
                                                 drop_last=True)
    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}
    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(2)
    if sync_bn:
        print("Using SyncBatchNorm")
        i3d = torch.nn.SyncBatchNorm.convert_sync_batchnorm(i3d)
    if args.phase == 'val':
        print("Loading model {}".format(args.save_model))
        i3d.load_state_dict(torch.load(args.save_model))
    i3d.cuda()
    i3d = torch.nn.parallel.DistributedDataParallel(
        i3d,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )
    # Send model to its device
    # NOTE(review): redundant — the model was already placed on its GPU by
    # i3d.cuda() before wrapping in DDP.
    device = torch.device('cuda:{}'.format(args.local_rank))
    i3d = i3d.to(device)
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        if args.local_rank == 0:
            print('Step {}/{}'.format(steps, max_steps))
            print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in [args.phase]:  #, 'val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode
            # NOTE(review): only rank 0 proceeds past this point; during
            # 'train' this would make non-zero ranks skip forward/backward,
            # which deadlocks DDP collectives — confirm this script is only
            # launched with args.phase == 'val' on multiple ranks.
            if args.local_rank != 0:
                break
            tot_acc = 0.0
            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()
            # Iterate over data.
            #for data in dataloaders[phase]:
            for it, data in tqdm(enumerate(dataloaders[phase]),
                                 total=num_vids // (args.ngpu * batch_size),
                                 ncols=100):
                num_iter += 1
                inputs, labels = data
                # Send input to device
                inputs, labels = inputs.to(device), labels.to(device)
                # wrap them in Variable
                # NOTE(review): redundant second device transfer via .cuda()
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())
                per_frame_logits = i3d(inputs)
                # upsample to input size
                # NOTE(review): F.upsample is deprecated in modern PyTorch;
                # F.interpolate is the supported equivalent.
                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
                logits, _ = torch.max(per_frame_logits, dim=2)
                labels, _ = torch.max(labels, dim=2)
                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(logits, labels)
                tot_cls_loss += cls_loss.item()
                loss = (cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()
                # accuracy: softmax outputs thresholded at 0.5 vs. positive labels
                predictions = torch.nn.Softmax(dim=-1)(logits)
                bin_predictions = predictions >= 0.5
                acc = (bin_predictions * labels.byte()).float().sum() / batch_size
                tot_acc += acc
                if num_iter % 10 == 0 and phase == 'val':
                    print(
                        '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                        .format(phase, tot_acc / num_iter,
                                tot_loc_loss / num_iter,
                                tot_cls_loss / num_iter,
                                (tot_loss * num_steps_per_update) / num_iter))
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if args.local_rank == 0 and steps % 10 == 0:
                        print(
                            '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                            .format(phase, tot_acc / (num_steps_per_update),
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10))
                        # save model
                        os.system('mkdir -p {}'.format(save_model))
                        torch.save(
                            i3d.module.state_dict(),
                            os.path.join(save_model, str(steps).zfill(6)) + '.pt')
                        # NOTE(review): counters reset only inside this
                        # rank-0/steps%10 branch, so the running averages
                        # printed above cover the window since the last reset.
                        tot_acc = tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print(
                    '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                    .format(phase, tot_acc / num_iter, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter))
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/storage/truppr/CHARADES/Charades_v1_rgb',
        train_split='charades/charades.json',
        batch_size=16,
        save_model=''):
    """Train I3D on Charades in 'rgb', 'flow' or (incomplete) 'both' mode.

    NOTE(review): mode == 'both' is broken as written — the inner loop
    references an undefined `flow_inputs` (NameError on first batch) and the
    update branch uses `optimizer`/`lr_sched`, which are only defined for the
    single-stream modes. Only 'rgb' and 'flow' are usable.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    # print(root)
    print("creating training set...")
    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=18,
                                             pin_memory=True)
    print("creating validation set...")
    # validation reuses train_split with the 'testing' flag
    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=18,
                                                 pin_memory=True)
    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}
    # setup the model
    print("setting up the model...")
    if mode == 'flow' or mode == 'rgb':
        if mode == 'flow':
            i3d = InceptionI3d(400, in_channels=2)
            i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
        elif mode == 'rgb':
            i3d = InceptionI3d(400, in_channels=3)
            i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
        i3d.replace_logits(157)  # number of classes... originally 157
        i3d.cuda(0)
        i3d = nn.DataParallel(i3d)
    elif mode == 'both':
        # two independent streams, one per modality
        i3d_rgb = InceptionI3d(400, in_channels=3)
        i3d_rgb.load_state_dict(torch.load('models/rgb_imagenet.pt'))
        i3d_flow = InceptionI3d(400, in_channels=2)
        i3d_flow.load_state_dict(torch.load('models/flow_imagenet.pt'))
        i3d_rgb.replace_logits(157)  # number of classes... originally 157
        i3d_flow.replace_logits(157)
        i3d_rgb.cuda(0)
        i3d_flow.cuda(0)
        i3d_rgb = nn.DataParallel(i3d_rgb)
        i3d_flow = nn.DataParallel(i3d_flow)
    lr = init_lr
    if mode == 'both':
        optimizer_rgb = optim.SGD(i3d_rgb.parameters(),
                                  lr=lr,
                                  momentum=0.9,
                                  weight_decay=0.0000001)
        optimizer_flow = optim.SGD(i3d_flow.parameters(),
                                   lr=lr,
                                   momentum=0.9,
                                   weight_decay=0.0000001)
        lr_sched_rgb = optim.lr_scheduler.MultiStepLR(optimizer_rgb, [300, 1000])
        lr_sched_flow = optim.lr_scheduler.MultiStepLR(optimizer_flow, [300, 1000])
    else:
        optimizer = optim.SGD(i3d.parameters(),
                              lr=lr,
                              momentum=0.9,
                              weight_decay=0.0000001)
        lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        # print 'Step {}/{}'.format(steps, max_steps)
        # print '-' * 10
        print('Step ' + str(steps) + '/' + str(max_steps))
        print('-' * 25)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print("training model...")
                if mode == 'both':
                    i3d_rgb.train(True)
                    i3d_flow.train(True)
                    optimizer_rgb.zero_grad()
                    optimizer_flow.zero_grad()
                else:
                    i3d.train(True)
                    optimizer.zero_grad()
            else:
                print("validating model...")
                if mode == 'both':
                    i3d_rgb.train(False)
                    i3d_flow.train(False)
                    optimizer_rgb.zero_grad()
                    optimizer_flow.zero_grad()
                else:
                    i3d.train(False)  # Set model to evaluate mode
                    optimizer.zero_grad()
            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            # optimizer.zero_grad()
            print("zeroed...")
            # print(len(dataloaders["train"]))
            # print(dataloaders["train"])
            # Iterate over data.
            for data in dataloaders[phase]:
                # print("starting iter...")
                num_iter += 1
                # get the inputs
                inputs, labels = data
                print("data size: ", inputs.shape, " label: ", labels)
                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())
                # debug dump of the full tensors
                torch.set_printoptions(profile="full")
                print("labels:\n", labels)
                print("labels:\n", labels.shape)
                print("Inputs: \n", inputs.shape)
                torch.set_printoptions(profile="default")
                if mode == 'both':
                    per_frame_logits = i3d_rgb(inputs)
                    # NOTE(review): `flow_inputs` is never defined — this line
                    # raises NameError; the loader yields only (inputs, labels).
                    per_flows_logits = i3d_flow(flow_inputs)
                else:
                    per_frame_logits = i3d(inputs)
                # upsample to input size
                # NOTE(review): F.upsample is deprecated; F.interpolate is the
                # supported equivalent.
                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()
                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    # NOTE(review): in 'both' mode `optimizer`, `lr_sched` and
                    # `i3d` are undefined here — only single-stream modes work.
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        # print '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/(10*num_steps_per_update), tot_cls_loss/(10*num_steps_per_update), tot_loss/10)
                        print(
                            str(phase) + ' Loc Loss: ' +
                            str(tot_loc_loss / (10 * num_steps_per_update)) +
                            ' Cls Loss: ' +
                            str(tot_cls_loss / (10 * num_steps_per_update)) +
                            ' Tot Loss: ' + str(tot_loss / 10))
                        # save model
                        torch.save(
                            i3d.module.state_dict(),
                            save_model + str(steps).zfill(6) + '-' +
                            str(tot_loss / 10) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            #else:
            # print(str(phase) + ' Loc Loss: ' + str(tot_loc_loss/(10*num_steps_per_update)) + ' Cls Loss: ' + str(tot_cls_loss/(10*num_steps_per_update)) + ' Tot Loss: ' + str(tot_loss/10))
            if phase == 'val':
                print(
                    str(phase) + ' Loc Loss: ' +
                    str(tot_loc_loss / num_iter).zfill(4) + ' Cls Loss: ' +
                    str(tot_cls_loss / num_iter).zfill(4) + ' Tot Loss: ' +
                    str((tot_loss * num_steps_per_update) / num_iter).zfill(4))
    print("whoops...")
def run(init_lr=0.0001, max_steps=64e3, frames_per_clip=16, dataset_path='/media/sitzikbs/6TB/ANU_ikea_dataset/',
        train_filename='train_cross_env.txt', testset_filename='test_cross_env.txt',
        db_filename='../ikea_dataset_frame_labeler/ikea_annotation_db', logdir='',
        frame_skip=1, batch_size=8, camera='dev3', refine=False, refine_epoch=0, load_mode='vid',
        input_type='rgb', model_name='c3d'):
    """Train a C3D or P3D action classifier on the IKEA assembly dataset.

    Interleaves test batches with training (one test batch whenever the test
    fraction falls behind the train fraction), logs both streams to
    TensorBoard, and checkpoints model/optimizer/scheduler state every other
    step. When ``refine`` is set, resumes from the checkpoint saved at
    ``refine_epoch`` and fast-forwards the LR scheduler to that step.

    Args:
        init_lr: initial Adam learning rate.
        max_steps: number of outer passes over the training loader.
        frames_per_clip: temporal clip length fed to the model.
        dataset_path: dataset root directory.
        train_filename / testset_filename: split list files.
        db_filename: annotation database path.
        logdir: output directory for checkpoints and TensorBoard logs.
        frame_skip: temporal subsampling stride.
        batch_size: clips per batch.
        camera: camera stream to load (e.g. 'dev3').
        refine: resume training from ``refine_epoch`` checkpoint.
        refine_epoch: checkpoint step to resume from (must be > 0 if refine).
        load_mode: dataset loading mode ('vid' loads whole videos).
        input_type: input modality, e.g. 'rgb'.
        model_name: 'c3d' or 'p3d'.
    """
    os.makedirs(logdir, exist_ok=True)

    # setup dataset: C3D expects 112x112 crops, P3D uses 160x160
    img_size = 112 if model_name == 'c3d' else 160
    train_transforms = transforms.Compose([videotransforms.RandomCrop(img_size),
                                           videotransforms.RandomHorizontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(img_size)])

    train_dataset = Dataset(dataset_path, db_filename=db_filename, train_filename=train_filename,
                            transform=train_transforms, set='train', camera=camera, frame_skip=frame_skip,
                            frames_per_clip=frames_per_clip, resize=None, mode=load_mode, input_type=input_type)
    print("Number of clips in the dataset:{}".format(len(train_dataset)))
    # weighted sampling to counter class imbalance in the clip set
    weights = utils.make_weights_for_balanced_classes(train_dataset.clip_set, train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler,
                                                   num_workers=6, pin_memory=False)
    test_dataset = Dataset(dataset_path, db_filename=db_filename, train_filename=train_filename,
                           test_filename=testset_filename, transform=test_transforms, set='test', camera=camera,
                           frame_skip=frame_skip, frames_per_clip=frames_per_clip, resize=None, mode=load_mode,
                           input_type=input_type)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=6,
                                                  pin_memory=False)

    # setup the model
    num_classes = train_dataset.num_classes
    if model_name == 'c3d':
        model = c3d.C3D()
        model.load_state_dict(torch.load('c3d.pickle'))  # Sports-1M pretrained weights
        model.replace_logits(num_classes)
    elif model_name == 'p3d':
        model = p3d.P3D199(pretrained=True, modality='RGB', num_classes=num_classes)
    else:
        raise ValueError("unsupported model")

    if refine:
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        model.load_state_dict(checkpoint["model_state_dict"])

    model.cuda()
    model = nn.DataParallel(model)

    lr = init_lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 20, 30, 40])
    criterion = nn.CrossEntropyLoss()  # standard crossentropy loss for classification

    if refine:
        # restore optimizer/scheduler state AFTER they are constructed
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    num_steps_per_update = 4  # accum gradient - try to have number of examples per update match original code 8*5*4
    steps = 0
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # when refining, skip already-trained steps but advance the LR
        # scheduler and the example counter so logs stay aligned
        if steps <= refine_epoch and refine and refine_flag:
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_set)
            continue
        else:
            refine_flag = False

        # test-stream bookkeeping: one test batch is consumed whenever the
        # test progress fraction falls behind the train progress fraction
        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)
        tot_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()

        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):
            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data

            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())
            # labels arrive one-hot / per-class; reduce to class indices
            labels = torch.argmax(labels, dim=1)

            logits = model(inputs)
            t = inputs.size(2)  # temporal length of the clip
            # stretch clip-level logits to per-frame logits over t frames
            per_frame_logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear', align_corners=True)
            probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
            preds = torch.max(probs, 1)[1]

            loss = criterion(per_frame_logits, labels)
            tot_loss += loss.item()
            loss.backward()

            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), train_batchind, len(train_dataloader)))

            # apply accumulated gradients every num_steps_per_update batches,
            # or at the end of the epoch (with however many were accumulated)
            if (num_iter == num_steps_per_update or train_batchind == len(train_dataloader) - 1):
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.

            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                model.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda(), requires_grad=True)
                labels = Variable(labels.cuda())
                labels = torch.argmax(labels, dim=1)

                with torch.no_grad():
                    logits = model(inputs)
                    t = inputs.size(2)
                    per_frame_logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear', align_corners=True)
                    probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
                    preds = torch.max(probs, 1)[1]
                    loss = criterion(per_frame_logits, labels)
                    acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)

                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), test_batchind, len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                model.train(True)

        if steps % 2 == 0:
            # save model (full resumable checkpoint, every other step)
            torch.save({"model_state_dict": model.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')

        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
def run(init_lr=0.1, max_step=64e3, mode='rgb', root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json', batch_size=8 * 5, save_model=''):
    """Fine-tune a Kinetics-pretrained I3D on Charades (157 action classes).

    Trains with SGD and gradient accumulation (4 batches per optimizer step),
    alternating a 'train' and a 'val' pass on each outer iteration, and saves
    a checkpoint every 10 optimizer steps.

    Args:
        init_lr: initial SGD learning rate.
        max_step: total number of optimizer steps to train for.
        mode: 'rgb' or 'flow'; selects input channels and pretrained weights.
        root: root directory of the Charades frames.
        train_split: path to the Charades split JSON.
        batch_size: clips per batch.
        save_model: path prefix for saved checkpoints.
    """
    # setup dataset
    # BUGFIX: was videotransforms.RandomHorisontalFlip (typo) — every other
    # training routine in this file uses RandomHorizontalFlip.
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                           videotransforms.RandomHorizontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.RandomCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, num_workers=36,
                                             pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                 shuffle=True, num_workers=36,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(157)  # Charades has 157 action classes
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    # BUGFIX: the loop previously read an undefined name `max_steps`
    # (NameError at runtime); the parameter is `max_step`.
    while steps < max_step:
        print('Step {}/{}'.format(steps, max_step))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)  # temporal length of the clip
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample predictions back to the input temporal length
                # (F.interpolate replaces the deprecated F.upsample)
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # BUGFIX: Tensor.data[0] was removed from modern PyTorch; .item()
                # is the supported scalar accessor (matches sibling functions).
                tot_loc_loss += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        # BUGFIX: converted Python-2 print statements to the
                        # print() function used everywhere else in this file.
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase,
                            tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update),
                            tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(),
                                   save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase,
                    tot_loc_loss / num_iter,
                    tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
def run(configs,
        mode='rgb',
        train_split='charades/charades.json',
        save_model='',
        weights=None,
        datasets=None):
    """Train I3D for isolated sign-language recognition with CE losses.

    Each example is one clip of a single gloss; labels carry the gloss id
    repeated along the time axis. Uses two cross-entropy terms: a per-frame
    "localization" loss and a clip-level classification loss on the
    time-max-pooled logits, averaged with gradient accumulation. Validation
    accuracy drives a ReduceLROnPlateau scheduler and checkpointing.

    Args:
        configs: object exposing batch_size, init_lr, adam_weight_decay,
            update_per_step and max_steps.
        mode: 'rgb' or 'flow'; selects input channels and pretrained weights.
        root: dataset root directory.
        train_split: path to the split JSON.
        save_model: path prefix for saved checkpoints.
        weights: optional state-dict path to load after replacing logits.
        datasets: optional {'train': ..., 'test': ...} pre-built datasets;
            built from train_split when None.
    """
    print(configs)

    # setup dataset: standard torch augmentations
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    if datasets is None:
        print('Setting up training dataset...')
        dataset = Dataset(train_split, 'train', root, mode, train_transforms)
        print('Setting up validation dataset...')
        val_dataset = Dataset(train_split, 'test', root, mode, test_transforms)
    else:
        print('Loading in datasets...')
        dataset = datasets['train']
        val_dataset = datasets['test']

    # num_workers=0 due to docker bug:
    # https://github.com/pytorch/pytorch/issues/1355#issuecomment-555091916
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=configs.batch_size,
                                             shuffle=True,
                                             num_workers=0,
                                             pin_memory=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=configs.batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}
    datasets = {'train': dataset, 'test': val_dataset}

    # setup the model: load Kinetics-400 pretrained weights, then swap the
    # classification head for this dataset's gloss vocabulary
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0
    best_val_score = 0
    # Reduce LR by `factor` after `patience` epochs without validation-loss
    # improvement (monitoring min).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                           patience=5,
                                                           factor=0.3)

    ce_loss = nn.CrossEntropyLoss()
    ce2_loss = nn.CrossEntropyLoss()

    num_epochs = 400
    while steps < configs.max_steps and epoch < num_epochs:
        print('Epoch #{}/{}'.format(epoch + 1, num_epochs))
        print()
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and validation(test) phase
        for phase in ['train', 'test']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()
            # confusion_matrix[gt, pred] — used for per-epoch accuracy
            # BUGFIX: dtype was np.int, which was removed in NumPy >= 1.24;
            # plain int is the supported equivalent.
            confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

            num_steps_loader = len(dataloaders[phase])
            # Iterate over data. Batch: batch_size x 3 x T x 224 x 224.
            for data in dataloaders[phase]:
                num_iter += 1
                # loader yields -1 when the video failed to decode
                # (opencv without ffmpeg on bracewell)
                if data == -1:
                    continue

                # labels: batch_size x T, same gloss id repeated per frame
                inputs, labels, vid = data
                if inputs.shape[0] == 0:
                    continue
                # drop faulty examples flagged with label -1
                gt_numpy = labels[:, 0]
                inputs = inputs[gt_numpy != -1]
                labels = labels[gt_numpy != -1]

                inputs = inputs.cuda()
                t = inputs.size(2)  # temporal length of the clip
                labels = labels.cuda()

                per_frame_logits = i3d(inputs, pretrained=False)
                # upsample pooled logits back to t frames:
                # batch_size x num_classes x t
                per_frame_logits = F.interpolate(per_frame_logits, t,
                                                 mode='linear',
                                                 align_corners=True)

                # localization loss over all frames
                loc_loss = ce_loss(per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()

                # clip-level prediction: most confident frame per class
                # (batch_size x num_classes)
                predictions = torch.max(per_frame_logits, dim=2)[0]
                # every column of labels repeats the gloss id, so the first
                # frame's label is the clip label
                gt = labels[..., 0]

                # classification loss (with max-pooling along time B x C x T)
                cls_loss = ce2_loss(predictions, gt)
                tot_cls_loss += cls_loss.item()

                # accumulate confusion matrix: row = ground truth, col = pred
                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[gt[i], torch.argmax(predictions[i]).item()] += 1

                # equal-weighted combination, scaled for gradient accumulation
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.item())
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(
                            confusion_matrix)
                        print('Step {}/{} within epoch - max steps: {}'.format(
                            steps, num_steps_loader, configs.max_steps))
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10, acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'test':
                val_score = float(
                    np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                if val_score > best_val_score or epoch % 2 == 0:
                    best_val_score = val_score
                    model_name = save_model + "nslt_" + str(
                        num_classes) + "_" + str(steps).zfill(
                            6) + '_%3f.pt' % val_score
                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
def run(configs,
        mode='rgb',
        train_split='charades/charades.json',
        save_model='',
        num_classes=None,
        weights=None):
    """Train I3D for sign-language recognition with BCE-with-logits losses.

    Combines a per-frame localization loss and a clip-level classification
    loss on time-max-pooled logits (both binary cross-entropy on multi-hot
    labels), with gradient accumulation. Validation accuracy drives a
    ReduceLROnPlateau scheduler and checkpointing.

    Args:
        configs: object exposing batch_size, init_lr, adam_weight_decay,
            update_per_step and max_steps.
        mode: 'rgb' or 'flow'; selects input channels and pretrained weights.
        root: dataset root directory.
        train_split: path to the split JSON.
        save_model: path prefix for saved checkpoints.
        num_classes: optional class count forwarded to the Dataset
            (the model head uses dataset.num_classes).
        weights: optional state-dict path to load after replacing logits.
    """
    print(configs)

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'train', root, mode,
                      num_classes=num_classes, transforms=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=configs.batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split, 'test', root, mode,
                          num_classes=num_classes, transforms=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=configs.batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}
    datasets = {'train': dataset, 'test': val_dataset}

    # setup the model: Kinetics-400 pretrained, then replace the head
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0
    best_val_score = 0
    # train it
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                           patience=5,
                                                           factor=0.3)
    while steps < configs.max_steps and epoch < 400:
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()
            # confusion_matrix[gt, pred] — used for per-epoch accuracy
            # BUGFIX: dtype was np.int, which was removed in NumPy >= 1.24;
            # plain int is the supported equivalent.
            confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # loader yields -1 when the video failed to decode
                # (opencv without ffmpeg on bracewell)
                if data == -1:
                    continue

                inputs, labels, vid = data

                inputs = inputs.cuda()
                t = inputs.size(2)  # temporal length of the clip
                labels = labels.cuda()

                per_frame_logits = i3d(inputs, pretrained=False)
                # upsample pooled logits back to t frames
                # (F.interpolate replaces the deprecated F.upsample)
                per_frame_logits = F.interpolate(per_frame_logits, t,
                                                 mode='linear')

                # compute localization loss over all frames
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()

                # clip-level scores/targets: max-pool along time (B x C x T)
                predictions = torch.max(per_frame_logits, dim=2)[0]
                gt = torch.max(labels, dim=2)[0]

                # classification loss on the pooled tensors
                # (reuses predictions/gt instead of recomputing both maxes)
                cls_loss = F.binary_cross_entropy_with_logits(predictions, gt)
                tot_cls_loss += cls_loss.item()

                # accumulate confusion matrix: row = ground truth, col = pred
                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[torch.argmax(gt[i]).item(),
                                     torch.argmax(predictions[i]).item()] += 1

                # equal-weighted combination, scaled for gradient accumulation
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.item())
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(
                            confusion_matrix)
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10, acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'test':
                val_score = float(
                    np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                if val_score > best_val_score or epoch % 2 == 0:
                    best_val_score = val_score
                    model_name = save_model + "nslt_" + str(
                        num_classes) + "_" + str(steps).zfill(
                            6) + '_%3f.pt' % val_score
                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
def run(init_lr=0.0001, max_steps=200, mode='rgb',
        root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',
        train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''):
    """Fine-tune I3D as a single-logit (binary) classifier on visual-tactile clips.

    Unlike the gradient-accumulating variants in this file, this routine
    steps the optimizer (and LR scheduler) on every batch, appends the
    running loss to 'i3d_video.txt', and saves a per-phase checkpoint after
    each train/val pass.

    Args:
        init_lr: initial SGD learning rate (also echoed in the log line).
        max_steps: outer-loop budget; steps advance once per completed phase.
        mode: 'rgb' or 'flow'; selects input channels and pretrained weights.
        root: dataset root directory.
        train_split / test_split: split list files.
        batch_size: clips per batch.
        save_model: path prefix for saved checkpoints.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, num_workers=3,
                                             pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                 shuffle=True, num_workers=3,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model: Kinetics-400 pretrained, head replaced by one logit
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(1)  # single output logit for binary classification
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0000001)
    # milestones shortened from [300, 1000] to fit the 200-step budget
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [100, 140])

    num_steps_per_update = 4  # accum gradient (unused: stepping is per batch)
    steps = 0
    # truncate/create the loss log before training
    with open('i3d_video.txt', 'w') as file:
        file.write("train and validation loss file\n")
    # train it
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print('phase : {}'.format(phase))
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            count = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # dataset yields (frontal video, lateral video, tactile
                # signal, position, labels); only f_vid and labels are used
                # — presumably the others feed sibling experiments. TODO confirm.
                f_vid, l_vid, tactile, pos, labels = data

                inputs = Variable(f_vid.cuda())
                t = inputs.size(2)  # temporal length of the clip
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs.float())
                # upsample pooled logits back to t frames
                per_frame_logits = F.interpolate(per_frame_logits, t,
                                                 mode='linear',
                                                 align_corners=False)

                # clip-level logit: max-pool along time, then drop the
                # singleton class dim -> shape (batch,)
                per_frame_logits = torch.max(per_frame_logits, dim=2)[0]
                per_frame_logits = per_frame_logits.squeeze(1)
                # cast both sides to double so dtypes match for BCE
                cls_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits.double(), labels.double())
                tot_cls_loss += cls_loss.item()

                # NOTE(review): backward + optimizer.step + lr_sched.step run
                # on EVERY batch here (no gradient accumulation), and the
                # scheduler therefore advances per batch, not per epoch.
                cls_loss.backward()
                print('{} Loss: {:.4f} and lr: {}'.format(
                    phase, tot_cls_loss / num_iter, init_lr))
                with open('i3d_video.txt', 'a') as file:
                    file.write("%f\n" % (tot_cls_loss / num_iter))
                optimizer.step()
                optimizer.zero_grad()
                lr_sched.step()

            # checkpoint once per phase; filename embeds the phase name
            torch.save(i3d.module.state_dict(),
                       save_model + phase + str(steps).zfill(6) + '.pt')
            tot_cls_loss = 0.
            steps += 1