def __init__(self, root, mode, test_dir, train_dir, save_model_withname=None,
             save_error_withname=None, checkpoint=None):
    self.root = root
    self.mode = mode
    self.test_dir = test_dir
    self.train_dir = train_dir
    self.save_model_withname = save_model_withname
    self.save_error_withname = save_error_withname
    self.checkpoint = checkpoint
    self.batch_size = 50
    self.learning_rate = 0.0001
    self.validation_loop = 0

    if self.mode == 'train':
        self.writer = tensorboardX.SummaryWriter(comment="train")
    else:
        self.writer = tensorboardX.SummaryWriter(comment="test")

    # setup dataset
    self.train_transforms = transforms.Compose([videotransforms.RandomCrop(112),
                                                videotransforms.RandomHorizontalFlip()])
    self.test_transforms = transforms.Compose([videotransforms.CenterCrop(112)])

    self.dataset = VisualTactile(self.root, self.train_dir, self.train_transforms)
    self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size,
                                                  shuffle=True, num_workers=1, pin_memory=True)

    self.val_dataset = VisualTactile(self.root, self.test_dir, self.test_transforms)
    self.val_dataloader = torch.utils.data.DataLoader(self.val_dataset, batch_size=1,
                                                      shuffle=False, num_workers=1, pin_memory=True)

    # self.dataloaders = {'train': self.dataloader, 'val': self.val_dataloader}
    # self.datasets = {'train': self.dataset, 'val': self.val_dataset}

    self.model, self.optimizer, self.scheduler = self.load_model(self.checkpoint)
def run(max_steps=64e3, load_model='', root='/l/vision/v7/wang617/taiwan', batch_size=1, save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(root, test_transforms, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=8)

    # setup the model
    i3d = InceptionI3d(400, in_channels=3)
    # i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    count = 0
    start = time.time()
    for data in dataloader:
        # get the inputs
        inputs, label, name = data
        label = str(label.numpy()[0])
        b, c, t, h, w = inputs.shape

        # feature extraction only: no gradients are needed
        with torch.no_grad():
            inputs = inputs.cuda()
            features = i3d.extract_features(inputs)
        np.save(os.path.join(save_dir, name[0]), features.squeeze().data.cpu().numpy())

        # append the video name and its label to the feature list
        f = open('/l/vision/v7/wang617/taiwan_data/i3d_feature_list.txt', 'a')
        f.writelines([name[0], ',', label, '\n'])

        count = count + 1
        if count % 100 == 0:
            current = time.time()
            print('Count {:2},|running time:{:.2f} sec'.format(count, current - start))
        f.close()
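# A minimal invocation sketch for the feature-extraction run() above; the
# checkpoint and output paths below are placeholders, not paths from the
# original script.
if __name__ == '__main__':
    run(load_model='models/rgb_imagenet.pt',   # assumed pretrained I3D weights
        root='/path/to/videos',                # assumed dataset root
        save_dir='/path/to/output_features')   # assumed output directory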
def run(max_steps=64e3, mode='rgb', root='', split='', batch_size=1, load_model='', save_dir=''): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True) # val_dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir) # val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) dataloaders = {'train': dataloader}#, 'val': val_dataloader} datasets = {'train': dataset}#, 'val': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) else: i3d = InceptionI3d(400, in_channels=3) i3d.replace_logits(400) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() for phase in ['train']: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in dataloaders[phase]: # get the inputs inputs, labels, name = data if os.path.exists(os.path.join(save_dir, name[0]+'.npy')): continue i=0 for input in inputs: i+=1 b,c,t,h,w = input.shape if t > 1600: features = [] for start in range(1, t-56, 1600): end = min(t-1, start+1600+56) start = max(1, start-48) ip = Variable(torch.from_numpy(input.numpy()[:,:,start:end]).cuda(), volatile=True) features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy()) np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0)) else: # wrap them in Variable input = Variable(input.cuda(), volatile=True) features = i3d.extract_features(input) new_path = os.path.join(save_dir, name[0], mode) if not os.path.exists(new_path): os.makedirs(new_path) np.save(os.path.join(new_path, str(i)), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
def run(max_steps=64e3, mode='rgb', root='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/Charades_v1_rgb', split='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades.json', batch_size=1, load_model='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/pytorch-i3d/models/rgb_charades.pt', save_dir='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades_features'): # setup dataset #root = '/ssd2/charades/Charades_v1_rgb', split = 'charades/charades.json', batch_size = 1, load_model = '', save_dir = '' # root = '/gpfs/home/lhe/xxw/xxw/super-events-cvpr18-master/data/charades/Charades_v1_rgb' test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) # print ( mode,root,split,batch_size) dataset = Dataset(split, 'training', root, mode, test_transforms, save_dir=save_dir) #num=-1, dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) val_dataset = Dataset(split, 'testing', root, mode, test_transforms, save_dir=save_dir)#num=-1, val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) else: i3d = InceptionI3d(400, in_channels=3) i3d.replace_logits(157) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() for phase in ['train', 'val']: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in dataloaders[phase]: # get the inputs inputs, labels, name = data if os.path.exists(os.path.join(save_dir, name[0]+'.npy')): continue b,c,t,h,w = inputs.shape if t > 1600: features = [] for start in range(1, t-56, 1600): end = min(t-1, start+1600+56) start = max(1, start-48) ip = Variable(torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda(), volatile=True) features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy()) np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0)) else: # wrap them in Variable inputs = Variable(inputs.cuda(), volatile=True) features = i3d.extract_features(inputs) np.save(os.path.join(save_dir, name[0]), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json', batch_size=3 * 15, save_model='',
        weights=None, num_classes=0):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    data = make_eval_json()
    class_map = make_label_map()

    val_dataset = Dataset(train_split, 'test', root, mode, data, num_classes, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False,
                                                 num_workers=2, pin_memory=False)

    dataloaders = {'test': val_dataloader}
    datasets = {'test': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    i3d.replace_logits(num_classes)
    # e.g. nslt_2000_000700.pt, nslt_1000_010800, nslt_300_005100.pt (best_results),
    # nslt_300_005500.pt (results_reported), nslt_2000_011400
    i3d.load_state_dict(torch.load(weights))
    # i3d.cuda()
    # i3d = nn.DataParallel(i3d)
    i3d.eval()

    preds = []
    for data in dataloaders["test"]:
        inputs, labels, video_id = data  # inputs: b, c, t, h, w
        per_frame_logits = i3d(inputs)
        predictions = torch.max(per_frame_logits, dim=2)[0]
        out_labels = np.argsort(predictions.cpu().detach().numpy()[0])
        out_probs = np.sort(predictions.cpu().detach().numpy()[0])
        print(class_map[out_labels[-1]])
        preds.append(class_map[out_labels[-1]])

    return preds
def predict_video(model_path, video_path, device):
    # Load model
    m = load_model(model_path).to(device)

    # Load rgb frames from video
    frames = load_rgb_frames_from_video(video_path, 0, -1, True)
    crop = videotransforms.CenterCrop(224)
    frames = video_to_tensor(crop(frames))

    logits = m(frames.unsqueeze(0).to(device))
    return logits[0, -1]
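# Hedged usage sketch for predict_video(); the checkpoint and video paths are
# placeholders, and the interpretation of the returned slice depends on how the
# model orders its class and time axes.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
out = predict_video('checkpoints/i3d_final.pt',   # assumed checkpoint path
                    'samples/clip_0001.mp4',      # assumed video path
                    device)
print(out.shape, out)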
def calculate_confusion_matrix():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes
    weight = args.weight

    print('Start to load data')
    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    test_dataset = IVBSSDataset(cabin_video_dir, face_video_dir, test_data_path, test_transforms)
    print('Total number of test samples is {0}'.format(len(test_dataset)))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)

    model = TAL_Net(num_classes)
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)
    model.cuda()
    model.eval()

    print('Start to calculate confusion matrix')
    all_predicts = []
    all_labels = []
    for i, (cabin_imgs, face_imgs, labels, start_labels, end_labels) in enumerate(test_dataloader):
        cabin_imgs = cabin_imgs.cuda()
        face_imgs = face_imgs.cuda()
        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_imgs, face_imgs)
        class_preds = torch.argmax(class_scores, dim=1)
        class_preds = class_preds.cpu().numpy()
        labels = labels.numpy()
        all_predicts.append(class_preds)
        all_labels.append(labels)

    all_predicts = np.concatenate(all_predicts)
    all_labels = np.concatenate(all_labels)
    cf_matrix = confusion_matrix(all_labels, all_predicts)
    normalized_confusion_matrix = confusion_matrix(all_labels, all_predicts, normalize='true')
    return cf_matrix, normalized_confusion_matrix
def load_data(dataset_path, batch_size=1):
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset_train = VidorDataset(dataset_path, 'training', test_transforms)
    dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size,
                                                   shuffle=False, num_workers=5, pin_memory=True)

    dataset_val = VidorDataset(dataset_path, 'validation', test_transforms)
    dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=1,
                                                 shuffle=False, num_workers=5, pin_memory=True)

    dataloaders = {'train': dataloader_train, 'val': dataloader_val}
    datasets = {'train': dataset_train, 'val': dataset_val}
    return datasets, dataloaders
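# Hedged usage sketch for the load_data() variant defined directly above; the
# dataset path is a placeholder. The returned dicts are iterated the same way
# the training loops elsewhere in this file iterate their dataloaders.
datasets, dataloaders = load_data('/path/to/vidor', batch_size=4)  # assumed path
for phase in ['train', 'val']:
    print(phase, 'samples:', len(datasets[phase]))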
def predict():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes

    print('Start to load data')
    test_transforms = transforms.Compose(
        [videotransforms.CenterCrop(224), videotransforms.ToTensor()])
    test_dataset = IVBSSDataset(face_video_dir, cabin_video_dir, test_data_path, test_transforms)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)

    # `pretrained_I3D_model` is assumed to be provided elsewhere (e.g. at module scope)
    model = TemporalActionLocalization(num_classes, pretrained_I3D_model)
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)
    model.cuda()
    model.eval()

    print('Start to test')
    test_loss = 0.0
    test_steps = 0
    for i, (face_imgs, cabin_imgs, labels) in enumerate(test_dataloader):
        face_imgs = face_imgs.cuda()
        cabin_imgs = cabin_imgs.cuda()
        for k, v in labels.items():
            labels[k] = v.cuda()
        # gradients are not needed at test time
        with torch.no_grad():
            loss = model(face_imgs, cabin_imgs, labels)
        test_loss += loss.item()
        test_steps += 1

    avg_test_loss = test_loss / test_steps
    return avg_test_loss
def load_data(dataset_path, batch_size=5, num_workers=10):
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset_train = VidorDataset(dataset_path, 'training', train_transforms)
    cls_weights = dataset_train.get_weights()
    dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size,
                                                   shuffle=True, num_workers=num_workers,
                                                   pin_memory=True)

    dataset_val = VidorDataset(dataset_path, 'validation', test_transforms)
    dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=1, shuffle=True,
                                                 num_workers=num_workers, pin_memory=True)

    dataloaders = {'train': dataloader_train, 'val': dataloader_val}
    datasets = {'train': dataset_train, 'val': dataset_val}
    return datasets, dataloaders, np.asarray(1 - cls_weights, dtype=np.float32)
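# Hedged sketch of one way the class-weight vector returned above could be
# consumed; the dataset path is a placeholder and feeding the weights into a
# weighted BCE loss is an assumption, not something the function itself does.
import torch

datasets, dataloaders, cls_weights = load_data('/path/to/vidor')  # assumed path
pos_weight = torch.from_numpy(cls_weights).cuda()
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)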
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/new_data.json', batch_size=8, save_model=''): # setup dataset print("Inside Run") train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip(), ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) print("Train Dataset") dataset = Dataset(train_split, 'training', root, mode, train_transforms) print("Train Dataset DataLoader") dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) print("Test Dataset") val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms) print("Test Dataset DataLoader") val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup the model if mode == 'flow': print("Running flow") i3d = InceptionI3d(400, in_channels=2) print("loading flow dict") i3d.load_state_dict(torch.load('models/flow_imagenet.pt')) else: print("Running rgb") i3d = InceptionI3d(400, in_channels=3) print("loading dict") # i3d.load_state_dict(torch.load('models/rgb_imagenet.pt')) i3d.replace_logits(2) print("Replaced logits") #i3d.load_state_dict(torch.load('/ssd/models/000920.pt')) i3d.cuda() i3d = nn.DataParallel(i3d) print("Parallel Data") lr = init_lr print("Initializing SGD") optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) print("Scheduling some multistep") lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000]) num_steps_per_update = 4 # accum gradient steps = 0 # train it print("Starting Training") while steps < max_steps: #for epoch in range(num_epochs): print 'Step {}/{}'.format(steps, max_steps) print '-' * 10 # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': print("Training") i3d.train(True) else: print("Validation") i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 num_iter = 0 optimizer.zero_grad() # Iterate over data. for data in dataloaders[phase]: num_iter += 1 # get the inputs inputs, labels = data # wrap them in Variable inputs = Variable(inputs.cuda()) t = inputs.size(2) labels = Variable(labels.cuda()) per_frame_logits = i3d(inputs) # upsample to input size per_frame_logits = F.upsample(per_frame_logits, t, mode='linear') # compute localization loss loc_loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) tot_loc_loss += loc_loss.data # compute classification loss (with max-pooling along time B x C x T) cls_loss = F.binary_cross_entropy_with_logits( torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0]) tot_cls_loss += cls_loss.data loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update tot_loss += loss.data loss.backward() if num_iter == num_steps_per_update and phase == 'train': steps += 1 num_iter = 0 optimizer.step() optimizer.zero_grad() lr_sched.step() if steps % 10 == 0: print '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format( phase, tot_loc_loss / (10 * num_steps_per_update), tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10) # save model torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt') tot_loss = tot_loc_loss = tot_cls_loss = 0. 
if phase == 'val':
    print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
        phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
        (tot_loss * num_steps_per_update) / num_iter))
def predict_events(cabin_video_path, face_video_path, args): if torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' checkpoint = args.checkpoint clip_length = args.clip_length clip_stride = args.clip_stride batch_size = args.batch_size num_classes = args.num_classes threshold = args.threshold cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length, clip_stride) model = TAL_Net(num_classes) ckp = torch.load(checkpoint) model.load_state_dict(ckp['model']) model.to(device) model.eval() clip_transforms = transforms.Compose([videotransforms.CenterCrop(224), videotransforms.ToTensor(), videotransforms.ClipNormalize() ]) all_clips = [] all_predict_classes = [] all_start_scores = [] all_end_scores = [] n = len(cabin_clips) // batch_size for i in range(n): cabin_video_frames_batch = [] face_video_frames_batch = [] for j in range(i * batch_size, (i + 1) * batch_size): cabin_clip = cabin_clips[j] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[j] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) if len(cabin_clips) % batch_size != 0: cabin_video_frames_batch = [] face_video_frames_batch = [] for k in range(n * batch_size, len(cabin_clips)): cabin_clip = cabin_clips[k] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[k] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) all_predict_classes = np.concatenate(all_predict_classes) all_start_scores = np.concatenate(all_start_scores) all_end_scores = np.concatenate(all_end_scores) print(all_predict_classes) # refined chunk aggregation cabin_frames = os.listdir(cabin_video_path) cabin_frame_length = len(cabin_frames) cabin_indices = np.arange(start=0, stop=cabin_frame_length - 
clip_stride + 1, step=clip_stride) indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) for idx in cabin_indices] # remainder = cabin_frame_length % clip_stride # if remainder != 0: # indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length))) print(len(indices_in_shorter_clips)) print(len(indices_in_cabin_clips)) shorter_clip_predict_classes = [] for i in range(len(indices_in_shorter_clips)): if i == 0: shorter_clip_predict_classes.append(all_predict_classes[0]) elif i == 1: l = [all_predict_classes[0], all_predict_classes[1]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == 2: l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i < len(indices_in_cabin_clips): l = [all_predict_classes[j] for j in range(i-3, i+1)] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips): index = len(indices_in_cabin_clips) - 1 l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips) + 1: index = len(indices_in_cabin_clips) - 1 l = [all_predict_classes[index-1], all_predict_classes[index]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips) + 2: index = len(indices_in_cabin_clips) - 1 shorter_clip_predict_classes.append(all_predict_classes[index]) print(shorter_clip_predict_classes) # extract start and end peaks start_peak_indices = [] end_peak_indices = [] if all_start_scores[0] > all_start_scores[1]: start_peak_indices.append(0) for i in range(1, len(all_start_scores) - 1): if all_start_scores[i] > all_start_scores[i - 1]: if all_start_scores[i] > all_start_scores[i + 1]: start_peak_indices.append(i) if all_end_scores[i] > all_end_scores[i - 1]: if all_end_scores[i] > all_end_scores[i + 1]: end_peak_indices.append(i) if all_end_scores[-1] > all_end_scores[-2]: end_peak_indices.append(len(cabin_clips) - 1) j = 0 copy_start_peak_indices = start_peak_indices.copy() while j < len(start_peak_indices) - 1: index1 = copy_start_peak_indices[j] index2 = copy_start_peak_indices[j + 1] if index1 + 4 < index2: j += 1 else: if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j + 1]]: copy_start_peak_indices[j] = index2 copy_start_peak_indices.pop(j + 1) start_peak_indices.pop(j + 1) else: copy_start_peak_indices.pop(j) start_peak_indices.pop(j) k = 0 copy_end_peak_indices = end_peak_indices.copy() while k < len(end_peak_indices) - 1: index1 = copy_end_peak_indices[k] index2 = copy_end_peak_indices[k + 1] if index1 + 4 < index2: k += 1 else: if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k + 1]]: copy_end_peak_indices[k] = index2 copy_end_peak_indices.pop(k + 1) end_peak_indices.pop(k + 1) else: copy_end_peak_indices.pop(k) end_peak_indices.pop(k) selected_starts = [] selected_ends = [] for start_indice in start_peak_indices: if all_start_scores[start_indice] > threshold: selected_starts.append(start_indice) for end_indice in end_peak_indices: if all_end_scores[end_indice] > threshold: selected_ends.append(end_indice+3) print(selected_starts) print(selected_ends) rough_clip_groups = defaultdict(list) for i in range(len(shorter_clip_predict_classes)): if shorter_clip_predict_classes[i] != 0: rough_clip_groups[shorter_clip_predict_classes[i]].append(i) 
print(rough_clip_groups) # all_refined_clip_groups = dict() # for key in rough_clip_groups.keys(): # clip_group = rough_clip_groups[key] # refined_groups = [] # previous = 0 # i = 0 # while i < len(clip_group) - 1: # if clip_group[i] in selected_starts: # previous = i # elif clip_group[i] in selected_ends: # refined_groups.append(clip_group[previous:(index+1)]) # j = i + 1 # while j < len(clip_group) - 1: # if clip_group[j] - clip_group[j-1] == 1: # j += 1 # else: # previous = j # i = j # break # elif clip_group[i] + 2 < clip_group[i+1]: # refined_groups.append(clip_group[previous:(i+1)]) # previous = i+1 # i += 1 # print(previous, i) # if previous < len(clip_group) - 1: # refined_groups.append(clip_group[previous:]) # all_refined_clip_groups[key] = refined_groups # print(all_refined_clip_groups) all_refined_clip_groups = dict() for key in rough_clip_groups.keys(): clip_group = rough_clip_groups[key] refined_groups = [] previous = 0 i = 0 while i < len(clip_group) - 1: if clip_group[i] + 2 < clip_group[i+1]: refined_groups.append(clip_group[previous:(i+1)]) previous = i+1 i += 1 refined_groups.append(clip_group[previous:]) all_refined_clip_groups[key] = refined_groups print(all_refined_clip_groups) keys = list(all_refined_clip_groups) if len(keys) == 2: k1 = keys[0] k2 = keys[1] groups1 = all_refined_clip_groups[k1] groups2 = all_refined_clip_groups[k2] i = 0 j = 0 while i < len(groups1): while j < len(groups2): min_index1 = min(groups1[i]) max_index1 = max(groups1[i]) min_index2 = min(groups2[j]) max_index2 = max(groups2[j]) set1 = set(range(min_index1, max_index1+1)) set2 = set(range(min_index2, max_index2+1)) if set1.issubset(set2) == True: groups1.remove(groups1[i]) if i >= len(groups1): break elif set2.issubset(set1) == True: groups2.remove(groups2[j]) else: if max_index1 > max_index2: j += 1 else: break i += 1 filtered_all_clip_groups = { k1:groups1, k2:groups2 } else: filtered_all_clip_groups = all_refined_clip_groups print(filtered_all_clip_groups) # add start and end information final_all_clip_groups = {} for key in filtered_all_clip_groups.keys(): clip_groups = filtered_all_clip_groups[key] all_clip_groups = [] for clip_group in clip_groups: if len(clip_group) > 6: start_clip = min(clip_group) end_clip = max(clip_group) for selected_start in selected_starts: if selected_start > start_clip and selected_start < start_clip + 3: start_clip = selected_start for selected_end in selected_ends: if selected_end > end_clip - 3 and selected_end < end_clip: end_clip = selected_end clip_group = list(range(start_clip, end_clip+1)) all_clip_groups.append(clip_group) final_all_clip_groups[key] = all_clip_groups all_clip_frame_groups = {} for key in final_all_clip_groups.keys(): final_groups = final_all_clip_groups[key] clip_frame_groups = [] for group in final_groups: clip_frame_group = set() for index in group: clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index])) start_frame = min(clip_frame_group) + 1 end_frame = max(clip_frame_group) + 1 clip_frame_groups.append([start_frame, end_frame]) all_clip_frame_groups[key] = clip_frame_groups return all_clip_frame_groups
def run(init_lr=0.01, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\ train_split='train.txt', test_split='test.txt', batch_size=1, save_model=''): print(train_split, test_split) writer = tensorboardX.SummaryWriter() # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(train_split, root, mode, test_transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True) val_dataset = Dataset(test_split, root, mode, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup the model sm = InceptionI3d(400, in_channels=3) sm.replace_logits(1) #add your network here fusedNet = FusionNet(sm) if torch.cuda.is_available(): fusedNet.cuda() fusedNet = nn.DataParallel(fusedNet) lr = init_lr optimizer = optim.SGD(fusedNet.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200]) if torch.cuda.is_available(): data = torch.load(save_model) else: data = torch.load(save_model, map_location=lambda storage, loc: storage) fusedNet.load_state_dict(data['model_state']) optimizer.load_state_dict(data['optimizer_state']) lr_sched.load_state_dict(data['scheduler_state']) steps = 0 with open('inference_V.txt', 'w') as file: file.write("train and validation loss file\n") # train it # Each epoch has a training and validation phase fusedNet.train(False) # Set model to evaluate mode for phase in ['train', 'val']: print('phase : {}'.format(phase)) tot_cls_loss = 0.0 num_iter = 0 count = 0 # optimizer.zero_grad() with open('inference_V.txt', 'a') as file: file.write("---------------\n") # Iterate over data. for data in dataloaders[phase]: num_iter += 1 # get the inputs f_vid, l_vid, tactile, pos, labels = data if torch.cuda.is_available(): rgb_inputs = Variable(f_vid.cuda()) t = rgb_inputs.size(2) labels = Variable(labels.cuda()) else: rgb_inputs = Variable(f_vid) t = rgb_inputs.size(2) labels = Variable(labels) out = fusedNet(rgb_inputs.float()) #print('prediction output = ', per_frame_logits.shape) #print('labels = ',labels.shape) # compute classification loss (with max-pooling along time B x C x T) out = out.squeeze(1) cls_loss = F.binary_cross_entropy_with_logits( out.double(), labels.double()) tot_cls_loss += cls_loss.item() # cls_loss.backward() print('{} Loss: {:.4f} and lr: {}'.format(phase, tot_cls_loss / num_iter, init_lr)) with open('inference_V.txt', 'a') as file: file.write("%f\n" % (tot_cls_loss / num_iter)) # optimizer.step() # optimizer.zero_grad() if phase == 'val': writer.add_scalar('inference_error/' + phase, (tot_cls_loss / num_iter), num_iter) else: writer.add_scalar('inference_error/' + phase, (tot_cls_loss / num_iter), num_iter)
def run(init_lr=0.01, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\ train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''): writer = tensorboardX.SummaryWriter() # setup dataset train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip(), ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(train_split, root, mode, train_transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=3, pin_memory=True) val_dataset = Dataset(test_split, root, mode, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=3, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup the model sm = InceptionI3d(400, in_channels=3) sm.load_state_dict(torch.load('models/rgb_imagenet.pt')) #tm = InceptionI3d(400, in_channels=2) #tm.load_state_dict(torch.load('models/flow_imagenet.pt')) sm.replace_logits(1) sm = freeze_network_layer(sm) #add your network here fusedNet = FusionNet(sm) if torch.cuda.is_available(): fusedNet.cuda() fusedNet = nn.DataParallel(fusedNet) lr = init_lr optimizer = optim.SGD(fusedNet.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200]) steps = 0 with open('i3d_video.txt', 'w') as file: file.write("train and validation loss file\n") # train it while steps < max_steps: #for epoch in range(num_epochs): print('Step {}/{}'.format(steps, max_steps)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'val']: print('phase : {}'.format(phase)) if phase == 'train': fusedNet.train(True) else: fusedNet.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 num_iter = 0 count = 0 optimizer.zero_grad() # Iterate over data. for data in dataloaders[phase]: num_iter += 1 # get the inputs f_vid, l_vid, tactile, pos, labels = data if torch.cuda.is_available(): inputs = Variable(f_vid.cuda()) t = inputs.size(2) labels = Variable(labels.cuda()) else: inputs = Variable(f_vid) t = inputs.size(2) labels = Variable(labels) per_frame_logits = fusedNet(inputs.float()) #print('prediction output = ', per_frame_logits.shape) #print('labels = ',labels.shape) # compute classification loss (with max-pooling along time B x C x T) per_frame_logits = per_frame_logits.squeeze(1) cls_loss = F.binary_cross_entropy_with_logits( per_frame_logits.double(), labels.double()) tot_cls_loss += cls_loss.item() cls_loss.backward() print('{} Loss: {:.4f} and lr: {}'.format( phase, tot_cls_loss / num_iter, init_lr)) with open('i3d_video.txt', 'a') as file: file.write("%f\n" % (tot_cls_loss / num_iter)) optimizer.step() optimizer.zero_grad() if phase == 'val': writer.add_scalar('error/' + phase, (tot_cls_loss / num_iter), num_iter) else: writer.add_scalar('error/' + phase, (tot_cls_loss / num_iter), num_iter) if (steps % 50 == 0): torch.save( fusedNet.module.state_dict(), save_model + phase + str(steps).zfill(6) + '.pt') save_checkpoint(fusedNet, optimizer, lr_sched, steps) #save error at every epoch writer.add_scalar('errorAtEpoch/' + phase, (tot_cls_loss / num_iter), steps) tot_cls_loss = 0. 
# if (steps % 50 == 0):
#     torch.save(fusedNet.module.state_dict(), save_model + phase + str(steps).zfill(6) + '.pt')
#     save_checkpoint(fusedNet, optimizer, lr_sched, steps)
steps += 1
lr_sched.step()
def ensemble(mode, root, train_split, weights, num_classes): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) # test_transforms = transforms.Compose([]) val_dataset = Dataset(train_split, 'test', root, mode, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=2, pin_memory=False) dataloaders = {'test': val_dataloader} datasets = {'test': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) i3d.load_state_dict(torch.load('weights/flow_imagenet.pt')) else: i3d = InceptionI3d(400, in_channels=3) i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt')) i3d.replace_logits(num_classes) i3d.load_state_dict(torch.load(weights)) # nslt_2000_000700.pt nslt_1000_010800 nslt_300_005100.pt(best_results) nslt_300_005500.pt(results_reported) nslt_2000_011400 i3d.cuda() i3d = nn.DataParallel(i3d) i3d.eval() correct = 0 correct_5 = 0 correct_10 = 0 # confusion_matrix = np.zeros((num_classes,num_classes), dtype=np.int) top1_fp = np.zeros(num_classes, dtype=np.int) top1_tp = np.zeros(num_classes, dtype=np.int) top5_fp = np.zeros(num_classes, dtype=np.int) top5_tp = np.zeros(num_classes, dtype=np.int) top10_fp = np.zeros(num_classes, dtype=np.int) top10_tp = np.zeros(num_classes, dtype=np.int) for data in dataloaders["test"]: inputs, labels, video_id = data # inputs: b, c, t, h, w t = inputs.size(2) num = 64 if t > num: num_segments = math.floor(t / num) segments = [] for k in range(num_segments): segments.append(inputs[:, :, k*num: (k+1)*num, :, :]) segments = torch.cat(segments, dim=0) per_frame_logits = i3d(segments) predictions = torch.mean(per_frame_logits, dim=2) if predictions.shape[0] > 1: predictions = torch.mean(predictions, dim=0) else: per_frame_logits = i3d(inputs) predictions = torch.mean(per_frame_logits, dim=2)[0] out_labels = np.argsort(predictions.cpu().detach().numpy()) if labels[0].item() in out_labels[-5:]: correct_5 += 1 top5_tp[labels[0].item()] += 1 else: top5_fp[labels[0].item()] += 1 if labels[0].item() in out_labels[-10:]: correct_10 += 1 top10_tp[labels[0].item()] += 1 else: top10_fp[labels[0].item()] += 1 if torch.argmax(predictions).item() == labels[0].item(): correct += 1 top1_tp[labels[0].item()] += 1 else: top1_fp[labels[0].item()] += 1 print(video_id, float(correct) / len(dataloaders["test"]), float(correct_5) / len(dataloaders["test"]), float(correct_10) / len(dataloaders["test"])) top1_per_class = np.mean(top1_tp / (top1_tp + top1_fp)) top5_per_class = np.mean(top5_tp / (top5_tp + top5_fp)) top10_per_class = np.mean(top10_tp / (top10_tp + top10_fp)) print('top-k average per class acc: {}, {}, {}'.format(top1_per_class, top5_per_class, top10_per_class))
def run(configs, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/charades.json', save_model='', num_classes=None, weights=None): print(configs) # setup dataset train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip(), ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(train_split, 'train', root, mode, num_classes=num_classes, transforms=train_transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=configs.batch_size, shuffle=True, num_workers=4, pin_memory=True) val_dataset = Dataset(train_split, 'test', root, mode, num_classes=num_classes, transforms=test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=configs.batch_size, shuffle=True, num_workers=4, pin_memory=False) dataloaders = {'train': dataloader, 'test': val_dataloader} datasets = {'train': dataset, 'test': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) i3d.load_state_dict(torch.load('weights/flow_imagenet.pt')) else: i3d = InceptionI3d(400, in_channels=3) i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt')) num_classes = dataset.num_classes i3d.replace_logits(num_classes) if weights: print('loading weights {}'.format(weights)) i3d.load_state_dict(torch.load(weights)) i3d.cuda() i3d = nn.DataParallel(i3d) lr = configs.init_lr weight_decay = configs.adam_weight_decay optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay) num_steps_per_update = configs.update_per_step # accum gradient steps = 0 epoch = 0 best_val_score = 0 # train it scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.3) while steps < configs.max_steps and epoch < 400: # for epoch in range(num_epochs): print('Step {}/{}'.format(steps, configs.max_steps)) print('-' * 10) epoch += 1 # Each epoch has a training and validation phase for phase in ['train', 'test']: collected_vids = [] if phase == 'train': i3d.train(True) else: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 num_iter = 0 optimizer.zero_grad() confusion_matrix = np.zeros((num_classes, num_classes), dtype=np.int) # Iterate over data. 
for data in dataloaders[phase]: num_iter += 1 # get the inputs if data == -1: # bracewell does not compile opencv with ffmpeg, strange errors occur resulting in no video loaded continue # inputs, labels, vid, src = data inputs, labels, vid = data # wrap them in Variable inputs = inputs.cuda() t = inputs.size(2) labels = labels.cuda() per_frame_logits = i3d(inputs, pretrained=False) # upsample to input size per_frame_logits = F.upsample(per_frame_logits, t, mode='linear') # compute localization loss loc_loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) tot_loc_loss += loc_loss.data.item() predictions = torch.max(per_frame_logits, dim=2)[0] gt = torch.max(labels, dim=2)[0] # compute classification loss (with max-pooling along time B x C x T) cls_loss = F.binary_cross_entropy_with_logits( torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0]) tot_cls_loss += cls_loss.data.item() for i in range(per_frame_logits.shape[0]): confusion_matrix[torch.argmax(gt[i]).item(), torch.argmax(predictions[i]).item()] += 1 loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update tot_loss += loss.data.item() if num_iter == num_steps_per_update // 2: print(epoch, steps, loss.data.item()) loss.backward() if num_iter == num_steps_per_update and phase == 'train': steps += 1 num_iter = 0 optimizer.step() optimizer.zero_grad() # lr_sched.step() if steps % 10 == 0: acc = float(np.trace(confusion_matrix)) / np.sum( confusion_matrix) print( 'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}' .format(epoch, phase, tot_loc_loss / (10 * num_steps_per_update), tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10, acc)) tot_loss = tot_loc_loss = tot_cls_loss = 0. if phase == 'test': val_score = float( np.trace(confusion_matrix)) / np.sum(confusion_matrix) if val_score > best_val_score or epoch % 2 == 0: best_val_score = val_score model_name = save_model + "nslt_" + str( num_classes) + "_" + str(steps).zfill( 6) + '_%3f.pt' % val_score torch.save(i3d.module.state_dict(), model_name) print(model_name) print( 'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}' .format(phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter, (tot_loss * num_steps_per_update) / num_iter, val_score)) scheduler.step(tot_loss * num_steps_per_update / num_iter)
def run(cfg): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(root=cfg['data_dir'], mode=cfg['mode'], transforms=test_transforms, num=-1, save_dir=cfg['save_dir']) dataloader = torch.utils.data.DataLoader(dataset, batch_size=cfg['batch_size'], shuffle=True, num_workers=16, pin_memory=True) # setup the model if cfg['mode'] == 'flow': i3d = InceptionI3d(400, in_channels=2) else: i3d = InceptionI3d(400, in_channels=3) # i3d.replace_logits(157) i3d.load_state_dict(torch.load(cfg['load_model'])) i3d.cuda() i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 map_dir = cfg['save_dir'] + '_map' if not os.path.exists(cfg['save_dir']): os.mkdir(cfg['save_dir']) if not os.path.exists(map_dir): os.mkdir(map_dir) # Iterate over data. for data in tqdm(dataloader): # get the inputs inputs, name = data mov = '_'.join(name[0].split('_')[:-1]) if not os.path.exists(os.path.join(cfg['save_dir'], mov)): os.mkdir(os.path.join(cfg['save_dir'], mov)) elif os.path.exists( os.path.join(cfg['save_dir'], mov, name[0] + '.npy')): continue if not os.path.exists(os.path.join(map_dir, mov)): os.mkdir(os.path.join(map_dir, mov)) b, c, t, h, w = inputs.shape #print('LOG: {} shape: {}'.format(name[0], inputs.shape)) if t > 1600: features = [] maps = [] for start in range(1, t - 56, 1600): end = min(t - 1, start + 1600 + 56) do_end_crop = True if end == start + 1600 + 56 else False start = max(1, start - 48) do_start_crop = True if start != 1 else False ip = Variable(torch.from_numpy( inputs.numpy()[:, :, start:end]).cuda(), volatile=True) map_pool, avg_pool = i3d.extract_features(ip) map_pool = map_pool.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy() avg_pool = avg_pool.squeeze(0).squeeze(-1).squeeze(-1).permute( -1, 0).data.cpu().numpy() if do_end_crop: #print('LOG: do end crop') map_pool = map_pool[:-6, :, :, :] avg_pool = avg_pool[:-6, :] if do_start_crop: #print('LOG: do start crop') map_pool = map_pool[6:, :, :, :] avg_pool = avg_pool[6:, :] maps.append(map_pool) features.append(avg_pool) #print('LOG: maps: {}, features: {}'.format(map_pool.shape, avg_pool.shape)) np.save(os.path.join(cfg['save_dir'], mov, name[0]), np.concatenate(features, axis=0)) np.save(os.path.join(map_dir, mov, name[0]), np.concatenate(maps, axis=0)) else: inputs = Variable(inputs.cuda(), volatile=True) map_pool, avg_pool = i3d.extract_features(inputs) #print('LOG: maps: {}, features: {}'.format(map_pool.shape, avg_pool.shape)) np.save( os.path.join(cfg['save_dir'], mov, name[0]), avg_pool.squeeze(0).squeeze(-1).squeeze(-1).permute( -1, 0).data.cpu().numpy()) np.save(os.path.join(map_dir, mov, name[0]), map_pool.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='../data/BL_and_PL', train_split='../ms-tcn/data/BL_and_PL/groundTruth/annotations.json', batch_size=1, save_model=''): # Transforms train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip() ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) # Dataset and dataloader dataset = Dataset(train_split, 'training', root, mode, train_transforms, save_dir='saveds') dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True) dataloaders = {'train': dataloader} datasets = {'train': dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) i3d.load_state_dict(torch.load('models/flow_imagenet.pt')) else: i3d = InceptionI3d(400, in_channels=3) i3d.load_state_dict(torch.load('models/rgb_imagenet.pt')) # We have 13 different classes. i3d.replace_logits(13) #157) i3d.cuda() i3d = nn.DataParallel(i3d) # Optimization stuff lr = init_lr optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000]) num_steps_per_update = 4 # accum gradient steps = 0 # train: while steps < max_steps: #for epoch in range(num_epochs): print('Step {}/{}'.format(steps, max_steps)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train']: #, 'val']: # Train model if in train phase i3d.train(phase == 'train') tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 num_iter = 0 optimizer.zero_grad() # Iterate over data. for data in dataloaders[phase]: num_iter += 1 # get the inputs inputs, labels, vid = data # wrap them in Variable inputs = Variable(inputs.cuda()) t = inputs.size(2) labels = Variable(labels.cuda()) per_frame_logits = i3d(inputs) # upsample to input size pdb.set_trace() per_frame_logits = F.upsample(per_frame_logits, t, mode='linear') # compute localization loss loc_loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) tot_loc_loss += loc_loss.item() #.data[0] # compute classification loss (with max-pooling along time B x C x T) cls_loss = F.binary_cross_entropy_with_logits( torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0]) tot_cls_loss += cls_loss.item() #.data[0] loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update tot_loss += loss.item() #.data[0] loss.backward() if num_iter == num_steps_per_update and phase == 'train': steps += 1 num_iter = 0 optimizer.step() optimizer.zero_grad() lr_sched.step() if steps % 10 == 0: print( '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}' .format(phase, tot_loc_loss / (10 * num_steps_per_update), tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10)) # save model torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt') tot_loss = tot_loc_loss = tot_cls_loss = 0. if phase == 'val': print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'. format(phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter, (tot_loss * num_steps_per_update) / num_iter))
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/charades.json', batch_size=3 * 15, save_model='', weights=None): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) val_dataset = Dataset(train_split, 'test', root, mode, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=2, pin_memory=False) dataloaders = {'test': val_dataloader} datasets = {'test': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) i3d.load_state_dict(torch.load('weights/flow_imagenet.pt')) else: i3d = InceptionI3d(400, in_channels=3) i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt')) i3d.replace_logits(num_classes) i3d.load_state_dict(torch.load(weights)) # nslt_2000_000700.pt nslt_1000_010800 nslt_300_005100.pt(best_results) nslt_300_005500.pt(results_reported) nslt_2000_011400 i3d.cuda() i3d = nn.DataParallel(i3d) i3d.eval() correct = 0 correct_5 = 0 correct_10 = 0 top1_fp = np.zeros(num_classes, dtype=np.int) top1_tp = np.zeros(num_classes, dtype=np.int) top5_fp = np.zeros(num_classes, dtype=np.int) top5_tp = np.zeros(num_classes, dtype=np.int) top10_fp = np.zeros(num_classes, dtype=np.int) top10_tp = np.zeros(num_classes, dtype=np.int) for data in dataloaders["test"]: inputs, labels, video_id = data # inputs: b, c, t, h, w per_frame_logits = i3d(inputs) predictions = torch.max(per_frame_logits, dim=2)[0] out_labels = np.argsort(predictions.cpu().detach().numpy()[0]) out_probs = np.sort(predictions.cpu().detach().numpy()[0]) if labels[0].item() in out_labels[-5:]: correct_5 += 1 top5_tp[labels[0].item()] += 1 else: top5_fp[labels[0].item()] += 1 if labels[0].item() in out_labels[-10:]: correct_10 += 1 top10_tp[labels[0].item()] += 1 else: top10_fp[labels[0].item()] += 1 if torch.argmax(predictions[0]).item() == labels[0].item(): correct += 1 top1_tp[labels[0].item()] += 1 else: top1_fp[labels[0].item()] += 1 print(video_id, float(correct) / len(dataloaders["test"]), float(correct_5) / len(dataloaders["test"]), float(correct_10) / len(dataloaders["test"])) # per-class accuracy top1_per_class = np.mean(top1_tp / (top1_tp + top1_fp)) top5_per_class = np.mean(top5_tp / (top5_tp + top5_fp)) top10_per_class = np.mean(top10_tp / (top10_tp + top10_fp)) print('top-k average per class acc: {}, {}, {}'.format(top1_per_class, top5_per_class, top10_per_class))
def run(max_steps=64e3, mode='rgb', root='/ssd2/charades/Charades_v1_rgb', batch_size=1, load_model='', save_dir=''): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(root, mode, test_transforms, save_dir=save_dir) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) #val_dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir) #val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) dataloaders = {'train': dataloader} datasets = {'train': dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2) else: i3d = InceptionI3d(400, in_channels=3) i3d.replace_logits(157) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() if not os.path.exists(save_dir): os.makedirs(save_dir) for phase in ['train']: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in tqdm(dataloaders[phase]): # get the inputs inputs, labels, name = data name = name[0].split('frames_hq/')[1:] parent_dir = os.path.join(save_dir, os.path.dirname(name[0])) if not os.path.exists(parent_dir): os.makedirs(parent_dir) #if os.path.exists(os.path.join(save_dir, name[0]+'.npy')): # continue b,c,t,h,w = inputs.shape if t > 128: features = [] for start in range(0, t, 128): end = min(t, start + 128) if (end - start < 8): start = start - 8 with torch.no_grad(): ip = Variable(torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda()) features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy()) np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0)) else: # wrap them in Variable inputs = Variable(inputs.cuda(), volatile=True) features = i3d.extract_features(inputs) np.save(os.path.join(save_dir, name[0]), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
args.video_dir))

if not os.path.exists(args.images_dir):
    os.makedirs(args.images_dir)
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir)

# create I3D model and load pre-trained model
i3d_model = InceptionI3d(400, in_channels=3)
if args.use_finetuned:
    i3d_model.replace_logits(157)  # charades has 157 activity types
i3d_model.load_state_dict(torch.load(args.load_model))
i3d_model.cuda()
i3d_model.train(False)

video_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# load video ids
video_ids = []
for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]:
    with open(os.path.join(args.dataset_dir, filename), mode="r", encoding="utf-8") as f:
        for line in f:
            line = line.lstrip().rstrip()
            if len(line) == 0:
                continue
            vid = line.split("##")[0].split(" ")[0]
            video_ids.append(vid)
video_ids = list(set(video_ids))
def run(init_lr=0.1, max_steps=1e8, mode='rgb', dataset='thumos', root_train='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/', root_eval='/mnt/data_a/alex/PyTorch_I3D/thumos/test/', train_split='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/validation_thumos.json', eval_split='/mnt/data_a/alex/PyTorch_I3D/thumos/test/test_thumos.json', batch_size=4, batch_size_eval=4, save_model='', snippets=64, saving_steps=5000, num_steps_per_update=1, num_classes=65, crf=False, num_updates_crf=1, reg_crf=-1, use_cls=False, pairwise_cond_crf=False, reg_type='l2'): # setup dataset train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip(), ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(train_split, 'training', root_train, mode, snippets, train_transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True, drop_last=True) val_dataset = Dataset(eval_split, 'testing', root_eval, mode, snippets, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size_eval, shuffle=True, num_workers=8, pin_memory=True, drop_last=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup model steps = 0 epoch = 0 if not os.path.exists(args.save_model): subprocess.call('mkdir ' + args.save_model, shell=True) configure(args.save_model + "tensorboard_logger", flush_secs=5) # resume the training or load the pre-trained I3D checkpoint = -1 try: checkpoint = last_checkpoint(args.save_model) except: print("Loading the pre-trained I3D") if mode == 'flow': i3d = InceptionI3d(400, in_channels=2, use_crf=crf, num_updates_crf=num_updates_crf, pairwise_cond_crf=pairwise_cond_crf) total_dict = i3d.state_dict() partial_dict = torch.load('models/flow_imagenet.pt') total_dict.update(partial_dict) i3d.load_state_dict(total_dict) else: i3d = InceptionI3d(400, in_channels=3, use_crf=crf, num_updates_crf=num_updates_crf, pairwise_cond_crf=pairwise_cond_crf) total_dict = i3d.state_dict() partial_dict = torch.load('models/rgb_imagenet.pt') total_dict.update(partial_dict) i3d.load_state_dict(total_dict) i3d.replace_logits(num_classes) if (checkpoint != -1): if mode == 'flow': i3d = InceptionI3d(num_classes, in_channels=2, use_crf=crf, num_updates_crf=num_updates_crf, pairwise_cond_crf=pairwise_cond_crf) else: i3d = InceptionI3d(num_classes, in_channels=3, use_crf=crf, num_updates_crf=num_updates_crf, pairwise_cond_crf=pairwise_cond_crf) i3d.load_state_dict(torch.load(args.save_model + checkpoint)) steps = int(checkpoint[:-3]) if dataset == 'thumos': epoch = int(steps * snippets * batch_size * num_steps_per_update / 1214016) else: epoch = int(steps * snippets * batch_size * num_steps_per_update / 5482688) # push the pipeline on multiple gpus if possible i3d.cuda() i3d = nn.DataParallel(i3d) # setup optimizer lr = init_lr optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) lr_sched = optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[1000], gamma=0.1) if steps > 0: for i in range(steps): lr_sched.step() # train the model while steps < max_steps: epoch += 1 print('-' * 10) print('Epoch {}'.format(epoch)) print('Step {}/{}'.format(steps, max_steps)) print('-' * 10) # each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': print('Entering training loop...') i3d.train() else: print('Entering 
validation loop...') i3d.eval() time_init_eval = time.time() cumul_pred = Cumulator(num_classes) cumul_labels = Cumulator(num_classes) tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 tot_loss_updt = 0.0 tot_loc_loss_updt = 0.0 tot_cls_loss_updt = 0.0 tot_reg_loss_updt = 0.0 num_iter = 0 optimizer.zero_grad() count_batch = 0 gap_train = 0 print("Losses initialized to 0") # Iterate over data. for data in dataloaders[phase]: time_init_batch = time.time() count_batch += 1 num_iter += 1 # get the inputs inputs, labels = data # wrap them in Variable inputs = Variable(inputs.cuda()) t = inputs.size(2) labels = Variable(labels.cuda()) # forward if crf: per_frame_logits_ante_crf, per_frame_logits = i3d(inputs) else: per_frame_logits = i3d(inputs) # upsample to input size per_frame_logits = F.upsample(per_frame_logits, t, mode='linear') if crf: per_frame_logits_ante_crf = F.upsample( per_frame_logits_ante_crf, t, mode='linear') # accumulate predictions and ground truths pred_np = pt_var_to_numpy(nn.Sigmoid()(per_frame_logits)) cumul_pred.append(pred_np) labels_np = pt_var_to_numpy(labels) cumul_labels.append(labels_np) # compute localization loss if crf: loc_loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) + F.binary_cross_entropy_with_logits( per_frame_logits_ante_crf, labels) else: loc_loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) tot_loc_loss += loc_loss.data[0] tot_loc_loss_updt += loc_loss.data[0] # compute classification loss (with max-pooling along time B x C x T) if crf: cls_loss = F.binary_cross_entropy_with_logits( torch.max(per_frame_logits, dim=2)[0], torch.max( labels, dim=2)[0]) + F.binary_cross_entropy_with_logits( torch.max(per_frame_logits_ante_crf, dim=2)[0], torch.max(labels, dim=2)[0]) else: cls_loss = F.binary_cross_entropy_with_logits( torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0]) tot_cls_loss += cls_loss.data[0] tot_cls_loss_updt += cls_loss.data[0] # compute regularization loss for the crf module if crf and (reg_crf > 0 and not pairwise_cond_crf): reg_loss = get_reg_loss(i3d, 'crf', reg_type) tot_reg_loss_updt += reg_loss.data[0] elif crf and (reg_crf > 0 and pairwise_cond_crf): reg_loss = get_reg_loss(i3d, 'psi_0', reg_type) + get_reg_loss( i3d, 'psi_1', reg_type) tot_reg_loss_updt += reg_crf * reg_loss.data[0] else: reg_loss = 0 # put all the losses together if use_cls: loss = (0.5 * loc_loss + 0.5 * cls_loss + reg_crf * reg_loss) / num_steps_per_update else: loss = (loc_loss + reg_crf * reg_loss) / num_steps_per_update tot_loss += loss.data[0] tot_loss_updt += loss.data[0] loss.backward() if num_iter == num_steps_per_update and phase == 'train': steps += 1 num_iter = 0 optimizer.step() optimizer.zero_grad() lr_sched.step() examples_processed_updt = num_steps_per_update * batch_size * snippets examples_processed_tot = count_batch * batch_size * snippets map_train = map_calculator(cumul_pred.accumuled[1:], cumul_labels.accumuled[1:]) gap_train = ap_calculator( cumul_pred.accumuled[1:].flatten(), cumul_labels.accumuled[1:].flatten()) print( 'TRAINING - Epoch: {} Step: {} Examples processed {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} Reg Loss: {:.6f} mAP: {:.6f} GAP: {:.6f}' .format( epoch, steps, examples_processed_tot, tot_loc_loss_updt / examples_processed_updt, tot_cls_loss_updt / examples_processed_updt, tot_loss_updt / (batch_size * snippets), reg_crf * tot_reg_loss_updt / examples_processed_updt, map_train, gap_train)) log_value('Training_loc_loss', tot_loc_loss_updt / 
examples_processed_updt, steps) log_value('Training_cls_loss', tot_cls_loss_updt / examples_processed_updt, steps) log_value('Training_reg_loss', tot_reg_loss_updt / examples_processed_updt, steps) log_value('Training_tot_loss', tot_loss_updt / (batch_size * snippets), steps) log_value('Training_mAP', map_train, steps) log_value('Training_GAP', gap_train, steps) tot_loss_updt, tot_loc_loss_updt, tot_cls_loss_updt, tot_reg_loss_updt = 0.0, 0.0, 0.0, 0.0 cumul_pred.clear() cumul_labels.clear() cumul_pred = Cumulator(num_classes) cumul_labels = Cumulator(num_classes) if ((steps % saving_steps) == 0) & (phase == 'train') & (num_iter == 0): # save model print("EPOCH: {} Step: {} - Saving model...".format( epoch, steps)) torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt') tot_loss = tot_loc_loss = tot_cls_loss = 0. if phase == 'val': time_end_batch = time.time() examples_processed_tot = count_batch * batch_size_eval * snippets print( 'EVAL - Epoch: {} Step: {} Examples processed {} - Time for batch: {}' .format(epoch, steps, examples_processed_tot, time_end_batch - time_init_batch)) log_value('Evaluation time', time_end_batch - time_init_batch, examples_processed_tot) if phase == 'val': examples_processed_tot = count_batch * batch_size_eval * snippets map_val = map_calculator(cumul_pred.accumuled[1:], cumul_labels.accumuled[1:]) gap_val = ap_calculator(cumul_pred.accumuled[1:].flatten(), cumul_labels.accumuled[1:].flatten()) time_end_eval = time.time() print( 'EVAL - Epoch: {} Step: {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} mAP: {:.4f} GAP: {:.4f} Total time: {}' .format( epoch, steps, tot_loc_loss / examples_processed_tot, tot_cls_loss / examples_processed_tot, tot_loss_updt * num_steps_per_update / examples_processed_tot, map_val, gap_val, time_end_eval - time_init_eval)) log_value('Validation_subset_loc_loss', tot_loc_loss / examples_processed_tot, steps) log_value('Validation_subset_cls_loss', tot_cls_loss / examples_processed_tot, steps) log_value( 'Validation_subset_tot_loss', tot_loss_updt * num_steps_per_update / examples_processed_tot) log_value('Validation_subset_mAP', map_val, steps) log_value('Validation_subset_GAP', gap_val, steps) cumul_pred.clear() cumul_labels.clear()
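# Hedged sketch (not part of the original script): the training phase above
# accumulates gradients over `num_steps_per_update` mini-batches before every
# optimizer step, which emulates a larger effective batch size. The same
# pattern in isolation, with `model`, `dataloader`, `optimizer` and
# `criterion` as placeholders:
def train_with_accumulation(model, dataloader, optimizer, criterion,
                            num_steps_per_update=1):
    model.train()
    optimizer.zero_grad()
    num_iter = 0
    for inputs, labels in dataloader:
        num_iter += 1
        loss = criterion(model(inputs), labels) / num_steps_per_update
        loss.backward()                      # gradients add up across iterations
        if num_iter == num_steps_per_update:
            optimizer.step()                 # one update for N mini-batches
            optimizer.zero_grad()
            num_iter = 0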
def run(max_steps=64e3, mode='rgb', root='../data/BL_and_PL', split='../data/BL_and_PL/new_annotations.json', batch_size=1, load_model='', save_dir='args.save_dir'): # setup dataset #test_transforms = transforms.Compose([videotransforms.RandomCrop(224), # videotransforms.RandomHorizontalFlip(), #]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(split, 'training', root, mode, test_transforms, num=-1, save_dir=save_dir) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True) dataloaders = {'train': dataloader} datasets = {'train': dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(400, in_channels=2, temp_window = args.temporal_window) input_padder = nn.ReplicationPad3d(padding=(0,0,0,0, args.temporal_window//2,args.temporal_window//2)) #, final_endpoint= 'Mixed_5c' else: i3d = InceptionI3d(400,in_channels=3, temp_window = args.temporal_window) input_padder = nn.ReplicationPad3d(padding=(0,0,0,0, args.temporal_window//2,args.temporal_window//2)) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() for phase in ['train']:#, 'val']: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in dataloaders[phase]: # get the inputs inputs, labels, name = data print('extracting {} features for {} and tw {}'.format(mode, name[0],args.temporal_window)) #if os.path.exists(os.path.join(save_dir, name[0]+'.npy')): # continue b,c,t,h,w = inputs.shape if t > 1600: features = [] for start in range(1, t-56, 1600): end = min(t-1, start+1600+56) start = max(1, start-48) ip = Variable(torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda(), volatile=True) features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy()) np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0)) else: # Temporally pad inputs such that output temporal dimension conserved: no_frames = inputs.shape[2] inputs = input_padder(inputs) per_frame_features = []#torch.zeros((1,1024,1,1,1)) # We want per-frame features. Authors of MS-TCN slid temporal window over # each frame and input that to the network. for w in range(no_frames): windowed_inputs = inputs[:,:, w:(w+(args.temporal_window)), :,:].cuda() features = i3d.extract_features(windowed_inputs) per_frame_features.append(features.cpu().data) if w % 10 == 0: print(' {}'.format(w) ) np.save(os.path.join(save_dir, name[0]), np.concatenate(per_frame_features,axis=2)[0,:,:,0,0])
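# Hedged sketch (illustrative only): the MS-TCN-style loop above pads the clip
# temporally with ReplicationPad3d and slides a fixed window over every frame so
# the extractor emits one feature per input frame. A minimal standalone version;
# `extract_fn` and the feature layout are assumptions, not the original API:
import torch
import torch.nn as nn

def per_frame_features(extract_fn, clip, temporal_window=21):
    # clip: (1, C, T, H, W); replicate T//2 frames at both ends so every frame
    # can sit at the centre of a full temporal window
    pad = nn.ReplicationPad3d((0, 0, 0, 0, temporal_window // 2, temporal_window // 2))
    padded = pad(clip)
    feats = []
    with torch.no_grad():
        for w in range(clip.shape[2]):
            window = padded[:, :, w:w + temporal_window]
            feats.append(extract_fn(window).cpu())
    return torch.cat(feats, dim=2)   # one feature column per original frame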
def predict_video(cabin_video_path, face_video_path, args): if torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' checkpoint = args.checkpoint clip_length = args.clip_length clip_stride = args.clip_stride batch_size = args.batch_size num_classes = args.num_classes threshold = args.threshold cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length, clip_stride) model = TAL_Net(num_classes) ckp = torch.load(checkpoint) model.load_state_dict(ckp['model']) model.to(device) model.eval() clip_transforms = transforms.Compose([videotransforms.CenterCrop(224), videotransforms.ToTensor(), videotransforms.ClipNormalize() ]) all_clips = [] all_predict_classes = [] all_start_scores = [] all_end_scores = [] n = len(cabin_clips) // batch_size for i in range(n): cabin_video_frames_batch = [] face_video_frames_batch = [] for j in range(i * batch_size, (i + 1) * batch_size): cabin_clip = cabin_clips[j] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[j] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) if len(cabin_clips) % batch_size != 0: cabin_video_frames_batch = [] face_video_frames_batch = [] for k in range(n * batch_size, len(cabin_clips)): cabin_clip = cabin_clips[k] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[k] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) all_predict_classes = np.concatenate(all_predict_classes) all_start_scores = np.concatenate(all_start_scores) all_end_scores = np.concatenate(all_end_scores) # print(all_start_scores) # print(all_end_scores) # start_peak_indices = [] # end_peak_indices = [] # if all_start_scores[0] > all_start_scores[1]: # start_peak_indices.append(0) # for i in range(1, 
len(cabin_clips) - 1): # if all_start_scores[i] > all_start_scores[i - 1]: # if all_start_scores[i] > all_start_scores[i + 1]: # start_peak_indices.append(i) # if all_end_scores[i] > all_end_scores[i - 1]: # if all_end_scores[i] > all_end_scores[i + 1]: # end_peak_indices.append(i) # if all_end_scores[-1] > all_end_scores[-2]: # end_peak_indices.append(len(cabin_clips) - 1) # j = 0 # copy_start_peak_indices = start_peak_indices.copy() # while j < len(start_peak_indices) - 1: # index1 = copy_start_peak_indices[j] # index2 = copy_start_peak_indices[j + 1] # if index1 + 4 < index2: # j += 1 # else: # if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j+1]]: # copy_start_peak_indices[j] = index2 # copy_start_peak_indices.pop(j + 1) # start_peak_indices.pop(j + 1) # else: # copy_start_peak_indices.pop(j) # start_peak_indices.pop(j) # k = 0 # copy_end_peak_indices = end_peak_indices.copy() # while k < len(end_peak_indices) - 1: # index1 = copy_end_peak_indices[k] # index2 = copy_end_peak_indices[k + 1] # if index1 + 4 < index2: # k += 1 # else: # if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k+1]]: # copy_end_peak_indices[k] = index2 # copy_end_peak_indices.pop(k + 1) # end_peak_indices.pop(k + 1) # else: # copy_end_peak_indices.pop(k) # end_peak_indices.pop(k) selected_starts = [] selected_ends = [] for i in range(len(all_start_scores)): if all_start_scores[i] > threshold: selected_starts.append(i) for j in range(len(all_end_scores)): if all_end_scores[j] > threshold: selected_ends.append(j) return selected_starts, selected_ends, all_start_scores, indices_in_cabin_clips
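# Hedged aside: predict_video above runs the same inner loop twice, once for
# the full batches and once for the remainder. A small generator (illustrative,
# not in the original code) removes that duplication:
def batched(items, batch_size):
    """Yield successive slices of `items`, each at most `batch_size` long."""
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]

# usage idea: for clip_batch in batched(cabin_clips, batch_size): ...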
def run(max_steps=64e3, mode='rgb', root='', split='', batch_size=1, save_dir=''): #tf.logging.set_verbosity(tf.logging.INFO) eval_type = mode imagenet_pretrained = False NUM_CLASSES = 400 if eval_type == 'rgb600': NUM_CLASSES = 600 if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']: raise ValueError( 'Bad `eval_type`, must be one of rgb, rgb600, flow, joint') if eval_type == 'rgb600': kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)] else: kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)] if eval_type in ['rgb', 'rgb600', 'joint']: # RGB input has 3 channels. rgb_input = tf.placeholder(tf.float32, shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 3)) with tf.variable_scope('RGB'): rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True, final_endpoint='Mixed_5c') rgb_logits, _ = rgb_model(rgb_input, is_training=False, dropout_keep_prob=1.0) rgb_variable_map = {} for variable in tf.global_variables(): if variable.name.split('/')[0] == 'RGB': if eval_type == 'rgb600': rgb_variable_map[variable.name.replace( ':0', '')[len('RGB/inception_i3d/'):]] = variable else: rgb_variable_map[variable.name.replace(':0', '')] = variable rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True) if eval_type in ['flow', 'joint']: # Flow input has only 2 channels. flow_input = tf.placeholder(tf.float32, shape=(None, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE, _IMAGE_SIZE, 2)) with tf.variable_scope('Flow'): flow_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True, final_endpoint='Mixed_5c') flow_logits, _ = flow_model(flow_input, is_training=False, dropout_keep_prob=1.0) flow_variable_map = {} for variable in tf.global_variables(): if variable.name.split('/')[0] == 'Flow': flow_variable_map[variable.name.replace(':0', '')] = variable flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True) if eval_type == 'rgb' or eval_type == 'rgb600': model_logits = rgb_logits elif eval_type == 'flow': model_logits = flow_logits else: model_logits = rgb_logits + flow_logits #model_predictions = tf.nn.softmax(model_logits) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(split, 'training', root, mode, test_transforms, save_dir=save_dir) with tf.Session() as sess: feed_dict = {} while True: inputs, labels, name = dataset.next_batch() if name == '0': break i = 0 for input in inputs: i += 1 c, t, h, w = input.shape if eval_type in ['rgb', 'rgb600', 'joint']: if imagenet_pretrained: rgb_saver.restore(sess, _CHECKPOINT_PATHS['rgb_imagenet']) else: rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type]) #tf.logging.info('RGB checkpoint restored') rgb_sample = input[np.newaxis, :] #tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape)) feed_dict[rgb_input] = rgb_sample if eval_type in ['flow', 'joint']: if imagenet_pretrained: flow_saver.restore(sess, _CHECKPOINT_PATHS['flow_imagenet']) else: flow_saver.restore(sess, _CHECKPOINT_PATHS['flow']) #tf.logging.info('Flow checkpoint restored') flow_sample = input[np.newaxis, :] # tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape)) feed_dict[flow_input] = flow_sample out_logits = sess.run([model_logits], feed_dict=feed_dict) out_logits = out_logits[0] new_path = os.path.join(save_dir, name, mode) if not os.path.exists(new_path): os.makedirs(new_path) np.save(os.path.join(new_path, str(i)), out_logits.reshape(1024))
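# Hedged aside: the TensorFlow extraction loop above reads batches from
# dataset.next_batch() until the returned name equals the sentinel '0', and it
# restores the saver inside the per-input loop even though the weights never
# change (restoring once before the loop would be enough). The sentinel-driven
# read, isolated as a generator with placeholder names:
def read_until_sentinel(next_batch, sentinel='0'):
    while True:
        inputs, labels, name = next_batch()
        if name == sentinel:
            break
        yield inputs, labels, name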
def run(max_steps=64e3, mode='flow', root='./frames', split='gt.json', batch_size=1, load_model='', save_dir=''): # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(split, 'training', root, mode, test_transforms, num=-1, save_dir=save_dir) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) val_dataset = Dataset(split, 'test', root, mode, test_transforms, num=-1, save_dir=save_dir) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # setup the model if mode == 'flow': i3d = InceptionI3d(20, in_channels=2) else: i3d = InceptionI3d(20, in_channels=3) i3d.replace_logits(157) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() for phase in ['train', 'val']: i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in dataloaders[phase]: # get the inputs inputs, labels, name = data # if os.path.exists(os.path.join(save_dir, name[0] + '.npy')): # continue b, c, t, h, w = inputs.shape if t > 16: features = [] for start in range(0, t, 16): end = min(t - 1, start + 16) if end < start + 16: break # start = max(1, start - 48) ip = Variable(torch.from_numpy( inputs.numpy()[:, :, start:end]).cuda(), volatile=True) feature = i3d.extract_features(ip) feature = torch.squeeze(feature) features.append(feature.data.cpu().numpy()) np.save(os.path.join(save_dir, name[0]), np.asarray(features)) else: # wrap them in Variable inputs = Variable(inputs.cuda(), volatile=True) features = i3d.extract_features(inputs) np.save( os.path.join(save_dir, name[0]), features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
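# Hedged sketch: the loop above slices a long clip into 16-frame snippets and
# stacks one feature vector per snippet; `Variable(..., volatile=True)` is the
# pre-0.4 PyTorch idiom, and torch.no_grad() is the current equivalent. A
# standalone version with a placeholder extractor:
import numpy as np
import torch

def snippet_features(extract_fn, clip, snippet_len=16):
    # clip: (1, C, T, H, W); trailing frames that do not fill a whole snippet
    # are dropped, mirroring the break in the loop above
    feats = []
    t = clip.shape[2]
    with torch.no_grad():
        for start in range(0, t - snippet_len + 1, snippet_len):
            ip = clip[:, :, start:start + snippet_len]
            feats.append(extract_fn(ip).squeeze().cpu().numpy())
    return np.asarray(feats)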
def main(): best_prec1 = 0 with open( 'logs/' + args.dataset + '/' + args.arch + '_' + args.mode + '_validation.txt', 'a') as f: f.write("=============================================") f.write('\n') f.write("lr: ") f.write(str(args.lr)) f.write(" lr_step: ") f.write(str(args.lr_steps)) f.write(" dataset: ") f.write(str(args.dataset)) f.write(" modality: ") f.write(str(args.mode)) f.write(" dropout: ") f.write(str(args.dropout)) f.write(" batch size: ") f.write(str(args.batch_size)) f.write('\n') if args.dataset == 'ucf101': num_class = 101 data_length = 64 image_tmpl = "frame{:06d}.jpg" elif args.dataset == 'hmdb51': num_class = 51 data_length = 64 image_tmpl = "img_{:05d}.jpg" elif args.dataset == 'kinetics': num_class = 400 data_length = 64 image_tmpl = "img_{:05d}.jpg" else: raise ValueError('Unknown dataset ' + args.dataset) val_logger = Logger( 'logs/' + args.dataset + '/' + args.arch + '_' + args.mode + '_val.log', ['epoch', 'acc']) # define loss function (criterion) and optimizer #======================data transform============= normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip() ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) #=======================design the dataset============== train_dataset = I3dDataSet("", args.train_list, num_segments=1, new_length=data_length, modality=args.mode, dataset=args.dataset, image_tmpl=image_tmpl if args.mode in ["rgb", "RGBDiff"] else args.flow_prefix + "{}_{:05d}.jpg", transform=train_transforms, test_mode=False) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) val_dataset = I3dDataSet("", args.val_list, num_segments=1, new_length=data_length, modality=args.mode, dataset=args.dataset, image_tmpl=image_tmpl if args.mode in ["rgb", "RGBDiff"] else args.flow_prefix + "{}_{:05d}.jpg", random_shift=False, transform=test_transforms, test_mode=False) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True) dataloaders = {'train': train_loader, 'val': val_loader} datasets = {'train': train_dataset, 'val': val_dataset} #=============================set the model ================== # setup the model if args.mode == 'flow': if args.arch == 'i3d': from net.i3d import I3D i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'bilinear_i3d': from net.bilinear_i3d import I3D i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'se_i3d': from net.se_i3d import I3D i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'se_bilinear_i3d': from net.se_bilinear_i3d import I3D i3d = I3D(modality='flow', num_classes=num_class, dropout_prob=args.dropout) else: Exception("not support now!") i3d.eval() pretrain_dict = torch.load('pretrained_models/model_flow.pth') model_dict = i3d.state_dict() weight_dict = weight_transform(model_dict, pretrain_dict) i3d.load_state_dict(weight_dict) else: #i3d = InceptionI3d(400, in_channels=3) if args.arch == 'i3d': from net.i3d import I3D i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'se_i3d': from net.se_i3d import I3D i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'bilinear_i3d': from 
net.bilinear_i3d import I3D i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout) elif args.arch == 'se_bilinear_i3d': from net.se_bilinear_i3d import I3D i3d = I3D(modality='rgb', num_classes=num_class, dropout_prob=args.dropout) else: Exception("not support now!") i3d.eval() pretrain_dict = torch.load('pretrained_models/model_rgb.pth') model_dict = i3d.state_dict() weight_dict = weight_transform(model_dict, pretrain_dict) i3d.load_state_dict(weight_dict) i3d.cuda() #print(i3d) #============================set SGD, critization and lr ================== optimizer = torch.optim.SGD(i3d.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, dampening=0, nesterov=False) model = nn.DataParallel(i3d) criterion = torch.nn.NLLLoss().cuda() disturb = DisturbLabel(alpha=10, C=51) # criterion = FocalLoss(gamma = 0).cuda() #print(model) writer = SummaryWriter() #create log folders for plot timer = Timer() for epoch in range(1, args.epochs): timer.tic() adjust_learning_rate(optimizer, epoch, args.lr_steps) # train for one epoch train_prec1, train_loss = train(train_loader, model, criterion, optimizer, epoch, disturb) writer.add_scalar('Train/Accu', train_prec1, epoch) writer.add_scalar('Train/Loss', train_loss, epoch) # evaluate on validation set if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1: prec1, val_loss = validate(val_loader, model, criterion, (epoch + 1) * len(train_loader)) writer.add_scalar('Val/Accu', prec1, epoch) writer.add_scalar('Val/Loss', val_loss, epoch) writer.add_scalars('data/Acc', { 'train_prec1': train_prec1, 'val_prec1': prec1 }, epoch) writer.add_scalars('data/Loss', { 'train_loss': train_loss, 'val_loss': val_loss }, epoch) #scheduler.step(val_loss) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best, best_prec1) val_logger.log({'epoch': epoch, 'acc': prec1}) timer.toc() left_time = timer.average_time * (args.epochs - epoch) print("best_prec1 is: {}".format(best_prec1)) print("left time is: {}".format(timer.format(left_time))) with open( 'logs/' + args.dataset + '/' + args.arch + '_' + args.mode + '_validation.txt', 'a') as f: f.write(str(epoch)) f.write(" ") f.write(str(train_prec1)) f.write(" ") f.write(str(prec1)) f.write(" ") f.write(timer.format(timer.diff)) f.write('\n') writer.export_scalars_to_json("./all_scalars.json") writer.close()
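# Hedged sketch: weight_transform above merges a pretrained checkpoint into the
# model's state_dict before load_state_dict. Assuming it keeps only tensors
# whose name and shape match (so a re-sized classification head is left
# untouched), a common minimal version looks like this:
def partial_load(model, pretrain_dict):
    model_dict = model.state_dict()
    compatible = {k: v for k, v in pretrain_dict.items()
                  if k in model_dict and v.shape == model_dict[k].shape}
    model_dict.update(compatible)
    model.load_state_dict(model_dict)
    return list(compatible)   # which parameters were actually transferred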
def run(dataset_path, db_filename, model_path, output_path, frames_per_clip=16, testset_filename='test_cross_env.txt', trainset_filename='train_cross_env.txt', frame_skip=1, batch_size=8, device='dev3', arch='HCN', pose_path='predictions/pose2d/openpose'): pred_output_filename = os.path.join(output_path, 'pred.npy') json_output_filename = os.path.join(output_path, 'action_segments.json') # setup dataset test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) test_dataset = Dataset(dataset_path, db_filename=db_filename, test_filename=testset_filename, train_filename=trainset_filename, transform=test_transforms, set='test', camera=device, frame_skip=frame_skip, frames_per_clip=frames_per_clip, mode='img', pose_path=pose_path, arch=arch) test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True) # setup the model num_classes = test_dataset.num_classes if arch == 'HCN': model = HCN.HCN(in_channel=2, num_joint=19, num_person=1, out_channel=64, window_size=frames_per_clip, num_class=num_classes) elif arch == 'ST_GCN': graph_args = {'layout': 'openpose', 'strategy': 'spatial'} # layout:'ntu-rgb+d' model = st_gcn.Model(in_channels=2, num_class=num_classes, graph_args=graph_args, edge_importance_weighting=True, dropout=0.5) else: raise ValueError("Unsupported architecture: please select HCN | ST_GCN") checkpoints = torch.load(model_path) model.load_state_dict(checkpoints["model_state_dict"]) # load trained model model.cuda() # model = nn.DataParallel(model) n_examples = 0 # Iterate over data. avg_acc = [] pred_labels_per_video = [[] for i in range(len(test_dataset.video_list))] logits_per_video = [[] for i in range(len(test_dataset.video_list))] for test_batchind, data in enumerate(test_dataloader): model.train(False) # get the inputs inputs, labels, vid_idx, frame_pad = data # wrap them in Variable inputs = Variable(inputs.cuda(), requires_grad=True) labels = Variable(labels.cuda()) t = inputs.size(2) logits = model(inputs) logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear', align_corners=True) # logits = F.interpolate(logits, t, mode='linear', align_corners=True) # b x classes x frames acc = i3d_utils.accuracy_v2(torch.argmax(logits, dim=1), torch.argmax(labels, dim=1)) avg_acc.append(acc.item()) n_examples += batch_size print('batch Acc: {}, [{} / {}]'.format(acc.item(), test_batchind, len(test_dataloader))) logits = logits.permute(0, 2, 1) # [ batch, classes, frames] -> [ batch, frames, classes] logits = logits.reshape(inputs.shape[0] * frames_per_clip, -1) pred_labels = torch.argmax(logits, 1).detach().cpu().numpy().tolist() logits = torch.nn.functional.softmax(logits, dim=1).detach().cpu().numpy().tolist() pred_labels_per_video, logits_per_video = \ utils.accume_per_video_predictions(vid_idx, frame_pad,pred_labels_per_video, logits_per_video, pred_labels, logits, frames_per_clip) pred_labels_per_video = [np.array(pred_video_labels) for pred_video_labels in pred_labels_per_video] logits_per_video = [np.array(pred_video_logits) for pred_video_logits in logits_per_video] np.save(pred_output_filename, {'pred_labels': pred_labels_per_video, 'logits': logits_per_video}) utils.convert_frame_logits_to_segment_json(logits_per_video, json_output_filename, test_dataset.video_list, test_dataset.action_list)
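# Hedged sketch: after interpolating the logits to one score per frame, the
# evaluation loop above flattens (batch, classes, frames) into rows of
# per-frame class scores before taking argmax. The reshape in isolation, on a
# dummy tensor with assumed shapes:
import torch

logits = torch.randn(8, 12, 16)                          # batch x classes x frames
per_frame = logits.permute(0, 2, 1)                      # batch x frames x classes
per_frame = per_frame.reshape(-1, per_frame.shape[-1])   # (batch*frames) x classes
pred_labels = per_frame.argmax(dim=1)                    # one label per frame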
def predict_events(cabin_video_path, face_video_path, args): if torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' checkpoint = args.checkpoint clip_length = args.clip_length clip_stride = args.clip_stride batch_size = args.batch_size num_classes = args.num_classes threshold = args.threshold cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length, clip_stride) model = TAL_Net(num_classes) ckp = torch.load(checkpoint) model.load_state_dict(ckp['model']) model.to(device) model.eval() clip_transforms = transforms.Compose([videotransforms.CenterCrop(224), videotransforms.ToTensor(), videotransforms.ClipNormalize() ]) all_clips = [] all_predict_classes = [] all_start_scores = [] all_end_scores = [] n = len(cabin_clips) // batch_size for i in range(n): cabin_video_frames_batch = [] face_video_frames_batch = [] for j in range(i * batch_size, (i + 1) * batch_size): cabin_clip = cabin_clips[j] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[j] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) if len(cabin_clips) % batch_size != 0: cabin_video_frames_batch = [] face_video_frames_batch = [] for k in range(n * batch_size, len(cabin_clips)): cabin_clip = cabin_clips[k] cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip) cabin_video_frames = clip_transforms(cabin_video_frames) cabin_video_frames_batch.append(cabin_video_frames) face_clip = face_clips[k] face_video_frames = load_rgb_frames(face_video_path, face_clip) face_video_frames = clip_transforms(face_video_frames) face_video_frames_batch.append(face_video_frames) cabin_video_frames_batch = torch.stack(cabin_video_frames_batch) face_video_frames_batch = torch.stack(face_video_frames_batch) cabin_video_frames_batch = cabin_video_frames_batch.to(device) face_video_frames_batch = face_video_frames_batch.to(device) with torch.no_grad(): class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch) pred_classes = torch.argmax(class_scores, dim=1) pred_classes = pred_classes.cpu().numpy() start_scores = start_scores.cpu().numpy() end_scores = end_scores.cpu().numpy() all_predict_classes.append(pred_classes) all_start_scores.append(start_scores) all_end_scores.append(end_scores) all_predict_classes = np.concatenate(all_predict_classes) print(all_predict_classes) # rough chunk aggregation cabin_frames = os.listdir(cabin_video_path) cabin_frame_length = len(cabin_frames) cabin_indices = np.arange(start=0, stop=cabin_frame_length - clip_stride + 1, step=clip_stride) indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) 
for idx in cabin_indices] # remainder = cabin_frame_length % clip_stride # if remainder != 0: # indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length))) # print(len(indices_in_shorter_clips)) # print(len(indices_in_cabin_clips)) shorter_clip_predict_classes = [] for i in range(len(indices_in_shorter_clips)): if i == 0: shorter_clip_predict_classes.append(all_predict_classes[0]) elif i == 1: l = [all_predict_classes[0], all_predict_classes[1]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == 2: l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) # elif i == len(indices_in_shorter_clips) - 3: # l = [all_predict_classes[i], all_predict_classes[i+1], all_predict_classes[i+2]] # shorter_clip_predict_classes.append(max(set(l), key = l.count)) # elif i == len(indices_in_shorter_clips) - 2: # l = [all_predict_classes[i], all_predict_classes[i+1]] # shorter_clip_predict_classes.append(max(set(l), key = l.count)) # elif i == len(indices_in_shorter_clips) - 1: # shorter_clip_predict_classes.append(all_predict_classes[i]) elif i < len(indices_in_cabin_clips): l = [all_predict_classes[j] for j in range(i-3, i+1)] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips): index = len(indices_in_cabin_clips) - 1 l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips) + 1: index = len(indices_in_cabin_clips) - 1 l = [all_predict_classes[index-1], all_predict_classes[index]] shorter_clip_predict_classes.append(max(set(l), key = l.count)) elif i == len(indices_in_cabin_clips) + 2: index = len(indices_in_cabin_clips) - 1 shorter_clip_predict_classes.append(all_predict_classes[index]) print(shorter_clip_predict_classes) rough_clip_groups = defaultdict(list) for i in range(len(shorter_clip_predict_classes)): if shorter_clip_predict_classes[i] != 0: rough_clip_groups[shorter_clip_predict_classes[i]].append(i) print(rough_clip_groups) all_refined_clip_groups = dict() for key in rough_clip_groups.keys(): clip_group = rough_clip_groups[key] refined_groups = [] previous = 0 i = 0 while i < len(clip_group) - 1: if clip_group[i+1] - clip_group[i] >= 4: refined_groups.append(clip_group[previous:(i+1)]) previous = i+1 i += 1 refined_groups.append(clip_group[previous:]) all_refined_clip_groups[key] = refined_groups print(all_refined_clip_groups) # all_classes = all_clip_frame_groups.keys() keys = list(all_refined_clip_groups) if len(keys) == 2: k1 = keys[0] k2 = keys[1] groups1 = all_refined_clip_groups[k1] groups2 = all_refined_clip_groups[k2] i = 0 j = 0 while i < len(groups1): while j < len(groups2): min_index1 = min(groups1[i]) max_index1 = max(groups1[i]) min_index2 = min(groups2[j]) max_index2 = max(groups2[j]) set1 = set(range(min_index1, max_index1+1)) set2 = set(range(min_index2, max_index2+1)) if set1.issubset(set2) == True: groups1.remove(groups1[i]) break elif set2.issubset(set1) == True: groups2.remove(groups2[j]) else: intersec = set1.intersection(set2) for item in intersec: set1.discard(item) set2.discard(item) groups1[i] = list(set1) groups2[j] = list(set2) if max_index1 > max_index2: j += 1 else: i += 1 break if j == len(groups2): break final_all_clip_groups = { k1:groups1, k2:groups2 } else: final_all_clip_groups = all_refined_clip_groups print(final_all_clip_groups) 
    all_clip_frame_groups = {}
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index]))
            start_frame = min(clip_frame_group) + 1
            end_frame = max(clip_frame_group) + 1
            clip_frame_groups.append([start_frame, end_frame])
        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
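# Hedged sketch: the aggregation in predict_events majority-votes the class of
# every stride-level segment over the overlapping long clips that cover it. The
# vote itself, isolated (not a function from the original code):
from collections import Counter

def majority_vote(values):
    """Return the most common element of a non-empty sequence."""
    return Counter(values).most_common(1)[0][0]

# e.g. majority_vote([2, 2, 0, 2]) -> 2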
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='../../SSBD/ssbd_clip_segment/data/', train_split='../../SSBD/Annotations/annotations_charades.json', batch_size=1, save_model=''): # setup dataset train_transforms = transforms.Compose([ videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip(), ]) test_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) dataset = Dataset(train_split, 'training', root, mode, train_transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms) val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True) dataloaders = {'train': dataloader, 'val': val_dataloader} datasets = {'train': dataset, 'val': val_dataset} # dataloaders = {'train': dataloader} # datasets = {'train': dataset} # setup the model xdc = torch.hub.load('HumamAlwassel/XDC', 'xdc_video_encoder', pretraining='r2plus1d_18_xdc_ig65m_kinetics', num_classes=3) # if mode == 'flow': # i3d = InceptionI3d(400, in_channels=2) # i3d.load_state_dict(torch.load('models/flow_imagenet.pt')) # else: # i3d = InceptionI3d(400, in_channels=3) # i3d.load_state_dict(torch.load('models/rgb_imagenet.pt')) # i3d.replace_logits(8) # #i3d.load_state_dict(torch.load('/ssd/models/000920.pt')) # i3d.cuda() # i3d = nn.DataParallel(i3d) xdc.cuda() xdc = nn.DataParallel(xdc).cuda() for name, param in xdc.named_parameters(): if 'fc' not in name and '4.1' not in name: param.requires_grad = False lr = init_lr optimizer = optim.SGD(xdc.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001) lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000]) num_steps_per_update = 4 # accum gradient steps = 0 best_val = 0 # new_flag = 0 # train it while steps < max_steps: #for epoch in range(num_epochs): print('Step {}/{}'.format(steps, max_steps)) print('-' * 10) # new_state_dict = OrderedDict() # state_dict = torch.load(save_model+'.pt') # for k, v in state_dict.items(): # name = "module."+k # add module. # new_state_dict[name] = v # xdc.load_state_dict(new_state_dict) # new_flag = 0 # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': xdc.train(True) else: xdc.train(False) # Set model to evaluate mode tot_loss = 0.0 # tot_loc_loss = 0.0 # tot_cls_loss = 0.0 num_iter = 0 total = 0 n = 0 optimizer.zero_grad() # Iterate over data. 
for data in dataloaders[phase]: num_iter += 1 # get the inputs inputs, labels = data # wrap them in Variable inputs = Variable(inputs.cuda()) t = inputs.size(2) labels = Variable(labels.cuda()) per_frame_logits = xdc(inputs) # print(per_frame_logits.shape) # print(labels.shape) # upsample to input size # per_frame_logits = F.upsample(per_frame_logits, t, mode='linear') # compute localization loss # loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels) # tot_loc_loss += loc_loss.data.item() # compute classification loss (with max-pooling along time B x C x T) # cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0]) # print(torch.max(per_frame_logits, dim=2)[0]) # print(torch.max(labels, dim=2)[0]) correct = per_frame_logits.argmax(1).eq(labels.argmax(1)) total += correct.float().sum().item() n += batch_size # tot_cls_loss += cls_loss.data.item() loss = F.binary_cross_entropy_with_logits( per_frame_logits, labels) / num_steps_per_update tot_loss += loss.data.item() loss.backward() if num_iter == num_steps_per_update and phase == 'train': steps += 1 num_iter = 0 optimizer.step() optimizer.zero_grad() lr_sched.step() if steps % 10 == 0: print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format( phase, tot_loss / 10, total / n)) # save model # if(steps % 10000 == 0): # torch.save(xdc.module.state_dict(), save_model+str(steps).zfill(6)+'.pt') # tot_loss = tot_loc_loss = tot_cls_loss = 0. tot_loss = 0 total = 0 n = 0 if phase == 'val': print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format( phase, (tot_loss * num_steps_per_update) / num_iter, total / n)) if (total / n > best_val): best_val = total / n torch.save(xdc.module.state_dict(), save_model + '.pt')
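# Hedged sketch: the loop over xdc.named_parameters() earlier in this script
# freezes every parameter whose name contains neither 'fc' nor '4.1', i.e. it
# fine-tunes only the last residual block and the classifier. The same idea as
# a small helper (the name substrings are carried over from the original and
# are specific to that XDC checkpoint):
def freeze_backbone(model, trainable_keys=('fc', '4.1')):
    for name, param in model.named_parameters():
        param.requires_grad = any(k in name for k in trainable_keys)

# Only the still-trainable parameters then need to be handed to the optimizer,
# e.g. torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.1)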