def extract_feature(opt, video_dir, C3D_model):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    load_image_fn = None
    data = Video(opt, video_dir, load_image_fn,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    c3d_features = []
    for i, clip in enumerate(data_loader):
        # C3D features for this batch of clips
        clip = clip.to(opt.device)
        with torch.no_grad():
            c3d_outputs = C3D_model(clip)
        # Collect per-batch outputs, e.g. torch.Size([8, 512, 14, 14])
        c3d_features.append(c3d_outputs.cpu().data)
    c3d_features = torch.cat(c3d_features, 0)  # C3D features of one video
    return c3d_features.cpu().numpy()
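
# Hedged usage sketch for extract_feature above. The option values are
# plausible defaults for this kind of codebase (112x112 crops, 16-frame
# clips, ActivityNet/Kinetics mean), and the paths are illustrative; none of
# these values are taken from the original code.
def _demo_extract_feature(C3D_model):
    import argparse
    import numpy as np
    opt = argparse.Namespace(mode='feature',
                             sample_size=112,
                             sample_duration=16,
                             mean=[114.7748, 107.7354, 99.4750],
                             batch_size=8,
                             n_threads=4,
                             device='cuda')
    feats = extract_feature(opt, 'frames/video_0001', C3D_model)
    np.save('features/video_0001.npy', feats)  # (n_clips, 512, 14, 14)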
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    for i, (inputs, segments) in enumerate(data_loader):
        # Inference only; Variable(..., volatile=True) is deprecated, use
        # torch.no_grad() instead.
        with torch.no_grad():
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
    if len(video_outputs) != 0:
        video_outputs = torch.cat(video_outputs)
        return video_outputs.numpy()
    else:
        return None
def classify_video(video_dir, video_name, class_names, model, opt,
                   annotation_digit=5):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    print('reading file from:', video_dir, 'file name:', video_name)
    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        with torch.no_grad():  # volatile=True is deprecated
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {'video': video_name, 'clips': []}
    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }
        if opt.mode == 'score':
            clip_results['label'] = class_names[max_indices[i]]
            clip_results['scores'] = video_outputs[i].tolist()
        elif opt.mode == 'feature':
            clip_results['features'] = video_outputs[i].tolist()
            clip_results['ground_truth_annotation'] = annotation_digit
        results['clips'].append(clip_results)
    return results
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    print('video_name:', video_name)
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    print('Running on video', video_dir)
    for i, (inputs, segments) in enumerate(data_loader):
        print(i, inputs.size(), segments.shape)
        with torch.no_grad():  # volatile=True is deprecated
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    results = {'video': video_name, 'clips': []}
    if len(video_outputs) > 0:
        print('First batch output shape:', video_outputs[0].shape)
        video_outputs = torch.cat(video_outputs)
        video_segments = torch.cat(video_segments)
        _, max_indices = video_outputs.max(dim=1)
        print('Video outputs', video_outputs.size())
        for i in range(video_outputs.size(0)):
            clip_results = {
                'segment': video_segments[i].tolist(),
            }
            if opt.mode == 'score':
                clip_results['label'] = class_names[max_indices[i]]
                clip_results['scores'] = video_outputs[i].tolist()
            elif opt.mode == 'feature':
                clip_results['features'] = video_outputs[i].tolist()
            results['clips'].append(clip_results)
    return results
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration,
                 stride=opt.stride)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        with torch.no_grad():  # volatile=True is deprecated
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    if len(video_outputs) == 0:
        # Record unreadable/empty videos so they can be reprocessed later.
        with open("error.list", 'a') as fout:
            fout.write("{}\n".format(video_name))
        return {}
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {
        'video': video_name,
        'clips': []
    }
    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }
        if opt.mode == 'score':
            clip_results['label'] = class_names[max_indices[i]]
            clip_results['scores'] = video_outputs[i].tolist()
        elif opt.mode == 'feature':
            clip_results['features'] = video_outputs[i].tolist()
        results['clips'].append(clip_results)
    return results
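
# Hedged usage sketch for the stride-aware classify_video above: run it over
# one directory of extracted frames and dump the per-clip results to JSON.
# The paths are illustrative assumptions, not part of the original code.
def _demo_classify_video(model, class_names, opt):
    import json
    results = classify_video('frames/video_0001', 'video_0001.mp4',
                             class_names, model, opt)
    if results:  # an empty dict means the video was logged to error.list
        with open('results/video_0001.json', 'w') as f:
            json.dump(results, f)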
def extract_feature(opt, video_dir, C3D_model, load_image_fn, C2D_model,
                    c2d_shape, duration):
    assert opt.mode in ['score', 'feature']
    C, H, W = c2d_shape
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    opt.num_segments = max(int(duration / opt.clip_len), 1)
    data = Video(opt, video_dir, load_image_fn,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    c3d_features = []
    c2d_features = []
    for i, (clip, frames_npy_data) in enumerate(data_loader):
        # C3D features
        clip = clip.to(opt.device)
        with torch.no_grad():
            c3d_outputs = C3D_model(clip)
        # C2D features
        frames = frames_npy_data.to(opt.device)
        with torch.no_grad():
            c2d_outputs = C2D_model(frames).squeeze()
        if len(c2d_outputs.shape) == 1:
            # squeeze() dropped the batch dim for a single frame; restore it.
            c2d_outputs = c2d_outputs.unsqueeze(0)
        # Collect per-batch outputs
        c3d_features.append(c3d_outputs.cpu().data)
        c2d_features.append(c2d_outputs.cpu().data)
    if not c3d_features or not c2d_features:
        # No clips were decoded for this video.
        return None, None
    c3d_features = torch.cat(c3d_features)  # C3D features of one video
    c2d_features = torch.cat(c2d_features)  # C2D features of one video
    return c3d_features.cpu().numpy(), c2d_features.cpu().numpy()
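
# Hedged sketch of how a caller of the joint C3D/C2D extractor above might
# derive `duration` and persist both feature streams. The ffprobe invocation,
# the (3, 224, 224) frame shape, and the file layout are assumptions for
# illustration, not part of the original code.
def _demo_extract_both(opt, C3D_model, load_image_fn, C2D_model):
    import subprocess
    import numpy as np
    # Probe the clip length in seconds so opt.num_segments can be derived.
    out = subprocess.check_output(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', 'video_0001.mp4'])
    duration = float(out.decode().strip())
    c3d, c2d = extract_feature(opt, 'frames/video_0001', C3D_model,
                               load_image_fn, C2D_model,
                               c2d_shape=(3, 224, 224), duration=duration)
    if c3d is not None:
        np.save('features/video_0001_c3d.npy', c3d)
        np.save('features/video_0001_c2d.npy', c2d)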
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    with torch.no_grad():
        for i, (inputs, segments) in enumerate(data_loader):
            outputs = model(inputs)
            video_outputs.append(outputs.cpu().data)
            video_segments.append(segments)
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {'video': video_name, 'clips': []}
    # Write one feature file per clip; makedirs avoids failing on reruns.
    mypath = os.path.join('features', video_name.split('.')[0])
    os.makedirs(mypath, exist_ok=True)
    for i in range(video_outputs.size(0)):
        with open(os.path.join(mypath, str(i) + '.txt'), 'w+') as f:
            f.write(' '.join(map(str, video_outputs[i].tolist())))
    # Features are written to disk; the returned dict stays empty.
    return results
def classify_video(video_dir, video_name, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        with torch.no_grad():  # volatile=True is deprecated
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    clips = []
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
            'features': video_outputs[i].tolist(),
        }
        clips.append(clip_results)
    return video_name, clips
def classify_video(video_dir, video_name, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=False)
    video_outputs = []
    video_segments = []
    with torch.no_grad():
        for i, (inputs, segments) in enumerate(data_loader):
            outputs = model(inputs)
            video_outputs.append(outputs.cpu().data)
            video_segments.append(segments)
    if video_outputs:
        video_outputs = torch.cat(video_outputs)
        video_segments = torch.cat(video_segments)
    results = dict()
    results['video'] = video_name
    results['features'] = video_outputs
    results['clips'] = video_segments
    return results
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode == 'feature'
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    with torch.no_grad():
        for i, (inputs, segments) in enumerate(data_loader):
            outputs = model(inputs)
            video_outputs.append(outputs.cpu().data)
    video_outputs = torch.cat(video_outputs)
    # Equivalent to expanding and re-concatenating each clip along axis 0.
    results = video_outputs.numpy()
    return results
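
# The variant above returns an (n_clips, feature_dim) array. A common next
# step is pooling clip features into one video-level descriptor; this
# mean-pooling helper is an illustrative assumption, not from the original.
def _pool_clip_features(clip_features):
    import numpy as np
    # clip_features: (n_clips, feature_dim) -> (feature_dim,)
    return np.mean(clip_features, axis=0)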
    cls_acc = cls_hit / cls_cnt
    print(cls_acc)
    print('Accuracy {:.02f}%'.format(np.mean(cls_acc) * 100))
    with open(opt.result_path, 'w') as f:
        json.dump(test_results, f)


if __name__ == '__main__':
    opt = parse_opts()
    opt.mean = get_mean()
    opt.arch = '{}-{}'.format(opt.model_name, opt.model_depth)
    opt.sample_duration = 16
    spatial_transform = Compose([Scale(opt.sample_size),
                                 CenterCrop(opt.sample_size),
                                 ToTensor(1),
                                 Normalize(opt.mean, [1, 1, 1])])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(opt.val_list,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration,
                 n_samples_for_each_video=0)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    model, _ = generate_model(opt)
    model = nn.DataParallel(model, device_ids=opt.gpus).cuda()
    print('loading model {}'.format(opt.model))
    model_data = torch.load(opt.model)
    assert opt.arch == model_data['arch']
    model.load_state_dict(model_data['state_dict'])
    model.eval()
    test(data_loader, model, opt)
def classify_video(video_dir, video_name, class_names, model, opt):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    for i, (inputs, segments) in enumerate(data_loader):
        with torch.no_grad():  # volatile=True is deprecated
            outputs = model(inputs)
        outputs = F.softmax(outputs, dim=1)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {'video': video_name, 'clips': []}
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }
        # Top-5 labels per clip from the softmax scores.
        label = get_video_results(video_outputs[i], class_names, 5)
        clip_results['label'] = label
        results['clips'].append(clip_results)
    return results
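
# get_video_results is referenced above but not defined in this section; a
# minimal sketch of a top-k helper with that shape might look like the
# following (the signature and return format are assumptions).
def _top_k_labels(scores, class_names, k):
    # scores: 1-D tensor of per-class probabilities for one clip
    sorted_scores, indices = torch.topk(scores, k=min(k, len(class_names)))
    return [{'label': class_names[indices[j].item()],
             'score': sorted_scores[j].item()}
            for j in range(indices.size(0))]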
subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(test_video),
                shell=True)

test_results = {'results': {}}
end_time = time.time()
output_buffer = []
previous_video_id = ''
batch_time = AverageMeter(name='Meter', length=10)
data_time = AverageMeter(name='Meter', length=10)

data = Video('tmp',
             spatial_transform=spatial_transform,
             temporal_transform=temporal_transform,
             sample_duration=sample_duration)

data_loader = torch.utils.data.DataLoader(data,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=4,
                                          pin_memory=True)

videoPath = "../dataset/{}/*".format("hmdb")
activity_classes = [i.split(os.path.sep)[3] for i in glob.glob(videoPath)]
print(activity_classes)
opt.sample_duration = 16
opt.scales = [opt.initial_scale]
for i in range(1, opt.n_scales):
    opt.scales.append(opt.scales[-1] * opt.scale_step)
print('#####', opt.scales)
print(opt.mean)
spatial_transform = Compose([
    MultiScaleCornerCrop(opt.scales, opt.sample_size),
    RandomHorizontalFlip(),
    ToTensor(1),
    Normalize(opt.mean, [1, 1, 1])
])
temporal_transform = TemporalRandomCrop(opt.sample_duration)
train_data = Video(opt.train_list,
                   spatial_transform=spatial_transform,
                   temporal_transform=temporal_transform,
                   sample_duration=opt.sample_duration,
                   n_samples_for_each_video=1)
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=opt.batch_size,
                                           shuffle=True,
                                           num_workers=opt.n_threads,
                                           pin_memory=True)
val_spatial_transform = Compose([
    Scale(opt.sample_size),
    CenterCrop(opt.sample_size),
    ToTensor(1),
    Normalize(opt.mean, [1, 1, 1])
])
val_temporal_transform = LoopPadding(opt.sample_duration)
def train_main_multi_batch(model, input_root_dir, opt):
    # Use distinct logger names; sharing one name would duplicate handlers.
    epoch_logger = logging.getLogger('epoch')
    batch_logger = logging.getLogger('batch')
    elogHandler = logging.StreamHandler()
    eformatter = jsonlogger.JsonFormatter()
    elogHandler.setFormatter(eformatter)
    epoch_logger.addHandler(elogHandler)
    blogHandler = logging.StreamHandler()
    bformatter = jsonlogger.JsonFormatter()
    blogHandler.setFormatter(bformatter)
    batch_logger.addHandler(blogHandler)

    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)

    # criterion = nn.CrossEntropyLoss()
    criterion = nn.MSELoss()
    if not opt.no_cuda:
        criterion = criterion.cuda()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    epoch = 1
    model.train()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()
    end_time = time.time()
    ii = 0
    previous_label = "FAKE"
    pre_previous_label = "FAKE"
    for files_dir in os.listdir(input_root_dir):
        sub_path = os.path.join(input_root_dir, files_dir)
        print("Files dir: " + files_dir)
        print("Sub path: " + sub_path)
        data_file_path = os.path.join(sub_path, 'metadata.json')
        with open(data_file_path, 'r') as data_file:
            labels = json.load(data_file)
        opt.batch_size = 36
        total_batch_size = len(os.listdir(sub_path))
        i = 0
        input_files = os.listdir(sub_path)
        # Process videos in pairs so each batch mixes two source files.
        for inp_num in range(1, len(input_files), 2):
            print("Pair index: " + str(inp_num))
            input_file1 = input_files[inp_num]
            input_file2 = input_files[inp_num - 1]
            if input_file1.endswith(".mp4") and input_file2.endswith(".mp4"):
                video_path1 = os.path.join(sub_path, input_file1)
                video_path2 = os.path.join(sub_path, input_file2)
                label1 = labels[input_file1]
                label2 = labels[input_file2]
                if (label1['label'] != previous_label
                        or label1['label'] != pre_previous_label):
                    previous_label = label1['label']
                    subprocess.call('mkdir tmp', shell=True)
                    subprocess.call(
                        'ffmpeg -hide_banner -loglevel panic -i {} '
                        '-vframes 288 tmp/image_%05d.jpg'.format(video_path1),
                        shell=True)
                    subprocess.call(
                        'ffmpeg -hide_banner -loglevel panic -i {} '
                        '-vframes 288 -start_number 289 '
                        'tmp/image_%05d.jpg'.format(video_path2),
                        shell=True)
                    video_dir = '{}tmp/'.format(
                        '/data/codebases/video_classification/')
                    data = Video(video_dir,
                                 spatial_transform=spatial_transform,
                                 temporal_transform=temporal_transform,
                                 sample_duration=opt.sample_duration)
                    data_loader = torch.utils.data.DataLoader(
                        data,
                        batch_size=opt.batch_size,
                        shuffle=False,
                        num_workers=opt.n_threads,
                        pin_memory=True)
                    for k, (inputs, targets) in enumerate(data_loader):
                        data_time.update(time.time() - end_time)
                        print("Label: " + label1['label'] + ", "
                              + label2['label'])
                        # One-hot regression targets for MSE loss: the first
                        # half of the batch comes from video 1, the second
                        # half from video 2.
                        targets = torch.zeros(
                            [opt.batch_size, opt.n_classes],
                            dtype=torch.float)
                        for j in range(0, int(opt.batch_size / 2)):
                            if label1['label'] == 'FAKE':
                                targets[j][1] = 1.0
                            else:
                                targets[j][0] = 1.0
                        for j in range(int(opt.batch_size / 2),
                                       opt.batch_size):
                            if label2['label'] == 'FAKE':
                                targets[j][1] = 1.0
                            else:
                                targets[j][0] = 1.0
                        if not opt.no_cuda:
                            targets = targets.cuda(non_blocking=True)

                        outputs = model(inputs)
                        print(outputs.t())
                        print(targets.t())
                        # MSE loss/accuracy; for cross entropy, use
                        # criterion(outputs, torch.max(targets, 1)[1]) and
                        # calculate_accuracy instead.
                        loss = criterion(outputs, targets)
                        print(loss)
                        acc = calculate_accuracy_mse(outputs, targets)
                        print(acc)
                        # loss.data[0] is deprecated; loss.item() works on
                        # both old and current PyTorch.
                        losses.update(loss.item(), inputs.size(0))
                        accuracies.update(acc, inputs.size(0))

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                        batch_time.update(time.time() - end_time)
                        end_time = time.time()
                        batch_logger.log(1, {
                            'epoch': epoch,
                            'batch': i + 1,
                            'iter': (epoch - 1) * opt.batch_size + (i + 1),
                            'loss': losses.val,
                            'acc': accuracies.val,
                            'lr': optimizer.param_groups[0]['lr']
                        })
                        print(
                            'Epoch: [{0}][{1}/{2}]\t'
                            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                            'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                            'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                            'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                                epoch, i + 1, opt.batch_size,
                                batch_time=batch_time,
                                data_time=data_time,
                                loss=losses,
                                acc=accuracies))
                        ii += 1
                    subprocess.call('rm -rf tmp', shell=True)
                    i += 1
                    if ii % 100 == 0:
                        save_loc = ('/data/codebases/video_classification/'
                                    'model{}.pth'.format(ii))
                        torch.save(model.state_dict(), save_loc)
    epoch_logger.log(1, {
        'epoch': epoch,
        'loss': losses.avg,
        'acc': accuracies.avg,
        'lr': optimizer.param_groups[0]['lr']
    })
    print('XXX Epoch: [{0}]\t'
          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
              epoch, loss=losses, acc=accuracies))
    exit(1)
def extract_features(video_dir, video_name, class_names, model, opt,
                     annotation_digit=5):
    assert opt.mode in ['score', 'feature']
    spatial_transform = Compose([
        Scale(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        Normalize(opt.mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    data = Video(video_dir,
                 spatial_transform=spatial_transform,
                 temporal_transform=temporal_transform,
                 sample_duration=opt.sample_duration)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=opt.batch_size,
                                              shuffle=False,
                                              num_workers=opt.n_threads,
                                              pin_memory=True)
    video_outputs = []
    video_segments = []
    model.eval()
    for i, (inputs, segments) in enumerate(data_loader):
        inputs = inputs.cuda()
        with torch.no_grad():  # inference only
            outputs = model(inputs)
        video_outputs.append(outputs.cpu().data)
        video_segments.append(segments)
    video_outputs = torch.cat(video_outputs)
    video_segments = torch.cat(video_segments)
    results = {'video': video_name, 'clips': []}
    _, max_indices = video_outputs.max(dim=1)
    for i in range(video_outputs.size(0)):
        clip_results = {
            'segment': video_segments[i].tolist(),
        }
        if opt.mode == 'score':
            clip_results['label'] = class_names[max_indices[i]]
            clip_results['scores'] = video_outputs[i].tolist()
        elif opt.mode == 'feature':
            clip_results['features'] = video_outputs[i].tolist()
            clip_results['ground_truth_annotation'] = annotation_digit
        results['clips'].append(clip_results)
    # Stack the per-clip feature vectors into one (n_clips, 2048) array.
    feature_list = [clip['features'] for clip in results['clips']]
    if feature_list:
        np_data = np.asarray(feature_list, dtype=np.float64)
    else:
        np_data = np.empty((0, 2048), dtype=np.float64)
    return np_data
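
# Hedged usage sketch: extract_features above returns an (n_clips, 2048)
# array per video, so a driver might loop over a directory of frame folders
# and save one .npy per video. The paths and directory layout are
# illustrative assumptions.
def _demo_extract_dataset(frame_root, out_root, class_names, model, opt):
    import os
    import numpy as np
    for name in sorted(os.listdir(frame_root)):
        feats = extract_features(os.path.join(frame_root, name), name,
                                 class_names, model, opt)
        np.save(os.path.join(out_root, name + '.npy'), feats)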