def __getitem__(self, index):
    os.chdir(self._data_root)
    # siamese label: '0' means same, '1' means diff
    video_pairs = self._videos_list[index]
    pairs_data = []  # (img1, img2)
    for video in video_pairs:
        # frames shape: (num_frames, height, width, channels)
        # process iframe
        frames, _, _ = read_video(video)
        # print(frames.shape)
        if len(frames) == 0:
            print("decode frame failed")
            frames = np.zeros((self._num_segments, 256, 340, 3), dtype=np.float32)
        frames = random_sample(frames, self._num_segments) if self._is_train \
            else fix_sample(frames, self._num_segments)
        frames = np.asarray(frames, dtype=np.float32)
        frames = self._iframe_transform(frames) if self._is_train \
            else self._infer_transform(frames)
        frames = np.asarray(frames, dtype=np.float32) / 255.0
        frames = np.transpose(frames, (3, 0, 1, 2))  # (T, H, W, C) -> (C, T, H, W)
        frames = (frames - self._input_mean) / self._input_std
        pairs_data.append(frames)
    return pairs_data, self._labels_list[index]
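The `random_sample` and `fix_sample` helpers used above are not shown. A minimal sketch of what they might look like, assuming TSN-style sampling over the first axis (one random frame per segment for training, the segment centre for inference); the names and behaviour are assumptions, not the original implementation:

import numpy as np

def random_sample(frames, num_segments):
    # assumed helper: pick one random frame inside each of num_segments equal chunks
    seg_len = max(len(frames) // num_segments, 1)
    idxs = [min(i * seg_len + np.random.randint(seg_len), len(frames) - 1)
            for i in range(num_segments)]
    return frames[idxs]

def fix_sample(frames, num_segments):
    # assumed deterministic counterpart: take the middle frame of every chunk
    seg_len = max(len(frames) // num_segments, 1)
    idxs = [min(i * seg_len + seg_len // 2, len(frames) - 1)
            for i in range(num_segments)]
    return frames[idxs]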
def decode_video_with_av(
    encoded_video: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
    # read_video normally expects a path on disk; patch the existence check so it
    # accepts a file-like buffer wrapping the in-memory encoded tensor instead.
    with unittest.mock.patch("torchvision.io.video.os.path.exists", return_value=True):
        return read_video(ReadOnlyTensorBuffer(encoded_video))  # type: ignore[arg-type]
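A possible way to call this wrapper, assuming the encoded bytes are loaded with torchvision.io.read_file; the clip path is only illustrative:

from torchvision.io import read_file

encoded = read_file("clip.mp4")  # hypothetical path; returns the file bytes as a 1-D uint8 tensor
frames, audio, info = decode_video_with_av(encoded)
print(frames.shape, info)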
def __getitem__(self, index):
    # Cut each video into K segments, extract one snippet from each of them
    # (random for training, segment centre for testing) and return them with the label.
    video_name = self.videos[index]
    video_frames, _, _ = read_video(video_name)
    length_video = video_frames.shape[0]
    length_of_segment = length_video // self.no_segments
    snippets = torch.zeros((self.no_segments, video_frames.shape[3],
                            video_frames.shape[1], video_frames.shape[2]))
    for i in range(self.no_segments):
        start = i * length_of_segment
        finish = min(start + length_of_segment, length_video)
        if self.training:
            # for training the index is random within the segment
            idx = np.random.randint(start, finish)
        else:
            # for testing the index is the middle of the segment
            idx = int(start + length_of_segment // 2)
        snippet = video_frames[idx].permute((2, 0, 1))
        snippets[i] = snippet
    label = self.labels[index]
    # return the same label no_segments times, one per segment, for training purposes
    returned_labels = [label] * self.no_segments
    return snippets, torch.LongTensor(returned_labels)
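A sketch of how such a dataset might be consumed, assuming a hypothetical dataset built around this __getitem__ and a 2D CNN that scores each snippet independently, with TSN-style averaging of the segment scores; dataset and model are assumptions, not part of the original code:

import torch
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=4, shuffle=True)  # dataset is assumed to exist

for snippets, labels in loader:
    # snippets: (B, no_segments, C, H, W), labels: (B, no_segments)
    b, s, c, h, w = snippets.shape
    logits = model(snippets.view(b * s, c, h, w))  # model is an assumed per-frame classifier
    logits = logits.view(b, s, -1).mean(dim=1)     # average the segment scores
    loss = torch.nn.functional.cross_entropy(logits, labels[:, 0])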
def extract(self, device: torch.device, model: torch.nn.Module, classifier: torch.nn.Module,
            video_path: Union[str, None] = None) -> Dict[str, np.ndarray]:
    '''The extraction call. Made to keep the forward call clean.

    Arguments:
        device {torch.device}
        model {torch.nn.Module}
        classifier {torch.nn.Module} -- pre-trained classification layer, used if show_pred is True

    Keyword Arguments:
        video_path {Union[str, None]} -- pass a path if you would like to use the extractor
            in a "path -> model features" fashion (default: {None})

    Returns:
        Dict[str, np.ndarray] -- the dict with the numpy features
    '''
    # take the video, change fps and save to the tmp folder
    if self.extraction_fps is not None:
        video_path = reencode_video_with_diff_fps(video_path, self.tmp_path, self.extraction_fps)

    # read a video
    rgb, audio, info = read_video(video_path, pts_unit='sec')
    # prepare data (first -- transform, then -- unsqueeze)
    rgb = self.transforms(rgb)
    rgb = rgb.unsqueeze(0)
    # slice the video along the temporal dimension into (start, end) stacks
    slices = form_slices(rgb.size(2), self.stack_size, self.step_size)

    vid_feats = []
    for stack_idx, (start_idx, end_idx) in enumerate(slices):
        # inference
        with torch.no_grad():
            output = model(rgb[:, :, start_idx:end_idx, :, :].to(device))
            vid_feats.extend(output.tolist())

            # show predictions on the Kinetics dataset (might be useful for debugging)
            if self.show_pred:
                logits = classifier(output)
                print(f'{video_path} @ frames ({start_idx}, {end_idx})')
                show_predictions_on_dataset(logits, 'kinetics')

    feats_dict = {
        self.feature_type: np.array(vid_feats),
    }
    return feats_dict
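form_slices is assumed above but not shown. A minimal sketch under the assumption that it returns (start, end) index pairs for fixed-size temporal stacks taken every step_size frames; the real helper may handle the leftover tail differently:

from typing import List, Tuple

def form_slices(size: int, stack_size: int, step_size: int) -> List[Tuple[int, int]]:
    # assumed helper: (start, end) pairs for windows of stack_size frames, strided by step_size
    slices = []
    for start in range(0, size - stack_size + 1, step_size):
        slices.append((start, start + stack_size))
    return slices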
def example_load_frame():
    v, a, info = read_video("/home/tt/Videos/VID_20201202_133703_090.mp4", pts_unit='sec')
    print(v.shape)  # torch.Size([467, 1080, 1920, 3])

    # write a single frame to disk
    single_frame = v[100]
    print(single_frame.shape)  # torch.Size([1080, 1920, 3])
    single_frame = single_frame.permute(2, 0, 1)  # HWC -> CHW
    print(single_frame.shape)

    file_out = os.path.join(out_path, "single_frame.png")
    write_png(single_frame, file_out)
    print("done write to ", file_out)
def video_test():
    """Test video read/write functions."""
    transformer = transforms.ToPILImage()

    vframes, aframes, info = video.read_video("/tmp/a.mp4")
    # vframes format: [T, H, W, C], data range is [0, 255]

    # H, W, C ==> C, H, W
    image = transformer(vframes[0].permute(2, 0, 1))
    # image data range: [0, 255]
    # tensor = transforms.ToTensor()(image)  # ==> tensor data range: [0.0, 1.0]
    image.show()
def predict_sr(model, device, input_video, output_dir):
    """Predict SR model."""
    vframes, aframes, info = read_video(input_video, pts_unit='sec')
    # vframes format: [T, H, W, C], data range: [0, 255], good for h5!

    for i in tqdm(range(len(vframes))):
        input_tensor = vframes[i].permute(2, 0, 1).float() / 255.0
        # input_tensor format: CxHxW, data range [0.0, 1.0]
        input_tensor.unsqueeze_(0)
        input_tensor = input_tensor.to(device)
        with torch.no_grad():
            output_tensor = model(input_tensor)
        output_tensor.squeeze_(0)
        output_image = tensor_to_image(output_tensor.cpu())
        output_image.save("{}/{:03d}.png".format(output_dir, i))
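tensor_to_image is not defined in this snippet. A minimal sketch, assuming it maps a CHW float tensor in [0, 1] back to a PIL image; the name and behaviour are assumptions:

import torchvision.transforms as transforms

def tensor_to_image(tensor):
    # assumed helper: clamp to [0, 1] and convert a CHW float tensor to a PIL image
    return transforms.ToPILImage()(tensor.clamp(0.0, 1.0))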
from torchvision.io.video import read_video, write_video
from torchvision.io.image import write_jpeg
import torch
from args_util import meow_parse
from data_flow import get_predict_video_dataloader
from models import create_model
import os
from visualize_util import save_density_map_normalize, save_density_map

VIDEO_PATH = "/home/tt/Videos/VID_20201204_133931_404.mp4"
OUTPUT_PATH = "/data/my_crowd_image/video_bike_q100"

v, a, info = read_video(VIDEO_PATH, pts_unit='sec')
print(info)
print(v.shape)
length = v.shape[0]
print(length)

count = 0
for i in range(length):
    # if (i % 20 == 0):
    frame = v[i]
    frame = frame.permute(2, 0, 1)  # HWC -> CHW for write_jpeg
    file_out_path = os.path.join(OUTPUT_PATH, "IMG_" + str(i) + ".jpg")
    write_jpeg(frame, file_out_path, quality=100)
    print(file_out_path)
def __getitem__(self, index):
    # ============= EXTRACT RGB SNIPPETS ================
    video_name = self.videos_rgb[index]
    video_frames, _, _ = read_video(video_name)
    length_video = video_frames.shape[0]
    length_of_segment = length_video // self.no_segments
    snippets_rgb = torch.zeros((self.no_segments, video_frames.shape[3],
                                video_frames.shape[1], video_frames.shape[2]))
    for i in range(self.no_segments):
        start = i * length_of_segment
        # for testing the index is the middle of the segment
        idx = int(start + length_of_segment // 2)
        snippet = video_frames[idx].permute((2, 0, 1))
        snippets_rgb[i] = snippet

    # ============= EXTRACT FLOW SNIPPETS ================
    video_name_flow = self.videos_flow[index]
    # get the list of all files in the flow folder
    flow_images = [
        f for f in listdir(video_name_flow) if isfile(join(video_name_flow, f))
    ]
    # read the first flow image to get the dimensions
    first_image_flow = tf.to_tensor(Image.open(video_name_flow + '/' + flow_images[0]))
    length_video = len(flow_images) // 2
    length_of_segment = length_video // self.no_segments
    snippets_flow = torch.zeros((self.no_segments, 5 * 2,
                                 first_image_flow.shape[1], first_image_flow.shape[2]))
    for i in range(self.no_segments):
        start = i * length_of_segment
        for flow_idx in range(5):
            idx = start + flow_idx
            # x flow images fill the first half of the folder, y flow images the second half
            x_flow_image = Image.open(video_name_flow + '/' + flow_images[idx])
            x_flow_image = tf.to_tensor(x_flow_image)
            y_flow_image = Image.open(video_name_flow + '/' + flow_images[idx + length_video])
            y_flow_image = tf.to_tensor(y_flow_image)
            snippets_flow[i][2 * flow_idx] = x_flow_image
            snippets_flow[i][2 * flow_idx + 1] = y_flow_image

    label = self.labels[index]
    # return the same label no_segments times, one per segment, for training purposes
    returned_labels = [label] * self.no_segments
    return (snippets_rgb, snippets_flow), torch.LongTensor(returned_labels)
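A hedged sketch of how the two streams returned above might be fused at test time, using late fusion of averaged softmax scores; rgb_model, flow_model and dataset are assumptions and are not part of the original code:

import torch

(snippets_rgb, snippets_flow), labels = dataset[0]  # dataset is assumed to exist
with torch.no_grad():
    # each model scores its no_segments snippets as a batch
    rgb_scores = rgb_model(snippets_rgb).softmax(dim=-1).mean(dim=0)
    flow_scores = flow_model(snippets_flow).softmax(dim=-1).mean(dim=0)
    prediction = (rgb_scores + flow_scores).argmax().item()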