def transform_screen_data(self, screen):
    # Convert to float, rescale to [0, 1], convert to tensor
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Use the torchvision package to compose image transforms
    resize = T.Compose([T.ToPILImage(), T.Resize((40, 90)), T.ToTensor()])
    return resize(screen).unsqueeze(0).to(self.device)  # add a batch dimension
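# Usage sketch (an assumption, not from the original source): the owning class is stood in
# for by a SimpleNamespace exposing only `self.device`, and the raw screen is a dummy
# channels-first uint8 array, the layout ToPILImage expects after the float rescale.
import types

import numpy as np
import torch
import torchvision.transforms as T

owner = types.SimpleNamespace(device=torch.device('cpu'))
dummy_screen = np.random.randint(0, 256, size=(3, 160, 360), dtype=np.uint8)
state = transform_screen_data(owner, dummy_screen)
print(state.shape)  # expected: torch.Size([1, 3, 40, 90])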
import cv2
import numpy as np
import torch
from torch.autograd import Variable


def image2Tensor(cv2im, resize_im=True):
    # ImageNet channel-wise normalization constants
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    if resize_im:
        cv2im = cv2.resize(cv2im, (224, 224))
    im_as_arr = np.float32(cv2im)
    # BGR (OpenCV order) -> RGB, then HWC -> CHW
    im_as_arr = np.ascontiguousarray(im_as_arr[..., ::-1])
    im_as_arr = im_as_arr.transpose(2, 0, 1)
    # Scale to [0, 1], then normalize each channel
    for channel, _ in enumerate(im_as_arr):
        im_as_arr[channel] /= 255
        im_as_arr[channel] -= mean[channel]
        im_as_arr[channel] /= std[channel]
    im_as_ten = torch.from_numpy(im_as_arr).float()
    im_as_ten.unsqueeze_(0)  # add a batch dimension
    im_as_var = Variable(im_as_ten, requires_grad=True)
    return im_as_var
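# Usage sketch (an assumption, not from the original source): a random BGR uint8 image
# stands in for the output of cv2.imread; the result is a normalized (1, 3, 224, 224)
# tensor wrapped in a Variable with gradients enabled.
dummy_bgr = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
input_var = image2Tensor(dummy_bgr)
print(input_var.shape)          # torch.Size([1, 3, 224, 224])
print(input_var.requires_grad)  # True, ready for gradient-based visualizations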
def read_feat(self, video_name, f_init=None, duration=None, return_reshaped=True):
    """Read visual encoder features and stack them into memory.

    Parameters
    ----------
    video_name : str
        Video identifier.
    f_init : int, optional
        Initial frame index. By default the features are sliced from frame 1.
    duration : int, optional
        Duration in terms of number of frames. By default it extends to the last feature.
    return_reshaped : bool
        Return the stack of features reshaped after processing.
    """
    if not self.fobj:
        raise ValueError('The object instance is not open.')
    T = self.t_size
    s = self.t_delta
    # Slice the temporal window of interest with stride s
    if f_init and duration:
        frames_of_interest = range(f_init, f_init + duration - T + 1, s)
        feat = self.fobj[video_name][self.feat_id][frames_of_interest, :]
    elif f_init and (not duration):
        feat = self.fobj[video_name][self.feat_id][f_init:-T + 1:s, :]
    elif (not f_init) and duration:
        feat = self.fobj[video_name][self.feat_id][:duration - T + 1:s, :]
    else:
        feat = self.fobj[video_name][self.feat_id][:-T + 1:s, :]
    pooled_feat = self._feature_pooling(feat)
    if not return_reshaped:
        feat_dim = feat.shape[1]
        pooled_feat = pooled_feat.reshape((-1, feat_dim))
    # Ensure a C-contiguous array is returned
    if not pooled_feat.flags['C_CONTIGUOUS']:
        return np.ascontiguousarray(pooled_feat)
    return pooled_feat
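# Self-contained sketch (every name below is an assumption, not from the original source):
# nested dicts and a numpy array emulate the expected layout fobj[video_name][feat_id] ->
# (num_frames, feat_dim), and a mean-pooling lambda stands in for _feature_pooling, so
# read_feat can be exercised as a plain function on the first 64 frames of a clip.
import types

import numpy as np

fake_fobj = {'video_0001': {'features': np.random.rand(200, 500).astype(np.float32)}}
owner = types.SimpleNamespace(
    fobj=fake_fobj,
    feat_id='features',
    t_size=16,   # temporal extent covered by each feature
    t_delta=8,   # stride between consecutive features
    _feature_pooling=lambda feat: feat.mean(axis=0, keepdims=True),
)
feat = read_feat(owner, 'video_0001', f_init=1, duration=64)
print(feat.shape)  # (1, 500) with this stand-in pooling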
def read_feat(self, video_name, f_init=None, duration=None, return_reshaped=True):
    """Read C3D features and stack them into memory.

    Parameters
    ----------
    video_name : str
        Video identifier.
    f_init : int, optional
        Initial frame index. By default the features are sliced from frame 1.
    duration : int, optional
        Duration in terms of number of frames. By default it extends to the last feature.
    return_reshaped : bool
        Return the stack of features reshaped when pooling is applied.
    """
    if not self.fobj:
        raise ValueError('The object instance is not open.')
    s = self.t_stride
    t_size = self.t_size
    # Slice the temporal window of interest with stride s
    if f_init and duration:
        frames_of_interest = range(f_init, f_init + duration - t_size + 1, s)
        feat = self.fobj[video_name][self.feat_id][frames_of_interest, :]
    elif f_init and (not duration):
        feat = self.fobj[video_name][self.feat_id][f_init:-t_size + 1:s, :]
    elif (not f_init) and duration:
        feat = self.fobj[video_name][self.feat_id][:duration - t_size + 1:s, :]
    else:
        feat = self.fobj[video_name][self.feat_id][:-t_size + 1:s, :]
    pooled_feat = self._feature_pooling(feat)
    if not return_reshaped:
        feat_dim = feat.shape[1]
        pooled_feat = pooled_feat.reshape((-1, feat_dim))
    # Ensure a C-contiguous array is returned
    if not pooled_feat.flags['C_CONTIGUOUS']:
        return np.ascontiguousarray(pooled_feat)
    return pooled_feat
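# Sketch of the default branch (all names are assumptions, not from the original source):
# with f_init and duration omitted, the feature matrix is sliced as [:-t_size + 1:t_stride],
# i.e. every t_stride-th feature whose temporal window still fits inside the video.
import types

import numpy as np

fake_fobj = {'video_0002': {'c3d_features': np.random.rand(120, 4096).astype(np.float32)}}
owner = types.SimpleNamespace(
    fobj=fake_fobj,
    feat_id='c3d_features',
    t_size=16,
    t_stride=8,
    _feature_pooling=lambda feat: feat.max(axis=0, keepdims=True),  # max-pool stand-in
)
pooled = read_feat(owner, 'video_0002')
print(pooled.shape)  # (1, 4096) with this stand-in pooling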