def generateSampleFace(self, idx):
    sf = self.scale_factor
    rf = self.rot_factor

    main_pts = load_lua(
        os.path.join(self.img_folder, 'landmarks_t7',
                     self.anno[idx].split('_')[0],
                     self.anno[idx][:-4] + '.t7'))
    pts = main_pts[0] if self.pointType == '2D' else main_pts[1]

    c = torch.Tensor((450 / 2, 450 / 2 + 50))
    s = 1.8
    img = load_image(
        os.path.join(self.img_folder, self.anno[idx].split('_')[0],
                     self.anno[idx][:-8] + '.jpg'))

    r = 0
    if self.is_train:
        # Random scale/rotation augmentation, clamped to a bounded range
        s = s * torch.randn(1).mul_(sf).add_(1).clamp(1 - sf, 1 + sf)[0]
        r = torch.randn(1).mul_(rf).clamp(-2 * rf, 2 * rf)[0] \
            if random.random() <= 0.6 else 0

        if random.random() <= 0.5:
            # Horizontal flip: mirror the image, swap landmark pairs, flip center x
            img = torch.from_numpy(fliplr(img.numpy())).float()
            pts = shufflelr(pts, width=img.size(2), dataset='w300lp')
            c[0] = img.size(2) - c[0]

        # Per-channel color jitter
        img[0, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)
        img[1, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)
        img[2, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)

    # Prepare image and ground-truth map
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s, r, 256, 200)
    inp = imutils.im_to_torch(inp).float()
    pts_input_res = HumanAug.TransformPts(pts.numpy(), c.numpy(), s, r, 256, 200)
    pts_aug = pts_input_res * (1. * 64 / 256)

    # Generate the ground-truth heatmap at the 64x64 output resolution
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug, [64, 64], sigma=1)
    heatmap = torch.from_numpy(heatmap).float()

    return inp, heatmap, pts, c, s, pts_input_res
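# HumanPts.pts2heatmap is used throughout this section but not defined here.
# Below is a minimal, hedged sketch of what such a helper typically does
# (one Gaussian peak per landmark, matching the (pts, [H, W], sigma) call
# signature above); it is an illustrative stand-in, not the project's code.
import numpy as np

def pts2heatmap_sketch(pts, out_shape, sigma=1):
    H, W = out_shape
    heatmap = np.zeros((len(pts), H, W), dtype=np.float32)
    yy, xx = np.mgrid[0:H, 0:W]
    for i, (x, y) in enumerate(pts):
        if x <= 0 and y <= 0:
            # Masked/invisible points produce an empty map
            continue
        heatmap[i] = np.exp(-((xx - x) ** 2 + (yy - y) ** 2) / (2 * sigma ** 2))
    return heatmap, pts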
def __getitem__(self, index):
    if self.is_train:
        a = self.anno[self.train[index]]
    else:
        a = self.anno[self.valid[index]]

    img_path = os.path.join(self.img_folder, a['img_paths'])
    pts = torch.Tensor(a['joint_self'])
    pts = pts[:, 0:2]  # keep (x, y) only
    c = torch.Tensor(a['objpos'])
    s = torch.Tensor([a['scale_provided']])
    if a['dataset'] == 'MPII':
        # Shift the center down and enlarge the scale to better enclose the person
        c[1] = c[1] + 15 * s[0]
        s = s * 1.25
        normalizer = a['normalizer'] * 0.6
    elif a['dataset'] == 'LEEDS':
        print('using lsp data')
        s = s * 1.4375
        normalizer = torch.dist(pts[2, :], pts[13, :])
    else:
        print('no such dataset {}'.format(a['dataset']))

    # For single-person pose estimation with a centered/scaled figure
    img = imutils.load_image(img_path)
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s.numpy(), 0,
                        self.inp_res, self.std_size)
    inp = imutils.im_to_torch(inp).float()

    # Normalize the per-sample scale/rotation histograms into distributions
    tmp_scale_distri = self.grnd_scale_distri[index] / \
        self.grnd_scale_distri[index].sum()
    tmp_rot_distri = self.grnd_rotation_distri[index] / \
        self.grnd_rotation_distri[index].sum()

    return inp, tmp_scale_distri, tmp_rot_distri, index
def load_rgb_video(video_path: Path, video_url: str, fps: int) -> torch.Tensor:
    """
    Load frames of a video using cv2 (fetch the file from the provided URL
    if it is not found at the given location).
    """
    fetch_from_url(url=video_url, dest_path=video_path)
    cap = cv2.VideoCapture(str(video_path))
    cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    cap_fps = cap.get(cv2.CAP_PROP_FPS)

    # cv2 cannot change frame rates for all encodings, so re-encode with ffmpeg
    if cap_fps != fps:
        tmp_video_path = f"{video_path}.tmp{video_path.suffix}"
        shutil.move(video_path, tmp_video_path)
        cmd = (f"ffmpeg -i {tmp_video_path} -pix_fmt yuv420p "
               f"-filter:v fps=fps={fps} {video_path}")
        print(f"Generating new copy of video with frame rate {fps}")
        os.system(cmd)
        Path(tmp_video_path).unlink()
        cap = cv2.VideoCapture(str(video_path))
        cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cap_fps = cap.get(cv2.CAP_PROP_FPS)
        assert cap_fps == fps, f"ffmpeg failed to produce a video at {fps} fps"

    f = 0
    rgb = []
    while True:
        # frame: BGR, (h, w, 3), dtype=uint8 in 0..255
        ret, frame = cap.read()
        if not ret:
            break
        # BGR (OpenCV) to RGB (Torch)
        frame = frame[:, :, [2, 1, 0]]
        rgb_t = im_to_torch(frame)
        rgb.append(rgb_t)
        f += 1
    cap.release()

    # (nframes, 3, cap_height, cap_width) => (3, nframes, cap_height, cap_width)
    rgb = torch.stack(rgb).permute(1, 0, 2, 3)
    print(f"Loaded video {video_path} with {f} frames "
          f"[{cap_height}hx{cap_width}w] res. at {cap_fps} fps")
    return rgb
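# Hypothetical usage of load_rgb_video; the path, URL, and fps below are
# illustrative placeholders, not values from the source.
from pathlib import Path

rgb = load_rgb_video(
    video_path=Path("data/sample.mp4"),           # fetched from the URL if missing
    video_url="https://example.com/sample.mp4",   # placeholder URL
    fps=25,                                       # re-encoded via ffmpeg if needed
)
print(rgb.shape)  # torch.Size([3, nframes, height, width]), values in 0..1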
def gen_img_heatmap(self, c, s, r, img, pts):
    # Prepare the network input by cropping around center c with scale s
    # and rotation r
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s.numpy(), r,
                        self.inp_res, self.std_size)
    inp = imutils.im_to_torch(inp).float()

    # Transform the keypoints into the output-resolution coordinate frame
    pts_aug = HumanAug.TransformPts(pts.numpy(), c.numpy(), s.numpy(), r,
                                    self.out_res, self.std_size)

    # Zero out points that were invalid (non-positive coordinates) before
    # augmentation
    idx_indicator = (pts[:, 0] <= 0) | (pts[:, 1] <= 0)
    idx = torch.arange(0, pts.size(0)).long()
    idx = idx[idx_indicator]
    pts_aug[idx, :] = 0

    # Generate the ground-truth heatmap
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug,
                                            [self.out_res, self.out_res],
                                            sigma=1)
    heatmap = torch.from_numpy(heatmap).float()
    return inp, heatmap
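# The im_to_numpy / im_to_torch helpers are assumed throughout this section.
# A minimal sketch of the usual convention (CxHxW float tensor in 0..1 on the
# torch side, HxWxC array on the numpy side) is given below as an assumption;
# the project's own imutils may differ in details.
import numpy as np
import torch

def im_to_numpy_sketch(img: torch.Tensor) -> np.ndarray:
    # C x H x W tensor -> H x W x C array
    return img.detach().cpu().numpy().transpose(1, 2, 0)

def im_to_torch_sketch(img: np.ndarray) -> torch.Tensor:
    # H x W x C array -> C x H x W float tensor, scaled to 0..1 if uint8-like
    img = np.transpose(img, (2, 0, 1)).astype(np.float32)
    if img.max() > 1:
        img /= 255.0
    return torch.from_numpy(img)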
def create_UV_maps(self, index=0):
    data = {}
    image_path = self.images[index]
    image = cv2.imread(image_path)
    h, w = image.shape[:2]
    if h != 256 or w != 256:
        # Resize so the longer side becomes 256; img_reshape is then expected
        # to bring the result to a square 256x256
        max_size = max(h, w)
        ratio = 256 / max_size
        image = cv2.resize(image, None, fx=ratio, fy=ratio,
                           interpolation=cv2.INTER_CUBIC)
        image = img_reshape(image)
    assert image.shape[0] == 256 and image.shape[1] == 256, \
        "The image size must be 256x256x3"
    dst_image = image
    inp = im_to_torch(dst_image)
    data['img'] = inp
    return data
def main(args):
    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class,
                                        pretrained=False)
    model = torch.nn.DataParallel(model).cuda()

    test_loader = torch.utils.data.DataLoader(MscocoMulti(cfg, train=False),
                                              batch_size=args.batch * args.num_gpus,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # load training weights
    checkpoint_file = os.path.join(args.checkpoint, args.test + '.pth.tar')
    checkpoint = torch.load(checkpoint_file)
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (epoch {})".format(
        checkpoint_file, checkpoint['epoch']))

    # change to evaluation mode
    model.eval()

    print('testing...')
    full_result = []
    for i, (inputs, meta) in tqdm(enumerate(test_loader)):
        with torch.no_grad():
            input_var = torch.autograd.Variable(inputs.cuda())
            if args.flip:
                # Build a horizontally flipped copy of the batch
                flip_inputs = inputs.clone()
                for j, finp in enumerate(flip_inputs):
                    finp = im_to_numpy(finp)
                    finp = cv2.flip(finp, 1)
                    flip_inputs[j] = im_to_torch(finp)
                flip_input_var = torch.autograd.Variable(flip_inputs.cuda())

            # compute output
            global_outputs, refine_output = model(input_var)
            score_map = refine_output.data.cpu()
            score_map = score_map.numpy()

            if args.flip:
                # Average the original score maps with the flipped-back ones
                flip_global_outputs, flip_output = model(flip_input_var)
                flip_score_map = flip_output.data.cpu()
                flip_score_map = flip_score_map.numpy()

                for j, fscore in enumerate(flip_score_map):
                    fscore = fscore.transpose((1, 2, 0))
                    fscore = cv2.flip(fscore, 1)
                    fscore = list(fscore.transpose((2, 0, 1)))
                    for (q, w) in cfg.symmetry:  # swap left/right channels
                        fscore[q], fscore[w] = fscore[w], fscore[q]
                    fscore = np.array(fscore)
                    score_map[j] += fscore
                    score_map[j] /= 2

            ids = meta['imgID'].numpy()
            det_scores = meta['det_scores']
            for b in range(inputs.size(0)):
                details = meta['augmentation_details']
                single_result_dict = {}
                single_result = []

                single_map = score_map[b]
                r0 = single_map.copy()
                r0 /= 255
                r0 += 0.5
                v_score = np.zeros(17)
                for p in range(17):
                    single_map[p] /= np.amax(single_map[p])
                    border = 10
                    dr = np.zeros((cfg.output_shape[0] + 2 * border,
                                   cfg.output_shape[1] + 2 * border))
                    dr[border:-border, border:-border] = single_map[p].copy()
                    dr = cv2.GaussianBlur(dr, (21, 21), 0)
                    # Take the top-2 peaks and nudge the argmax a quarter
                    # pixel towards the second peak
                    lb = dr.argmax()
                    y, x = np.unravel_index(lb, dr.shape)
                    dr[y, x] = 0
                    lb = dr.argmax()
                    py, px = np.unravel_index(lb, dr.shape)
                    y -= border
                    x -= border
                    py -= border + y
                    px -= border + x
                    ln = (px ** 2 + py ** 2) ** 0.5
                    delta = 0.25
                    if ln > 1e-3:
                        x += delta * px / ln
                        y += delta * py / ln
                    x = max(0, min(x, cfg.output_shape[1] - 1))
                    y = max(0, min(y, cfg.output_shape[0] - 1))
                    # Map heatmap coordinates back to the original image
                    resy = float((4 * y + 2) / cfg.data_shape[0] *
                                 (details[b][3] - details[b][1]) + details[b][1])
                    resx = float((4 * x + 2) / cfg.data_shape[1] *
                                 (details[b][2] - details[b][0]) + details[b][0])
                    v_score[p] = float(r0[p, int(round(y) + 1e-10),
                                          int(round(x) + 1e-10)])
                    single_result.append(resx)
                    single_result.append(resy)
                    single_result.append(1)
                if len(single_result) != 0:
                    single_result_dict['image_id'] = int(ids[b])
                    single_result_dict['category_id'] = 1
                    single_result_dict['keypoints'] = single_result
                    single_result_dict['score'] = float(det_scores[b]) * v_score.mean()
                    full_result.append(single_result_dict)

    result_path = args.result
    if not isdir(result_path):
        mkdir_p(result_path)
    result_file = os.path.join(result_path, 'result.json')
    with open(result_file, 'w') as wf:
        json.dump(full_result, wf)

    # evaluate on COCO
    eval_gt = COCO(cfg.ori_gt_path)
    eval_dt = eval_gt.loadRes(result_file)
    cocoEval = COCOeval(eval_gt, eval_dt, iouType='keypoints')
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
def crop(img, center, scale, res, rot=0):
    img = im_to_numpy(img)

    # Preprocessing for efficient cropping
    ht, wd = img.shape[0], img.shape[1]
    sf = scale * 200.0 / res[0]
    if sf < 2:
        # If the image-to-output scale ratio is small, proceed as-is
        sf = 1
    else:
        # Downscale first so the person ends up a sensible size when mapped
        # to the output resolution
        new_size = int(np.math.floor(max(ht, wd) / sf))
        new_ht = int(np.math.floor(ht / sf))
        new_wd = int(np.math.floor(wd / sf))
        if new_size < 2:
            print('cannot crop: the scaled image is degenerate')
            return torch.zeros(res[0], res[1], img.shape[2]) \
                if len(img.shape) > 2 else torch.zeros(res[0], res[1])
        else:
            # cv2 resize takes dsize as (width, height)
            img = cv2.resize(img, dsize=(new_wd, new_ht),
                             interpolation=cv2.INTER_LINEAR)
            # Express center and scale in the downscaled coordinate frame
            center = center * 1.0 / sf
            scale = scale / sf

    # Upper-left point: invert=1 because we want to know where output
    # coordinate [0, 0] lands in the (possibly downscaled) source image
    ul = np.array(affine_transform([0, 0], center, scale, res, invert=1))
    # Bottom-right point
    br = np.array(affine_transform(res, center, scale, res, invert=1))

    # Padding so that rotation does not clip content out of the patch
    pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2)
    if rot != 0:
        ul -= pad
        br += pad

    # Crop area
    new_shape = [br[1] - ul[1], br[0] - ul[0]]
    if len(img.shape) > 2:
        new_shape += [img.shape[2]]
    new_img = np.zeros(new_shape)  # HWC

    # Range to fill in the new array (out-of-bounds region stays zero)
    new_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
    new_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
    # Range to sample from the original image
    old_x = max(0, ul[0]), min(img.shape[1], br[0])
    old_y = max(0, ul[1]), min(img.shape[0], br[1])
    new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = \
        img[old_y[0]:old_y[1], old_x[0]:old_x[1]]

    if rot != 0:
        # Rotate about the patch center, then remove the padding
        rmat = cv2.getRotationMatrix2D((new_shape[1] / 2, new_shape[0] / 2),
                                       rot, 1)
        new_img = cv2.warpAffine(new_img, rmat, (new_shape[1], new_shape[0]))
        new_img = new_img[pad:-pad, pad:-pad]

    new_img = im_to_torch(
        cv2.resize(new_img, dsize=(res[1], res[0]),
                   interpolation=cv2.INTER_LINEAR))
    return new_img
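# Hypothetical call to crop(); the values are illustrative only. A scale of
# 1.2 corresponds to a person roughly 1.2 * 200 = 240 px tall in the source.
import numpy as np
import torch

img = torch.rand(3, 480, 640)        # C x H x W, values in 0..1
center = np.array([320.0, 240.0])    # (x, y) of the person center
patch = crop(img, center, scale=1.2, res=[256, 256], rot=30)
print(patch.shape)                   # torch.Size([3, 256, 256])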
def test(test_loader, model):
    model.eval()

    print('testing...')
    full_result = []
    flip = True
    for i, (inputs, meta) in tqdm(enumerate(test_loader)):
        with torch.no_grad():
            input_var = torch.autograd.Variable(inputs.cuda())
            if flip:
                # Build a horizontally flipped copy of the batch
                flip_inputs = inputs.clone()
                for j, finp in enumerate(flip_inputs):
                    finp = im_to_numpy(finp)
                    finp = cv2.flip(finp, 1)
                    flip_inputs[j] = im_to_torch(finp)
                flip_input_var = torch.autograd.Variable(flip_inputs.cuda())

            # compute output
            global_outputs, refine_output = model(input_var)
            score_map = refine_output.data.cpu()
            score_map = score_map.numpy()

            if flip:
                # Average the original score maps with the flipped-back ones
                flip_global_outputs, flip_output = model(flip_input_var)
                flip_score_map = flip_output.data.cpu()
                flip_score_map = flip_score_map.numpy()

                for j, fscore in enumerate(flip_score_map):
                    fscore = fscore.transpose((1, 2, 0))
                    fscore = cv2.flip(fscore, 1)
                    fscore = list(fscore.transpose((2, 0, 1)))
                    for (q, w) in test_cfg.symmetry:  # swap left/right channels
                        fscore[q], fscore[w] = fscore[w], fscore[q]
                    fscore = np.array(fscore)
                    score_map[j] += fscore
                    score_map[j] /= 2

            ids = meta['imgID'].numpy()
            det_scores = meta['det_scores']
            for b in range(inputs.size(0)):
                details = meta['augmentation_details']
                single_result_dict = {}
                single_result = []

                single_map = score_map[b]
                r0 = single_map.copy()
                r0 /= 255
                r0 += 0.5
                v_score = np.zeros(17)
                for p in range(17):
                    single_map[p] /= np.amax(single_map[p])
                    border = 10
                    dr = np.zeros((test_cfg.output_shape[0] + 2 * border,
                                   test_cfg.output_shape[1] + 2 * border))
                    dr[border:-border, border:-border] = single_map[p].copy()
                    dr = cv2.GaussianBlur(dr, (21, 21), 0)
                    # Take the top-2 peaks and nudge the argmax a quarter
                    # pixel towards the second peak
                    lb = dr.argmax()
                    y, x = np.unravel_index(lb, dr.shape)
                    dr[y, x] = 0
                    lb = dr.argmax()
                    py, px = np.unravel_index(lb, dr.shape)
                    y -= border
                    x -= border
                    py -= border + y
                    px -= border + x
                    ln = (px ** 2 + py ** 2) ** 0.5
                    delta = 0.25
                    if ln > 1e-3:
                        x += delta * px / ln
                        y += delta * py / ln
                    x = max(0, min(x, test_cfg.output_shape[1] - 1))
                    y = max(0, min(y, test_cfg.output_shape[0] - 1))
                    # Map heatmap coordinates back to the original image
                    resy = float((4 * y + 2) / test_cfg.data_shape[0] *
                                 (details[b][3] - details[b][1]) + details[b][1])
                    resx = float((4 * x + 2) / test_cfg.data_shape[1] *
                                 (details[b][2] - details[b][0]) + details[b][0])
                    v_score[p] = float(r0[p, int(round(y) + 1e-10),
                                          int(round(x) + 1e-10)])
                    single_result.append(resx)
                    single_result.append(resy)
                    single_result.append(1)
                if len(single_result) != 0:
                    single_result_dict['image_id'] = int(ids[b])
                    single_result_dict['category_id'] = 1
                    single_result_dict['keypoints'] = single_result
                    single_result_dict['score'] = float(det_scores[b]) * v_score.mean()
                    full_result.append(single_result_dict)

    result_path = 'result'
    if not isdir(result_path):
        mkdir_p(result_path)
    result_file = os.path.join(result_path, 'result.json')
    with open(result_file, 'w') as wf:
        json.dump(full_result, wf)

    # evaluate on COCO
    eval_gt = COCO(test_cfg.ori_gt_path)
    eval_dt = eval_gt.loadRes(result_file)
    cocoEval = COCOeval(eval_gt, eval_dt, iouType='keypoints')
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
def __getitem__(self, index):
    if self.is_train:
        a = self.anno[self.train[index]]
    else:
        a = self.anno[self.valid[index]]

    img_path = os.path.join(self.img_folder, a['img_paths'])
    pts = torch.Tensor(a['joint_self'])
    pts = pts[:, 0:2]  # keep (x, y) only
    c = torch.Tensor(a['objpos'])
    s = torch.Tensor([a['scale_provided']])
    if a['dataset'] == 'MPII':
        # Shift the center down and enlarge the scale to better enclose the person
        c[1] = c[1] + 15 * s[0]
        s = s * 1.25
        normalizer = a['normalizer'] * 0.6
    elif a['dataset'] == 'LEEDS':
        print('using lsp data')
        s = s * 1.4375
        normalizer = torch.dist(pts[2, :], pts[13, :])
    else:
        print('no such dataset {}'.format(a['dataset']))

    # For single-person pose estimation with a centered/scaled figure
    img = imutils.load_image(img_path)

    r = 0
    if self.is_train:
        # Random scale and rotation, sampled from bounded Gaussians
        s = s * (2 ** (sample_from_bounded_gaussian(self.scale_factor)))
        r = sample_from_bounded_gaussian(self.rot_factor)
        if np.random.uniform(0, 1, 1) <= 0.6:
            r = 0

        # Flip
        if np.random.random() <= 0.5:
            img = torch.from_numpy(HumanAug.fliplr(img.numpy())).float()
            pts = HumanAug.shufflelr(pts, width=img.size(2), dataset='mpii')
            c[0] = img.size(2) - c[0]

        # Color jitter
        img[0, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[1, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[2, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)

    # Prepare image and ground-truth map
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s.numpy(), r,
                        self.inp_res, self.std_size)
    inp = imutils.im_to_torch(inp).float()
    pts_aug = HumanAug.TransformPts(pts.numpy(), c.numpy(), s.numpy(), r,
                                    self.out_res, self.std_size)

    # Generate ground truth
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug,
                                            [self.out_res, self.out_res],
                                            sigma=1)
    heatmap = torch.from_numpy(heatmap).float()
    r = torch.FloatTensor([r])

    if self.is_train:
        return inp, heatmap, c, s, r, pts, normalizer
    else:
        return inp, heatmap, c, s, r, pts, normalizer, index
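# sample_from_bounded_gaussian is not defined in this section. A plausible
# implementation, consistent with the clamped torch.randn augmentation used in
# generateSampleFace above, is sketched here as an assumption:
import numpy as np

def sample_from_bounded_gaussian_sketch(x):
    # Zero-mean Gaussian sample with std x, clipped to [-2x, 2x]
    return np.clip(np.random.randn() * x, -2 * x, 2 * x)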
def __getitem__(self, index):
    if self.is_train:
        a = self.anno[self.train[index]]
    else:
        a = self.anno[self.valid[index]]

    img_path = os.path.join(self.img_folder, a['img_paths'])
    pts_path = os.path.join(self.img_folder, a['pts_paths'])

    # Optional subset of landmark indices (currently unused)
    skip_pts = [33, 36, 39, 42, 45, 48, 51, 54, 57]
    if pts_path[-4:] == '.txt':
        pts = np.loadtxt(pts_path)  # L x 2
    elif pts_path[-4:] == '.pts':
        pts = FacePts.Pts2Lmk(pts_path)  # L x 2

    pts = torch.Tensor(pts)
    assert torch.sum(pts - torch.Tensor(a['pts'])) == 0
    s = torch.Tensor([a['scale_provided_det']]) * 1.1
    c = torch.Tensor(a['objpos_det'])

    # For single-person pose estimation with a centered/scaled figure
    img = imutils.load_image(img_path)

    r = 0
    if self.is_train:
        # Random scale and rotation, sampled from bounded Gaussians
        s = s * (2 ** (sample_from_bounded_gaussian(self.scale_factor)))
        r = sample_from_bounded_gaussian(self.rot_factor)
        if np.random.uniform(0, 1, 1) <= 0.6:
            r = np.array([0])

        # Color jitter (horizontal flipping is disabled for this dataset)
        img[0, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[1, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[2, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)

    # Prepare image and ground-truth map
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s.numpy(), r,
                        self.inp_res, self.std_size)
    inp = imutils.im_to_torch(inp).float()

    # Transform points to the input resolution, then scale down to the
    # heatmap grid
    pts_input_res = HumanAug.TransformPts(pts.numpy(), c.numpy(), s.numpy(),
                                          r, self.inp_res, self.std_size)
    pts_aug = pts_input_res * (1. * self.out_res / self.inp_res)

    # Generate ground truth
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug,
                                            [self.out_res, self.out_res],
                                            sigma=1)
    heatmap = torch.from_numpy(heatmap).float()

    if self.is_train:
        return inp, heatmap, pts_input_res
    else:
        return inp, heatmap, pts, index, c, s, img_path
def _load_rgb(self, ind, frame_ix):
    """Loads the video frames from file.

    frame_ix can be range(t, t + nframes) for consecutive reading,
    or a random sorted subset of [0, video_length] of size nframes.
    """
    is_consecutive = range(min(frame_ix), max(frame_ix) + 1) == frame_ix
    nframes = len(frame_ix)
    videofile = self._get_video_file(ind)

    use_cv2 = True
    if getattr(self, "video_data_dict", False):
        # Frames are pre-extracted and held in memory
        use_cv2 = False
        compressed_frames = self.video_data_dict[videofile]["data"]
    elif getattr(self, "featurize_mode", False) and not DISABLE_CACHING:
        # Reuse a cached capture that was left at the right frame position
        cap = None
        if not hasattr(self, "cached_caps"):
            self.cached_caps = defaultdict(OrderedDict)
        if videofile in self.cached_caps:
            cap = self.cached_caps[videofile].pop(frame_ix.start, None)
            assert is_consecutive, "capture caching should only use consecutive ims"
        if not cap:
            cap = cv2.VideoCapture(videofile)
            # Do the frame setting only once if the rest are consecutive
            if is_consecutive:
                cap.set(propId=1, value=frame_ix[0])
    else:
        cap = cv2.VideoCapture(videofile)
        # Do the frame setting only once if the rest are consecutive
        if is_consecutive:
            cap.set(propId=1, value=frame_ix[0])

    # Frame reads
    if getattr(self, "gpu_collation", False):
        msg = "expected collation dim == 256"
        assert self.gpu_collation == 256, msg
        rgb = torch.zeros(3, nframes, self.gpu_collation, self.gpu_collation)
    else:
        rgb = torch.zeros(
            3, nframes, self._get_img_height(ind), self._get_img_width(ind)
        )

    for f, fix in enumerate(frame_ix):
        if use_cv2:
            if not is_consecutive:
                cap.set(propId=1, value=fix)
            # frame: BGR, (240, 320, 3), dtype=uint8 0..255
            ret, frame = cap.read()
        else:
            ret = fix < len(compressed_frames)
            if ret:
                frame = Image.open(compressed_frames[fix])
        if ret:
            if use_cv2:
                # BGR (OpenCV) to RGB (Torch)
                frame = frame[:, :, [2, 1, 0]]
            # CxHxW (3, 240, 320), values in 0..1
            rgb_t = im_to_torch(frame)
            rgb[:, f, :, :] = rgb_t
        else:
            # Copy the last frame for temporal padding
            rgb[:, f, :, :] = rgb[:, f - 1, :, :]

    if use_cv2:
        if (
            hasattr(self, "featurize_mode")
            and self.featurize_mode
            and not DISABLE_CACHING
        ):
            if fix == self._get_nframes(ind):
                cap.release()
            else:
                # Store a pointer to the capture to avoid duplicate decoding
                self.cached_caps[videofile][frame_ix.stop] = cap
                if len(self.cached_caps[videofile]) > CAP_CACHE_LIMIT:
                    # We rely on OrderedDict to preserve key order; take the
                    # keys of this video's cache (not the outer dict)
                    oldest_keys = list(self.cached_caps[videofile].keys())
                    print(
                        f"Cache overflow [{len(self.cached_caps[videofile])}]"
                        f" >{CAP_CACHE_LIMIT}, clearing half of the keys"
                    )
                    for old_key in oldest_keys[: CAP_CACHE_LIMIT // 2]:
                        # Guard against race conditions by supplying a
                        # default for missing keys
                        self.cached_caps[videofile].pop(old_key, None)
        else:
            cap.release()

    rgb = video_to_im(rgb)
    return rgb
def _get_single_video(self, index, data_index, frame_ix):
    """Loads/augments/returns the video data.

    :param index: Index wrt the data loader
    :param data_index: Index wrt the train/valid list
    :param frame_ix: A list of frame indices to sample from the video
    :return data: Dictionary of input/output and other metadata
    """
    # If the input is pose (Pose->Sign experiments)
    if hasattr(self, "input_type") and self.input_type == "pose":
        data = {
            "rgb": self._get_pose(data_index, frame_ix),
            "index": index,
            "data_index": data_index,
            "class": self._get_class(data_index, frame_ix),
            "class_names": self.class_names,
            "dataset": self.datasetname,
        }
        return data

    # Otherwise the input is RGB
    rgb = self._load_rgb(data_index, frame_ix)

    if getattr(self, "mask_rgb", False):
        rgb = self._mask_rgb(
            rgb,
            data_index,
            frame_ix,
            region=self.mask_rgb,
            mask_type=self.mask_type,
        )

    if getattr(self, "gpu_collation", False):
        # Augmentation is deferred; return the raw frames with meta info
        data = {
            "rgb": rgb,
            "index": index,
            "data_index": data_index,
            "class": self._get_class(data_index, frame_ix),
            "class_names": self.class_names,
            "dataset": self.datasetname,
        }
        return data

    # Preparing RGB data
    if self.setname == "train":
        # Horizontal flip: disabled for now, should be done after bbox cropping
        is_hflip = random.random() < self.hflip
        if is_hflip:
            rgb = torch.flip(rgb, dims=[2])
        # Color jitter
        rgb = im_color_jitter(rgb, num_in_frames=self.num_in_frames, thr=0.2)

    rgb = im_to_numpy(rgb)
    iH, iW, iC = rgb.shape

    if self.use_bbox:
        # Crop to the normalized bounding box, mirroring it if flipped
        y0, x0, y1, x1 = self._get_bbox(data_index)
        y0 = max(0, int(y0 * iH))
        y1 = min(iH, int(y1 * iH))
        x0 = max(0, int(x0 * iW))
        x1 = min(iW, int(x1 * iW))
        if self.setname == "train" and is_hflip:
            x0 = iW - x0
            x1 = iW - x1
            x0, x1 = x1, x0
        rgb = rgb[y0:y1, x0:x1, :]
        rgb = resize_generic(
            rgb,
            self.resize_res,
            self.resize_res,
            interp="bilinear",
            is_flow=False,
        )

    iH, iW, iC = rgb.shape
    resol = self.resize_res  # e.g. 300 for 256, 130 for 112
    if self.setname == "train":
        # Augment the scaled resolution between:
        # [1 - self.scale_factor, 1 + self.scale_factor)
        rand_scale = random.random()
        resol *= 1 - self.scale_factor + 2 * self.scale_factor * rand_scale
        resol = int(resol)
    if iW > iH:
        nH, nW = resol, int(resol * iW / iH)
    else:
        nH, nW = int(resol * iH / iW), resol
    # Resize to nH x nW resolution, preserving the aspect ratio
    rgb = resize_generic(rgb, nH, nW, interp="bilinear", is_flow=False)

    # Crop
    if self.setname == "train":
        # Random crop coords
        ulx = random.randint(0, nW - self.inp_res)
        uly = random.randint(0, nH - self.inp_res)
    else:
        # Center crop coords
        ulx = int((nW - self.inp_res) / 2)
        uly = int((nH - self.inp_res) / 2)
    # Crop to inp_res x inp_res (e.g. 256x256)
    rgb = rgb[uly : uly + self.inp_res, ulx : ulx + self.inp_res]

    rgb = im_to_torch(rgb)
    rgb = im_to_video(rgb)
    rgb = color_normalize(rgb, self.mean, self.std)

    # Return
    data = {
        "rgb": rgb,
        "class": self._get_class(data_index, frame_ix),
        "index": index,
        "class_names": self.class_names,
        "dataset": self.datasetname,
    }
    return data
def main(args):
    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class,
                                        pretrained=False)
    model = torch.nn.DataParallel(model).cuda()

    test_loader = torch.utils.data.DataLoader(
        MscocoMulti(cfg, train=False),
        batch_size=args.batch * args.num_gpus,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # load training weights (hardcoded here instead of args.checkpoint/args.test)
    # checkpoint_file = os.path.join(args.checkpoint, args.test + '.pth.tar')
    checkpoint_file = os.path.join('model', 'checkpoint',
                                   'epoch9checkpoint.pth.tar')
    checkpoint = torch.load(checkpoint_file)
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (epoch {})".format(
        checkpoint_file, checkpoint['epoch']))

    # change to evaluation mode
    model.eval()

    print('testing...')
    full_result = []
    for i, (inputs, meta) in tqdm(enumerate(test_loader)):
        with torch.no_grad():
            input_var = torch.autograd.Variable(inputs.cuda())
            if args.flip:
                # Build a horizontally flipped copy of the batch
                flip_inputs = inputs.clone()
                for j, finp in enumerate(flip_inputs):
                    finp = im_to_numpy(finp)
                    finp = cv2.flip(finp, 1)
                    flip_inputs[j] = im_to_torch(finp)
                flip_input_var = torch.autograd.Variable(flip_inputs.cuda())

            # compute output
            global_outputs, refine_output = model(input_var)
            score_map = refine_output.data.cpu()
            score_map = score_map.numpy()

            if args.flip:
                # Average the original score maps with the flipped-back ones
                flip_global_outputs, flip_output = model(flip_input_var)
                flip_score_map = flip_output.data.cpu()
                flip_score_map = flip_score_map.numpy()

                for j, fscore in enumerate(flip_score_map):
                    fscore = fscore.transpose((1, 2, 0))
                    fscore = cv2.flip(fscore, 1)
                    fscore = list(fscore.transpose((2, 0, 1)))
                    for (q, w) in cfg.symmetry:  # swap left/right channels
                        fscore[q], fscore[w] = fscore[w], fscore[q]
                    fscore = np.array(fscore)
                    score_map[j] += fscore
                    score_map[j] /= 2

            det_scores = meta['det_scores']
            for b in range(inputs.size(0)):
                details = meta['augmentation_details']
                imgid = meta['imgid'][b]
                category = meta['category'][b]
                single_result = []

                single_map = score_map[b]
                r0 = single_map.copy()
                r0 /= 255
                r0 += 0.5
                v_score = np.zeros(24)
                for p in range(24):
                    single_map[p] /= np.amax(single_map[p])
                    border = 10
                    dr = np.zeros((cfg.output_shape[0] + 2 * border,
                                   cfg.output_shape[1] + 2 * border))
                    dr[border:-border, border:-border] = single_map[p].copy()
                    dr = cv2.GaussianBlur(dr, (21, 21), 0)
                    # Take the top-2 peaks and nudge the argmax a quarter
                    # pixel towards the second peak
                    lb = dr.argmax()
                    y, x = np.unravel_index(lb, dr.shape)
                    dr[y, x] = 0
                    lb = dr.argmax()
                    py, px = np.unravel_index(lb, dr.shape)
                    y -= border
                    x -= border
                    py -= border + y
                    px -= border + x
                    ln = (px ** 2 + py ** 2) ** 0.5
                    delta = 0.25
                    if ln > 1e-3:
                        x += delta * px / ln
                        y += delta * py / ln
                    x = max(0, min(x, cfg.output_shape[1] - 1))
                    y = max(0, min(y, cfg.output_shape[0] - 1))
                    # Map heatmap coordinates back to the original image
                    resy = float((4 * y + 2) / cfg.data_shape[0] *
                                 (details[b][3] - details[b][1]) + details[b][1])
                    resx = float((4 * x + 2) / cfg.data_shape[1] *
                                 (details[b][2] - details[b][0]) + details[b][0])
                    v_score[p] = float(r0[p, int(round(y) + 1e-10),
                                          int(round(x) + 1e-10)])
                    single_result.append(resx)
                    single_result.append(resy)
                    single_result.append(1)
                if len(single_result) != 0:
                    # One CSV row: image id, category, then x_y_1 triplets
                    result = [imgid, category]
                    j = 0
                    while j < len(single_result):
                        result.append(str(int(single_result[j])) + '_' +
                                      str(int(single_result[j + 1])) + '_1')
                        j += 3
                    full_result.append(result)

    result_path = args.result
    if not isdir(result_path):
        mkdir_p(result_path)
    result_file = os.path.join(result_path, 'result.csv')
    with open(result_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(full_result)

    # Note: this run writes 'result.csv' but the evaluator below reads
    # 'result9.csv', as in the original code
    Evaluator = FaiKeypoint2018Evaluator(
        userAnswerFile=os.path.join(result_path, 'result9.csv'),
        standardAnswerFile="fashionAI_key_points_test_a_answer_20180426.csv")

    score = Evaluator.evaluate()
    print(score)
    Evaluator.writerror(result_path=os.path.join(result_path, "toperror1.csv"))
def main(args):
    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class,
                                        pretrained=False)
    model = torch.nn.DataParallel(model).cuda()

    test_loader = torch.utils.data.DataLoader(MscocoMulti(cfg, train=False),
                                              batch_size=args.batch * args.num_gpus,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # load training weights
    checkpoint_file = os.path.join(args.checkpoint, args.test + '.pth.tar')
    checkpoint = torch.load(checkpoint_file)
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (epoch {})".format(
        checkpoint_file, checkpoint['epoch']))

    # change to evaluation mode
    model.eval()

    print('testing...')
    full_result = []
    for i, (inputs, meta) in tqdm(enumerate(test_loader)):
        with torch.no_grad():
            input_var = torch.autograd.Variable(inputs.cuda())
            if args.flip:
                # Build a horizontally flipped copy of the batch
                flip_inputs = inputs.clone()
                for j, finp in enumerate(flip_inputs):
                    finp = im_to_numpy(finp)
                    finp = cv2.flip(finp, 1)
                    flip_inputs[j] = im_to_torch(finp)
                flip_input_var = torch.autograd.Variable(flip_inputs.cuda())

            # compute output
            global_outputs, refine_output = model(input_var)
            score_map = refine_output.data.cpu()
            score_map = score_map.numpy()  # e.g. (batch, 2, 64, 48)

            if args.flip:
                # Average the original score maps with the flipped-back ones
                flip_global_outputs, flip_output = model(flip_input_var)
                flip_score_map = flip_output.data.cpu()
                flip_score_map = flip_score_map.numpy()

                for j, fscore in enumerate(flip_score_map):
                    fscore = fscore.transpose((1, 2, 0))
                    fscore = cv2.flip(fscore, 1)
                    fscore = list(fscore.transpose((2, 0, 1)))
                    # Left/right symmetry swap is disabled in this variant
                    # for (q, w) in cfg.symmetry:
                    #     fscore[q], fscore[w] = fscore[w], fscore[q]
                    fscore = np.array(fscore)
                    score_map[j] += fscore
                    score_map[j] /= 2

            ids = meta['imgID'].numpy()
            imgclass = meta['class']
            det_scores = meta['det_scores']
            for b in range(inputs.size(0)):
                details = meta['augmentation_details']
                single_result_dict = {}
                single_result = []

                single_map = score_map[b]  # (2, 64, 48)
                r0 = single_map.copy()
                r0 /= 255
                r0 += 0.5
                v_score = np.zeros(10)
                if imgclass[b] == 'chair':
                    c = 0
                elif imgclass[b] == 'bed':
                    c = 1
                elif imgclass[b] == 'sofa':
                    c = 2
                single_map[c] /= np.amax(single_map[c])
                border = 9
                # Collect all peaks above the threshold rather than a single
                # argmax
                ps = parseHeatmap(single_map[c], thresh=0.20)
                for k in range(len(ps[0])):
                    x = ps[0][k] - border  # height
                    y = ps[1][k] - border  # width
                    # Map heatmap coordinates back to the original image
                    resy = float((4 * x + 2) / cfg.data_shape[0] *
                                 (details[b][3] - details[b][1]) + details[b][1])
                    resx = float((4 * y + 2) / cfg.data_shape[1] *
                                 (details[b][2] - details[b][0]) + details[b][0])
                    single_result.append(resx)
                    single_result.append(resy)
                    single_result.append(1)
                if len(single_result) != 0:
                    single_result_dict['image_id'] = int(ids[b])
                    single_result_dict['class'] = imgclass[b]
                    single_result_dict['keypoints'] = single_result
                    # single_result_dict['score'] = float(det_scores[b]) * v_score.mean()
                    full_result.append(single_result_dict)

    result_path = args.result
    if not isdir(result_path):
        mkdir_p(result_path)
    result_file = os.path.join(result_path, 'result.json')
    with open(result_file, 'w') as wf:
        json.dump(full_result, wf)
def __getitem__(self, index):
    if self.is_train:
        a = self.anno[self.train[index]]
    else:
        a = self.anno[self.valid[index]]

    img_path = os.path.join(self.img_folder, a['img_paths'])
    if a['pts_paths'] == "unknown.xyz":
        pts = a['pts']
    else:
        pts_path = os.path.join(self.img_folder, a['pts_paths'])
        if pts_path[-4:] == '.txt':
            pts = np.loadtxt(pts_path)  # L x 2
        else:
            pts = a['pts']
    pts = np.array(pts)

    # Assume all points are visible by default. This is a multiclass
    # visibility: 0 = self-occluded, 1 = visible, 2 = externally occluded.
    visible_multiclass = np.ones(pts.shape[0])

    if a['dataset'] == 'aflw_ours' or a['dataset'] == 'cofw_68':
        # Points labelled -1 in both x and y are self-occluded; points with
        # both coordinates < -1 are externally occluded
        self_occluded_landmark = (pts[:, 0] == -1) & (pts[:, 1] == -1)
        external_occluded_landmark = (pts[:, 0] < -1) & (pts[:, 1] < -1)
        visible_multiclass[self_occluded_landmark] = 0
        visible_multiclass[external_occluded_landmark] = 2

        # Valid landmarks are the externally occluded and fully visible ones,
        # i.e. everything except the self-occluded points
        valid_landmark = (pts[:, 0] != -1) & (pts[:, 1] != -1)

        # Externally occluded points have both coordinates negative (but not
        # -1); make them positive
        pts = np.abs(pts)

        # valid_landmark is 0 for to-be-masked and 1 for not-to-be-masked;
        # the numpy mask is 1 for to-be-masked and 0 for not-to-be-masked
        pts_masked = np.ma.array(pts, mask=np.column_stack(
            (1 - valid_landmark, 1 - valid_landmark)))
        pts_mean = np.mean(pts_masked, axis=0)

        # Replace -1 entries by the mean of the valid landmarks. Otherwise
        # taking the min when computing the geometric mean of the box can
        # cause issues later.
        pts[self_occluded_landmark] = pts_mean.data
        scale_mul_factor = 1.1
    elif a['dataset'] == "aflw" or a['dataset'] == "wflw":
        self_occluded_landmark = (pts[:, 0] <= 0) | (pts[:, 1] <= 0)
        valid_landmark = 1 - self_occluded_landmark
        visible_multiclass[self_occluded_landmark] = 0

        pts_masked = np.ma.array(pts, mask=np.column_stack(
            (1 - valid_landmark, 1 - valid_landmark)))
        pts_mean = np.mean(pts_masked, axis=0)

        # Replace invalid entries by the mean of the valid landmarks
        pts[self_occluded_landmark] = pts_mean.data
        scale_mul_factor = 1.25
    else:
        scale_mul_factor = 1.1

    pts = torch.Tensor(pts)  # size is 68 x 2
    s = torch.Tensor([a['scale_provided_det']]) * scale_mul_factor
    c = torch.Tensor(a['objpos_det'])

    # For single-person pose estimation with a centered/scaled figure;
    # the image is loaded at its original size
    img = imutils.load_image(img_path)

    r = 0
    s_rand = 1
    if self.is_train:
        # Data augmentation for training
        s_rand = (1 + sample_from_bounded_gaussian(self.scale_factor / 2.))
        s = s * s_rand
        r = sample_from_bounded_gaussian(self.rot_factor / 2.)

        if self.use_flipping:
            # Flip
            if np.random.random() <= 0.5:
                img = torch.from_numpy(HumanAug.fliplr(img.numpy())).float()
                pts = HumanAug.shufflelr(pts, width=img.size(2),
                                         dataset='face')
                c[0] = img.size(2) - c[0]

        # Color jitter
        img[0, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[1, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)
        img[2, :, :].mul_(np.random.uniform(0.6, 1.4)).clamp_(0, 1)

        if self.use_occlusion:
            # Apply a random black occlusion patch; img is C x H x W
            patch_center_row = randint(1, img.size(1))
            patch_center_col = randint(1, img.size(2))
            patch_height = randint(1, img.size(1) // 2)
            patch_width = randint(1, img.size(2) // 2)
            row_min = max(0, patch_center_row - patch_height)
            row_max = min(img.size(1), patch_center_row + patch_height)
            col_min = max(0, patch_center_col - patch_width)
            col_max = min(img.size(2), patch_center_col + patch_width)
            img[:, row_min:row_max, col_min:col_max] = 0

    # Prepare points first
    pts_input_res = HumanAug.TransformPts(pts.numpy(), c.numpy(), s.numpy(),
                                          r, self.inp_res, self.std_size)

    # Some landmark points can fall outside the crop after transformation, so
    # determine the extra scaling required. This can only be done for the
    # training points; for validation the point locations are unknown.
    if self.is_train and self.keep_pts_inside:
        # visible_copy takes care of whether a point is visible or not
        visible_copy = visible_multiclass.copy()
        visible_copy[visible_multiclass > 1] = 1
        scale_down = get_ideal_scale(pts_input_res, self.inp_res, img_path,
                                     visible=visible_copy)
        s = s / scale_down
        s_rand = s_rand / scale_down
        pts_input_res = HumanAug.TransformPts(pts.numpy(), c.numpy(),
                                              s.numpy(), r, self.inp_res,
                                              self.std_size)

    if a['dataset'] == "aflw":
        meta_box_size = a['box_size']
        # Convert the meta box size to the input resolution as well. The meta
        # box is supplied externally rather than formed by the landmarks, so
        # treat it as the two points [meta_box_size, 0] and [0, 0] and apply
        # the same transformation
        temp = HumanAug.TransformPts(
            np.array([[meta_box_size, 0], [0, 0]]), c.numpy(), s.numpy(), r,
            self.inp_res, self.std_size)  # passed as a 2 x 2 array
        # We only want the transformed distance between the two points
        meta_box_size_input_res = np.linalg.norm(temp[1] - temp[0])
    else:
        meta_box_size_input_res = -10  # some invalid number

    # pts_input_res lives on the 256 x 256 input; bring it down to 64 x 64
    # since the heatmap is predicted at the output resolution
    pts_aug = pts_input_res * (1. * self.out_res / self.inp_res)

    # Prepare image
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s.numpy(), r,
                        self.inp_res, self.std_size)
    inp_vis = inp
    inp = imutils.im_to_torch(inp).float()  # 3 x 256 x 256

    # Generate the proxy ground-truth heatmap and its mask
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug,
                                            [self.out_res, self.out_res],
                                            sigma=self.sigma)
    heatmap = torch.from_numpy(heatmap).float()
    heatmap_mask = HumanPts.pts2mask(pts_aug, [self.out_res, self.out_res],
                                     bb=10)

    if self.is_train:
        return inp, heatmap, pts_input_res, heatmap_mask, s_rand, \
            visible_multiclass, meta_box_size_input_res
    else:
        return inp, heatmap, pts_input_res, c, s, index, inp_vis, s_rand, \
            visible_multiclass, meta_box_size_input_res
def generateSampleFace(self, idx):
    sf = self.scale_factor
    rf = self.rot_factor

    main_pts = sio.loadmat(self.anno[idx])
    pts = main_pts['pt3d_68'][0:2, :].transpose()
    pts = np.float32(pts)
    pts = torch.from_numpy(pts)
    pts = torch.clamp(pts, min=0)

    # Derive the center and scale from the landmark bounding box
    mins_ = torch.min(pts, 0)[0].view(2)  # min vals
    maxs_ = torch.max(pts, 0)[0].view(2)  # max vals
    c = torch.FloatTensor((maxs_[0] - (maxs_[0] - mins_[0]) / 2,
                           maxs_[1] - (maxs_[1] - mins_[1]) / 2))
    c[1] -= ((maxs_[1] - mins_[1]) * 0.12)
    s = (maxs_[0] - mins_[0] + maxs_[1] - mins_[1]) / 195

    img = load_image(self.anno[idx][:-4] + '.jpg')

    r = 0
    if self.is_train:
        # Random scale/rotation augmentation, clamped to a bounded range
        s = s * torch.randn(1).mul_(sf).add_(1).clamp(1 - sf, 1 + sf)[0]
        r = torch.randn(1).mul_(rf).clamp(-2 * rf, 2 * rf)[0] \
            if random.random() <= 0.6 else 0

        if random.random() <= 0.5:
            # Horizontal flip: mirror the image, swap landmark pairs, flip center x
            img = torch.from_numpy(fliplr(img.numpy())).float()
            pts = shufflelr(pts, width=img.size(2), dataset='aflw2000')
            c[0] = img.size(2) - c[0]

        # Per-channel color jitter
        img[0, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)
        img[1, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)
        img[2, :, :].mul_(random.uniform(0.7, 1.3)).clamp_(0, 1)

    # Prepare image and ground-truth map
    inp = HumanAug.crop(imutils.im_to_numpy(img), c.numpy(), s, r, 256, 200)
    inp = imutils.im_to_torch(inp).float()
    pts_input_res = HumanAug.TransformPts(pts.numpy(), c.numpy(), s, r, 256, 200)
    pts_aug = pts_input_res * (1. * 64 / 256)

    # Generate the ground-truth heatmap at the 64x64 output resolution
    heatmap, pts_aug = HumanPts.pts2heatmap(pts_aug, [64, 64], sigma=1)
    heatmap = torch.from_numpy(heatmap).float()

    return inp, heatmap, pts, c, s, pts_input_res